added _enhanced_memory_cleanup
Gitea Actions For AIIA / Explore-Gitea-Actions (push) Successful in 9m44s

Falko Victor Habel 2025-07-08 15:01:13 +02:00
parent 622cc2cc35
commit 2bb73cc8f4
1 changed file with 41 additions and 0 deletions


@@ -193,6 +193,47 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
        self.model.train()
        return val_loss / max(num_batches, 1)

    def _enhanced_memory_cleanup(self):
        """Enhanced memory cleanup for CUDAGraph compatibility and memory optimization"""
        # Clear gradients properly - set to None for better memory efficiency
        if hasattr(self, 'optimizer') and self.optimizer is not None:
            self.optimizer.zero_grad(set_to_none=True)

        # Clear model gradients explicitly
        if hasattr(self, 'model') and self.model is not None:
            for param in self.model.parameters():
                if param.grad is not None:
                    param.grad = None

        # Force Python garbage collection
        import gc
        gc.collect()

        # Clear PyTorch CUDA cache and synchronize
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

            # Reset memory stats periodically
            if hasattr(torch.cuda, 'reset_peak_memory_stats'):
                torch.cuda.reset_peak_memory_stats()

        # Clear any lingering autograd computation graphs
        with torch.no_grad():
            pass

        # Mark new step for CUDAGraphs to prevent tensor conflicts
        if self.use_model_compilation and hasattr(torch.compiler, 'cudagraph_mark_step_begin'):
            torch.compiler.cudagraph_mark_step_begin()

        # Clear any cached compilation artifacts
        if hasattr(torch, '_dynamo') and hasattr(torch._dynamo, 'reset'):
            try:
                torch._dynamo.reset()
            except Exception:
                pass  # Ignore if reset fails

    def finetune(self, output_path, epochs=10, lr=1e-4, patience=3, min_delta=0.001):
        """Enhanced training with memory optimizations"""