added _enhanced_memory_cleanup
Gitea Actions For AIIA / Explore-Gitea-Actions (push) Successful in 9m44s

Falko Victor Habel 2025-07-08 15:01:13 +02:00
parent 622cc2cc35
commit 2bb73cc8f4
1 changed file with 41 additions and 0 deletions


@@ -193,6 +193,47 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
        self.model.train()
        return val_loss / max(num_batches, 1)

    def _enhanced_memory_cleanup(self):
        """Enhanced memory cleanup for CUDAGraph compatibility and memory optimization"""
        # Clear gradients properly - set to None for better memory efficiency
        if hasattr(self, 'optimizer') and self.optimizer is not None:
            self.optimizer.zero_grad(set_to_none=True)

        # Clear model gradients explicitly
        if hasattr(self, 'model') and self.model is not None:
            for param in self.model.parameters():
                if param.grad is not None:
                    param.grad = None

        # Force Python garbage collection
        import gc
        gc.collect()

        # Clear PyTorch CUDA cache and synchronize
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

            # Reset memory stats periodically
            if hasattr(torch.cuda, 'reset_peak_memory_stats'):
                torch.cuda.reset_peak_memory_stats()

        # Clear any lingering autograd computation graphs
        with torch.no_grad():
            pass

        # Mark new step for CUDAGraphs to prevent tensor conflicts
        if self.use_model_compilation and hasattr(torch.compiler, 'cudagraph_mark_step_begin'):
            torch.compiler.cudagraph_mark_step_begin()

        # Clear any cached compilation artifacts
        if hasattr(torch, '_dynamo') and hasattr(torch._dynamo, 'reset'):
            try:
                torch._dynamo.reset()
            except Exception:
                pass  # Ignore if reset fails

    def finetune(self, output_path, epochs=10, lr=1e-4, patience=3, min_delta=0.001):
        """Enhanced training with memory optimizations"""