Merge pull request 'updated memory fix' (#26 ) from feat/save_fix into main

Reviewed-on: #26
updated memory fix
2025-07-03 11:50:07 +00:00 · 2025-07-03 13:49:45 +02:00
1 changed file with 18 additions and 5 deletions
--- a/src/aiunn/finetune/memory_trainer.py
+++ b/src/aiunn/finetune/memory_trainer.py
@@ -237,9 +237,10 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
            start_idx = start_batch if epoch == start_epoch else 0
            
            progress_bar = tqdm(train_batches[start_idx:], 
-                              initial=start_idx, 
-                              total=len(train_batches),
-                              desc=f"Epoch {epoch + 1}/{epochs}")
+                                initial=start_idx, 
+                                total=len(train_batches),
+                                desc=f"Epoch {epoch + 1}/{epochs}")
+

            for batch_idx, (low_res, high_res) in progress_bar:
                # Move data to device
@@ -251,8 +252,10 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
                    if hasattr(self, 'use_checkpointing') and self.use_checkpointing:
                        low_res.requires_grad_()
                        outputs = checkpoint(self.model, low_res)
+                        outputs = outputs.clone()  # <-- Clone added here
                    else:
                        outputs = self.model(low_res)
+                        outputs = outputs.clone()  # <-- Clone added here
                    loss = self.criterion(outputs, high_res)
                    
                    # Scale loss for gradient accumulation
@@ -266,8 +269,8 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
                
                # Update weights every accumulation_steps or at the end of epoch
                should_step = (not self.use_gradient_accumulation or 
-                             (batch_idx + 1) % self.accumulation_steps == 0 or 
-                             batch_idx == len(train_batches) - 1)
+                            (batch_idx + 1) % self.accumulation_steps == 0 or 
+                            batch_idx == len(train_batches) - 1)
                
                if should_step:
                    self.scaler.step(self.optimizer)
@@ -348,6 +351,16 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
        
        return self.best_loss

+        
+        # Stop memory monitoring
+        if self.use_memory_profiling:
+            self.stop_monitoring = True
+            if self.memory_monitor_thread:
+                self.memory_monitor_thread.join(timeout=1)
+            print(f"Training completed. Peak GPU memory usage: {self.peak_memory:.2f}GB")
+        
+        return self.best_loss
+
    def get_memory_summary(self):
        """Get a summary of memory usage during training"""
        if not self.memory_stats:
Author	SHA1	Message	Date
Falko Victor Habel	e7b9da37d6	Merge pull request 'updated memory fix' (#26 ) from feat/save_fix into main Run VectorLoader Script / Explore-Gitea-Actions (push) Successful in 20s Details Gitea Actions For AIIA / Explore-Gitea-Actions (push) Successful in 9m36s Details Reviewed-on: #26	2025-07-03 11:50:07 +00:00
Falko Victor Habel	159ada872b	updated memory fix Gitea Actions For AIIA / Explore-Gitea-Actions (push) Successful in 9m43s Details	2025-07-03 13:49:45 +02:00