Compare commits

6 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 2bb73cc8f4 | |
| | 622cc2cc35 | |
| | 7e639835b1 | |
| | cd36b75e5f | |
| | 25596a7916 | |
| | c4a5cfec9f | |
```diff
@@ -21,17 +21,17 @@ jobs:
       - name: Clone additional repository
         run: |
           git config --global credential.helper cache
           git clone https://fabel:${{ secrets.CICD }}@gitea.fabelous.app/fabel/VectorLoader.git
 
       - name: Install dependencies
         run: |
           cd VectorLoader
           python -m pip install --upgrade pip
           pip install -r requirements.txt
 
       - name: Run vectorizing
         env:
           VECTORDB_TOKEN: ${{ secrets.VECTORDB_TOKEN }}
         run: |
           cd VectorLoader
-          python -m src.run
+          python -m src.run --repo-path ${{ gitea.workspace }}
```
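The only functional change in this hunk is the new `--repo-path` flag passed to `src.run`, pointing the vectorizer at the workflow's checkout directory. How `src.run` consumes the flag is not shown in this compare; below is a minimal hedged sketch of what such an entry point might look like. Only the `--repo-path` name comes from the workflow above; everything else is an assumption.

```python
import argparse
from pathlib import Path

# Hypothetical sketch of VectorLoader's src/run.py entry point.
# Only the --repo-path flag is taken from the workflow; the rest is assumed.
def main():
    parser = argparse.ArgumentParser(description="Vectorize a repository")
    parser.add_argument("--repo-path", type=Path, default=Path("."),
                        help="Repository checkout to vectorize (e.g. the Gitea workspace)")
    args = parser.parse_args()

    # Placeholder for the real embedding/upload step.
    for path in args.repo_path.rglob("*.py"):
        print(f"would vectorize {path}")

if __name__ == "__main__":
    main()
```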
```diff
@@ -162,7 +162,6 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
             pass
 
     def _evaluate_memory_efficient(self):
         """Memory-efficient validation with smaller chunks"""
         if self.validation_loader is None:
             return 0.0
-
```
```diff
@@ -172,13 +171,15 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
         with torch.no_grad():
             for low_res, high_res in self.validation_loader:
                 # Mark step for validation too
                 if self.use_model_compilation:
                     torch.compiler.cudagraph_mark_step_begin()
 
                 low_res = low_res.to(self.device, non_blocking=True).to(memory_format=torch.channels_last)
                 high_res = high_res.to(self.device, non_blocking=True)
 
                 with autocast(device_type=self.device.type):
                     outputs = self.model(low_res)
-                    loss = self.criterion(outputs, high_res)
+                    outputs = outputs.clone()
+                    loss = self.criterion(outputs, high_res)
 
                 val_loss += loss.item()
                 num_batches += 1
```
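Both changes in this hunk target the same `torch.compile` + CUDAGraphs constraint: the CUDAGraph runtime reuses output buffers across iterations, so each iteration must be announced with `torch.compiler.cudagraph_mark_step_begin()`, and any output read after the step boundary must be `clone()`d out of the graph-owned buffer first. A minimal sketch of that pattern outside this trainer; `model`, `criterion`, `loader`, and `device` are placeholders:

```python
import torch
from torch.amp import autocast

# Minimal sketch: a validation loop that is safe under torch.compile with
# CUDAGraphs. All names are placeholders, not part of the PR.
def validate(model, criterion, loader, device):
    model.eval()
    total, batches = 0.0, 0
    with torch.no_grad():
        for low_res, high_res in loader:
            # Announce a new iteration so cached output buffers from the
            # previous step may be reused/overwritten.
            torch.compiler.cudagraph_mark_step_begin()
            low_res = low_res.to(device, non_blocking=True)
            high_res = high_res.to(device, non_blocking=True)
            with autocast(device_type=device.type):
                # clone() copies the result out of the CUDAGraph-owned buffer.
                out = model(low_res).clone()
                loss = criterion(out, high_res)
            total += loss.item()
            batches += 1
    model.train()
    return total / max(batches, 1)
```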
```diff
@@ -186,16 +187,54 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
                 # Immediate cleanup
                 del low_res, high_res, outputs, loss
 
                 # Aggressive cleanup every few batches
-                if num_batches % 10 == 0:
-                    self._aggressive_memory_cleanup()
+                if num_batches % 5 == 0:
+                    self._enhanced_memory_cleanup()
 
         self.model.train()
         # Final cleanup after validation
         self._aggressive_memory_cleanup()
 
         return val_loss / max(num_batches, 1)
 
+    def _enhanced_memory_cleanup(self):
+        """Enhanced memory cleanup for CUDAGraph compatibility and memory optimization"""
+        # Clear gradients properly - set to None for better memory efficiency
+        if hasattr(self, 'optimizer') and self.optimizer is not None:
+            self.optimizer.zero_grad(set_to_none=True)
+
+        # Clear model gradients explicitly
+        if hasattr(self, 'model') and self.model is not None:
+            for param in self.model.parameters():
+                if param.grad is not None:
+                    param.grad = None
+
+        # Force Python garbage collection
+        import gc
+        gc.collect()
+
+        # Clear PyTorch CUDA cache and synchronize
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+
+            # Reset memory stats periodically
+            if hasattr(torch.cuda, 'reset_peak_memory_stats'):
+                torch.cuda.reset_peak_memory_stats()
+
+        # Clear any lingering autograd computation graphs
+        with torch.no_grad():
+            pass
+
+        # Mark new step for CUDAGraphs to prevent tensor conflicts
+        if self.use_model_compilation and hasattr(torch.compiler, 'cudagraph_mark_step_begin'):
+            torch.compiler.cudagraph_mark_step_begin()
+
+        # Clear any cached compilation artifacts
+        if hasattr(torch, '_dynamo') and hasattr(torch._dynamo, 'reset'):
+            try:
+                torch._dynamo.reset()
+            except Exception:
+                pass  # Ignore if reset fails
+
     def finetune(self, output_path, epochs=10, lr=1e-4, patience=3, min_delta=0.001):
         """Enhanced training with memory optimizations"""
         if self.data_loader is None:
```
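Two details of `_enhanced_memory_cleanup` are worth flagging. `zero_grad(set_to_none=True)` releases gradient storage instead of filling it with zeros, which is where the memory saving comes from. `torch._dynamo.reset()`, by contrast, discards all compiled artifacts, so the next forward pass pays the full recompilation cost; calling it every five validation batches trades considerable speed for memory. A hedged sketch for checking what a cleanup pass actually releases; `report_cleanup_effect` is a hypothetical helper, not part of this change:

```python
import torch

# Hypothetical check (not in the PR): measure how much allocator memory a
# cleanup pass releases on a CUDA device.
def report_cleanup_effect(trainer):
    if not torch.cuda.is_available():
        return
    before = torch.cuda.memory_allocated()
    trainer._enhanced_memory_cleanup()
    after = torch.cuda.memory_allocated()
    print(f"allocated: {before / 1e9:.2f}GB -> {after / 1e9:.2f}GB")
```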
```diff
@@ -242,65 +281,44 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
                                 total=len(train_batches),
                                 desc=f"Epoch {epoch + 1}/{epochs}")
 
             for batch_idx, (low_res, high_res) in progress_bar:
                 # Mark step for CUDAGraphs
                 if self.use_model_compilation:
                     torch.compiler.cudagraph_mark_step_begin()
 
                 # Move data to device
                 low_res = low_res.to(self.device, non_blocking=True).to(memory_format=torch.channels_last)
                 high_res = high_res.to(self.device, non_blocking=True)
 
-                # Forward pass
+                # Forward pass with enhanced cloning
                 with autocast(device_type=self.device.type):
                     if hasattr(self, 'use_checkpointing') and self.use_checkpointing:
                         low_res.requires_grad_()
                         outputs = checkpoint(self.model, low_res)
-                        outputs = outputs.clone()
                     else:
                         outputs = self.model(low_res)
-                        outputs = outputs.clone()
 
+                    # CRITICAL: Clone outputs before loss computation
+                    outputs = outputs.detach().clone().requires_grad_(True)
                     loss = self.criterion(outputs, high_res)
 
                 # Scale loss for gradient accumulation
                 if self.use_gradient_accumulation:
                     loss = loss / self.accumulation_steps
 
-                # Backward pass
-                self.scaler.scale(loss).backward()
+                # Enhanced backward pass with proper cleanup
+                try:
+                    self.scaler.scale(loss).backward()
+                except RuntimeError as e:
+                    if "CUDAGraphs" in str(e):
+                        # Fallback: clear everything and retry
+                        del outputs, loss
+                        torch.cuda.empty_cache()
+                        if self.use_model_compilation:
+                            torch.compiler.cudagraph_mark_step_begin()
+                        continue
+                    else:
+                        raise e
 
                 accumulation_loss += loss.item()
 
                 # Update weights every accumulation_steps or at the end of epoch
                 should_step = (not self.use_gradient_accumulation or
                                (batch_idx + 1) % self.accumulation_steps == 0 or
                                batch_idx == len(train_batches) - 1)
 
                 if should_step:
                     self.scaler.step(self.optimizer)
                     self.scaler.update()
                     self.optimizer.zero_grad()
 
                     # Add accumulated loss to epoch loss
                     if self.use_gradient_accumulation:
                         epoch_loss += accumulation_loss
                         accumulation_loss = 0.0
                     else:
                         epoch_loss += loss.item()
 
                 # Update progress bar
                 current_loss = accumulation_loss if self.use_gradient_accumulation else loss.item()
                 progress_bar.set_postfix({
                     'loss': current_loss,
                     'peak_mem': f"{self.peak_memory:.1f}GB" if self.use_memory_profiling else "N/A"
                 })
 
                 # Immediate cleanup
                 del low_res, high_res, outputs, loss
 
                 # Handle checkpoints
                 self._handle_checkpoints(epoch + 1, batch_idx + 1, current_loss < self.best_loss)
 
                 # Periodic aggressive cleanup
                 if batch_idx % 20 == 0:
                     self._aggressive_memory_cleanup()
 
             # End of epoch processing
             avg_train_loss = epoch_loss / len(self.data_loader)
```
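One behavioral note on the new forward path: `outputs.detach().clone().requires_grad_(True)` cuts the autograd graph between the loss and the model, so `backward()` on that loss produces no gradients for the model weights and the optimizer step becomes a no-op. `clone()` alone is the CUDAGraph-safe variant that keeps gradient flow intact. A minimal sketch of AMP plus gradient accumulation using the non-detaching clone; all names are placeholders, not this trainer's API:

```python
import torch
from torch.amp import GradScaler, autocast

# Minimal sketch (assumed names throughout): mixed precision with gradient
# accumulation. Cloning the output stays CUDAGraph-safe, but detach() would
# stop gradients from reaching the model weights.
def train_epoch(model, criterion, optimizer, loader, device, accum_steps=4):
    scaler = GradScaler()
    optimizer.zero_grad(set_to_none=True)
    for i, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)
        with autocast(device_type=device.type):
            out = model(x).clone()          # clone, but do NOT detach
            loss = criterion(out, y) / accum_steps
        scaler.scale(loss).backward()       # gradients accumulate across steps
        if (i + 1) % accum_steps == 0 or i == len(loader) - 1:
            scaler.step(optimizer)          # unscale and apply optimizer step
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
```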
```diff
@@ -352,15 +370,6 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
 
         return self.best_loss
 
-        # Stop memory monitoring
-        if self.use_memory_profiling:
-            self.stop_monitoring = True
-            if self.memory_monitor_thread:
-                self.memory_monitor_thread.join(timeout=1)
-            print(f"Training completed. Peak GPU memory usage: {self.peak_memory:.2f}GB")
-
-        return self.best_loss
-
     def get_memory_summary(self):
         """Get a summary of memory usage during training"""
```
```diff
@@ -11,19 +11,11 @@ class aiuNN(PreTrainedModel):
         super().__init__(config)
         self.config = config
 
-        # Copy base layers into aiuNN for self-containment and portability
+        # Build base layers without dynamic unpacking
         if base_model is not None:
-            if hasattr(base_model, 'cnn'):
-                self.base_layers = nn.Sequential(*[layer for layer in base_model.cnn])
-            elif hasattr(base_model, 'shared_layer') and hasattr(base_model, 'unique_layers'):
-                layers = [base_model.shared_layer, base_model.activation, base_model.max_pool]
-                for ul in base_model.unique_layers:
-                    layers.extend([ul, base_model.activation, base_model.max_pool])
-                self.base_layers = nn.Sequential(*layers)
-            else:
-                self.base_layers = self._build_base_layers_from_config(config)
+            self.base_layers = self._copy_base_model_safely(base_model)
         else:
-            self.base_layers = self._build_base_layers_from_config(config)
+            self.base_layers = self._build_static_base_layers(config)
 
         # Bilinear upsampling head
         self.upsample = nn.Upsample(
```
```diff
@@ -38,18 +30,46 @@ class aiuNN(PreTrainedModel):
             padding=1
         )
 
-    def _build_base_layers_from_config(self, config):
-        layers = []
+    def _build_static_base_layers(self, config):
+        """Build layers without dynamic unpacking to avoid CUDAGraph issues"""
+        from collections import OrderedDict
+
+        layers = OrderedDict()
         in_channels = config.num_channels
-        for _ in range(config.num_hidden_layers):
-            layers.extend([
-                nn.Conv2d(in_channels, config.hidden_size,
-                          kernel_size=config.kernel_size, padding=1),
-                getattr(nn, config.activation_function)(),
-                nn.MaxPool2d(kernel_size=1, stride=1)
-            ])
+        for i in range(config.num_hidden_layers):
+            layers[f'conv_{i}'] = nn.Conv2d(
+                in_channels, config.hidden_size,
+                kernel_size=config.kernel_size, padding=1
+            )
+            layers[f'activation_{i}'] = getattr(nn, config.activation_function)()
+            layers[f'maxpool_{i}'] = nn.MaxPool2d(kernel_size=1, stride=1)
             in_channels = config.hidden_size
-        return nn.Sequential(*layers)
+
+        return nn.Sequential(layers)
+
+    def _copy_base_model_safely(self, base_model):
+        """Copy base model layers without dynamic unpacking"""
+        from collections import OrderedDict
+
+        layers = OrderedDict()
+
+        if hasattr(base_model, 'cnn'):
+            # Copy layers individually instead of unpacking
+            for i, layer in enumerate(base_model.cnn):
+                layers[f'copied_layer_{i}'] = layer
+        elif hasattr(base_model, 'shared_layer') and hasattr(base_model, 'unique_layers'):
+            # Build explicitly instead of using extend and unpack
+            layers['shared_layer'] = base_model.shared_layer
+            layers['shared_activation'] = base_model.activation
+            layers['shared_maxpool'] = base_model.max_pool
+
+            for i, ul in enumerate(base_model.unique_layers):
+                layers[f'unique_layer_{i}'] = ul
+                layers[f'unique_activation_{i}'] = base_model.activation
+                layers[f'unique_maxpool_{i}'] = base_model.max_pool
+
+        return nn.Sequential(layers)
 
     def forward(self, x):
         x = self.base_layers(x)
```
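The `OrderedDict` construction both new methods share gives every submodule a stable, descriptive name and avoids the `nn.Sequential(*layers)` argument unpacking the old code used. A minimal self-contained sketch of the pattern; the channel counts and layer names here are illustrative only:

```python
import torch
from collections import OrderedDict
from torch import nn

# nn.Sequential accepts an OrderedDict, so each submodule gets a stable name
# instead of a positional index. Values below are illustrative.
layers = OrderedDict()
layers['conv_0'] = nn.Conv2d(3, 16, kernel_size=3, padding=1)
layers['activation_0'] = nn.ReLU()
layers['maxpool_0'] = nn.MaxPool2d(kernel_size=1, stride=1)  # no-op pool, as in the diff
model = nn.Sequential(layers)

print(model.conv_0)                      # submodules addressable by name
out = model(torch.randn(1, 3, 32, 32))   # forward works as with a plain Sequential
print(out.shape)                         # torch.Size([1, 16, 32, 32])
```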