Compare commits

...

6 Commits
main...fixes

CI workflow: Gitea Actions For AIIA / Explore-Gitea-Actions (push)

Author              SHA1        Message                         CI status            Date
Falko Victor Habel  2bb73cc8f4  added _enhanced_memory_cleanup  Successful in 9m44s  2025-07-08 15:01:13 +02:00
Falko Victor Habel  622cc2cc35  new batching proces             Successful in 9m50s  2025-07-08 14:50:36 +02:00
Falko Victor Habel  7e639835b1  put clone out of else           Successful in 9m42s  2025-07-08 14:33:31 +02:00
Falko Victor Habel  cd36b75e5f  fixed model setup               Successful in 10m0s  2025-07-08 14:07:08 +02:00
Falko Victor Habel  25596a7916  new try                         Successful in 9m45s  2025-07-08 13:46:29 +02:00
Falko Victor Habel  c4a5cfec9f  updated embeddings              Cancelled            2025-07-08 13:40:22 +02:00
3 changed files with 115 additions and 86 deletions

Changed file 1 of 3 (Gitea Actions workflow)

@@ -21,17 +21,17 @@ jobs:
       - name: Clone additional repository
         run: |
           git config --global credential.helper cache
           git clone https://fabel:${{ secrets.CICD }}@gitea.fabelous.app/fabel/VectorLoader.git
       - name: Install dependencies
         run: |
           cd VectorLoader
           python -m pip install --upgrade pip
           pip install -r requirements.txt
       - name: Run vectorizing
         env:
           VECTORDB_TOKEN: ${{ secrets.VECTORDB_TOKEN }}
         run: |
           cd VectorLoader
-          python -m src.run
+          python -m src.run --repo-path ${{ gitea.workspace }}

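The functional change in this workflow is the new --repo-path ${{ gitea.workspace }} argument passed to python -m src.run. VectorLoader's entry point is not part of this diff, so the following is only a minimal, hypothetical sketch (argparse-based, with placeholder file collection) of how such a flag might be consumed:

# Hypothetical sketch of a --repo-path flag; VectorLoader's real src/run.py may differ.
import argparse
from pathlib import Path

def main() -> None:
    parser = argparse.ArgumentParser(description="Vectorize a repository checkout")
    # Mirrors the flag added in the workflow step above; defaults to the current directory.
    parser.add_argument("--repo-path", type=Path, default=Path("."),
                        help="Root of the repository checkout to vectorize")
    args = parser.parse_args()

    # Placeholder logic: collect files under the checkout before embedding them.
    files = [p for p in args.repo_path.rglob("*") if p.is_file()]
    print(f"Vectorizing {len(files)} files under {args.repo_path}")

if __name__ == "__main__":
    main()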
Changed file 2 of 3 (MemoryOptimizedTrainer)

@@ -162,7 +162,6 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
         pass

     def _evaluate_memory_efficient(self):
         """Memory-efficient validation with smaller chunks"""
         if self.validation_loader is None:
             return 0.0
@@ -172,13 +171,15 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
         with torch.no_grad():
             for low_res, high_res in self.validation_loader:
+                # Mark step for validation too
+                if self.use_model_compilation:
+                    torch.compiler.cudagraph_mark_step_begin()
                 low_res = low_res.to(self.device, non_blocking=True).to(memory_format=torch.channels_last)
                 high_res = high_res.to(self.device, non_blocking=True)
                 with autocast(device_type=self.device.type):
-                    outputs = self.model(low_res)
-                    outputs = outputs.clone()
-                    loss = self.criterion(outputs, high_res)
+                    outputs = self.model(low_res)
+                    loss = self.criterion(outputs, high_res)
                 val_loss += loss.item()
                 num_batches += 1
@@ -186,16 +187,54 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
                 # Immediate cleanup
                 del low_res, high_res, outputs, loss
                 # Aggressive cleanup every few batches
-                if num_batches % 10 == 0:
-                    self._aggressive_memory_cleanup()
+                if num_batches % 5 == 0:
+                    self._enhanced_memory_cleanup()

         self.model.train()

         # Final cleanup after validation
         self._aggressive_memory_cleanup()

         return val_loss / max(num_batches, 1)

+    def _enhanced_memory_cleanup(self):
+        """Enhanced memory cleanup for CUDAGraph compatibility and memory optimization"""
+        # Clear gradients properly - set to None for better memory efficiency
+        if hasattr(self, 'optimizer') and self.optimizer is not None:
+            self.optimizer.zero_grad(set_to_none=True)
+        # Clear model gradients explicitly
+        if hasattr(self, 'model') and self.model is not None:
+            for param in self.model.parameters():
+                if param.grad is not None:
+                    param.grad = None
+        # Force Python garbage collection
+        import gc
+        gc.collect()
+        # Clear PyTorch CUDA cache and synchronize
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            # Reset memory stats periodically
+            if hasattr(torch.cuda, 'reset_peak_memory_stats'):
+                torch.cuda.reset_peak_memory_stats()
+        # Clear any lingering autograd computation graphs
+        with torch.no_grad():
+            pass
+        # Mark new step for CUDAGraphs to prevent tensor conflicts
+        if self.use_model_compilation and hasattr(torch.compiler, 'cudagraph_mark_step_begin'):
+            torch.compiler.cudagraph_mark_step_begin()
+        # Clear any cached compilation artifacts
+        if hasattr(torch, '_dynamo') and hasattr(torch._dynamo, 'reset'):
+            try:
+                torch._dynamo.reset()
+            except Exception:
+                pass  # Ignore if reset fails

     def finetune(self, output_path, epochs=10, lr=1e-4, patience=3, min_delta=0.001):
         """Enhanced training with memory optimizations"""
         if self.data_loader is None:
@@ -242,65 +281,44 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
                                 total=len(train_batches),
                                 desc=f"Epoch {epoch + 1}/{epochs}")

             for batch_idx, (low_res, high_res) in progress_bar:
                 # Mark step for CUDAGraphs
                 if self.use_model_compilation:
                     torch.compiler.cudagraph_mark_step_begin()

                 # Move data to device
                 low_res = low_res.to(self.device, non_blocking=True).to(memory_format=torch.channels_last)
                 high_res = high_res.to(self.device, non_blocking=True)

-                # Forward pass
+                # Forward pass with enhanced cloning
                 with autocast(device_type=self.device.type):
                     if hasattr(self, 'use_checkpointing') and self.use_checkpointing:
                         low_res.requires_grad_()
                         outputs = checkpoint(self.model, low_res)
                         outputs = outputs.clone()
                     else:
                         outputs = self.model(low_res)
                         outputs = outputs.clone()
+                    # CRITICAL: Clone outputs before loss computation
+                    outputs = outputs.detach().clone().requires_grad_(True)
                     loss = self.criterion(outputs, high_res)

                 # Scale loss for gradient accumulation
                 if self.use_gradient_accumulation:
                     loss = loss / self.accumulation_steps

-                # Backward pass
-                self.scaler.scale(loss).backward()
-                accumulation_loss += loss.item()
-                # Update weights every accumulation_steps or at the end of epoch
-                should_step = (not self.use_gradient_accumulation or
-                               (batch_idx + 1) % self.accumulation_steps == 0 or
-                               batch_idx == len(train_batches) - 1)
-                if should_step:
-                    self.scaler.step(self.optimizer)
-                    self.scaler.update()
-                    self.optimizer.zero_grad()
-                    # Add accumulated loss to epoch loss
-                    if self.use_gradient_accumulation:
-                        epoch_loss += accumulation_loss
-                        accumulation_loss = 0.0
+                # Enhanced backward pass with proper cleanup
+                try:
+                    self.scaler.scale(loss).backward()
+                except RuntimeError as e:
+                    if "CUDAGraphs" in str(e):
+                        # Fallback: clear everything and retry
+                        del outputs, loss
+                        torch.cuda.empty_cache()
+                        if self.use_model_compilation:
+                            torch.compiler.cudagraph_mark_step_begin()
+                        continue
                     else:
-                        epoch_loss += loss.item()
-                # Update progress bar
-                current_loss = accumulation_loss if self.use_gradient_accumulation else loss.item()
-                progress_bar.set_postfix({
-                    'loss': current_loss,
-                    'peak_mem': f"{self.peak_memory:.1f}GB" if self.use_memory_profiling else "N/A"
-                })
-                # Immediate cleanup
-                del low_res, high_res, outputs, loss
-                # Handle checkpoints
-                self._handle_checkpoints(epoch + 1, batch_idx + 1, current_loss < self.best_loss)
-                # Periodic aggressive cleanup
-                if batch_idx % 20 == 0:
-                    self._aggressive_memory_cleanup()
+                        raise e

             # End of epoch processing
             avg_train_loss = epoch_loss / len(self.data_loader)
@@ -352,15 +370,6 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
         return self.best_loss

-        # Stop memory monitoring
-        if self.use_memory_profiling:
-            self.stop_monitoring = True
-            if self.memory_monitor_thread:
-                self.memory_monitor_thread.join(timeout=1)
-            print(f"Training completed. Peak GPU memory usage: {self.peak_memory:.2f}GB")
-        return self.best_loss

     def get_memory_summary(self):
         """Get a summary of memory usage during training"""

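The trainer changes above revolve around torch.compile's CUDA Graphs behaviour: captured output buffers are reused between iterations, so the code marks a new step each iteration with torch.compiler.cudagraph_mark_step_begin, clones outputs before reusing them, and retries when a CUDAGraphs-related RuntimeError surfaces during backward. Below is a minimal standalone sketch of that step-marking/clone pattern; the toy model, tensor shapes, and the choice of reduce-overhead compile mode are assumptions for illustration, not taken from this repository, and the snippet needs a CUDA device with PyTorch 2.x:

# Standalone sketch of the CUDAGraphs step-marking / clone pattern; assumes CUDA and PyTorch 2.x.
import torch
import torch.nn as nn

device = torch.device("cuda")
model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU()).to(device)
compiled = torch.compile(model, mode="reduce-overhead")  # mode that enables CUDA graph reuse

criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(compiled.parameters(), lr=1e-4)

for _ in range(3):
    # Tell the CUDA graph machinery that a new iteration starts, so buffers
    # captured in the previous step may be overwritten safely.
    torch.compiler.cudagraph_mark_step_begin()

    x = torch.randn(4, 3, 32, 32, device=device)
    target = torch.randn(4, 8, 32, 32, device=device)

    out = compiled(x).clone()  # clone before further use: the graph owns the original buffer
    loss = criterion(out, target)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
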
Changed file 3 of 3 (aiuNN model)

@@ -11,19 +11,11 @@ class aiuNN(PreTrainedModel):
         super().__init__(config)
         self.config = config

-        # Copy base layers into aiuNN for self-containment and portability
+        # Build base layers without dynamic unpacking
         if base_model is not None:
-            if hasattr(base_model, 'cnn'):
-                self.base_layers = nn.Sequential(*[layer for layer in base_model.cnn])
-            elif hasattr(base_model, 'shared_layer') and hasattr(base_model, 'unique_layers'):
-                layers = [base_model.shared_layer, base_model.activation, base_model.max_pool]
-                for ul in base_model.unique_layers:
-                    layers.extend([ul, base_model.activation, base_model.max_pool])
-                self.base_layers = nn.Sequential(*layers)
-            else:
-                self.base_layers = self._build_base_layers_from_config(config)
+            self.base_layers = self._copy_base_model_safely(base_model)
         else:
-            self.base_layers = self._build_base_layers_from_config(config)
+            self.base_layers = self._build_static_base_layers(config)

         # Bilinear upsampling head
         self.upsample = nn.Upsample(
@@ -38,18 +30,46 @@ class aiuNN(PreTrainedModel):
             padding=1
         )

-    def _build_base_layers_from_config(self, config):
-        layers = []
+    def _build_static_base_layers(self, config):
+        """Build layers without dynamic unpacking to avoid CUDAGraph issues"""
+        from collections import OrderedDict
+        layers = OrderedDict()
         in_channels = config.num_channels
-        for _ in range(config.num_hidden_layers):
-            layers.extend([
-                nn.Conv2d(in_channels, config.hidden_size,
-                          kernel_size=config.kernel_size, padding=1),
-                getattr(nn, config.activation_function)(),
-                nn.MaxPool2d(kernel_size=1, stride=1)
-            ])
+        for i in range(config.num_hidden_layers):
+            layers[f'conv_{i}'] = nn.Conv2d(
+                in_channels, config.hidden_size,
+                kernel_size=config.kernel_size, padding=1
+            )
+            layers[f'activation_{i}'] = getattr(nn, config.activation_function)()
+            layers[f'maxpool_{i}'] = nn.MaxPool2d(kernel_size=1, stride=1)
             in_channels = config.hidden_size
-        return nn.Sequential(*layers)
+        return nn.Sequential(layers)
+
+    def _copy_base_model_safely(self, base_model):
+        """Copy base model layers without dynamic unpacking"""
+        from collections import OrderedDict
+        layers = OrderedDict()
+        if hasattr(base_model, 'cnn'):
+            # Copy layers individually instead of unpacking
+            for i, layer in enumerate(base_model.cnn):
+                layers[f'copied_layer_{i}'] = layer
+        elif hasattr(base_model, 'shared_layer') and hasattr(base_model, 'unique_layers'):
+            # Build explicitly instead of using extend and unpack
+            layers['shared_layer'] = base_model.shared_layer
+            layers['shared_activation'] = base_model.activation
+            layers['shared_maxpool'] = base_model.max_pool
+            for i, ul in enumerate(base_model.unique_layers):
+                layers[f'unique_layer_{i}'] = ul
+                layers[f'unique_activation_{i}'] = base_model.activation
+                layers[f'unique_maxpool_{i}'] = base_model.max_pool
+        return nn.Sequential(layers)

     def forward(self, x):
         x = self.base_layers(x)
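Both new aiuNN helpers build their nn.Sequential from a named OrderedDict instead of unpacking a Python list, which keeps every submodule addressable by an explicit name. A small self-contained illustration of that pattern follows; the channel counts and layer count are made-up stand-ins for the config values:

# Illustration of the named-layer nn.Sequential pattern used in _build_static_base_layers.
from collections import OrderedDict

import torch
import torch.nn as nn

layers = OrderedDict()
in_channels = 3      # stand-in for config.num_channels
hidden_size = 16     # stand-in for config.hidden_size
for i in range(2):   # stand-in for config.num_hidden_layers
    layers[f"conv_{i}"] = nn.Conv2d(in_channels, hidden_size, kernel_size=3, padding=1)
    layers[f"activation_{i}"] = nn.ReLU()
    layers[f"maxpool_{i}"] = nn.MaxPool2d(kernel_size=1, stride=1)
    in_channels = hidden_size

# nn.Sequential accepts the OrderedDict directly; each submodule keeps its explicit name.
base_layers = nn.Sequential(layers)
print(base_layers.conv_0)                             # layers are addressable by name
print(base_layers(torch.randn(1, 3, 32, 32)).shape)   # torch.Size([1, 16, 32, 32])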