Compare commits

6 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 2bb73cc8f4 | |
| | 622cc2cc35 | |
| | 7e639835b1 | |
| | cd36b75e5f | |
| | 25596a7916 | |
| | c4a5cfec9f | |
```diff
@@ -21,17 +21,17 @@ jobs:
       - name: Clone additional repository
         run: |
           git config --global credential.helper cache
           git clone https://fabel:${{ secrets.CICD }}@gitea.fabelous.app/fabel/VectorLoader.git
 
       - name: Install dependencies
         run: |
           cd VectorLoader
           python -m pip install --upgrade pip
           pip install -r requirements.txt
 
       - name: Run vectorizing
         env:
           VECTORDB_TOKEN: ${{ secrets.VECTORDB_TOKEN }}
         run: |
           cd VectorLoader
-          python -m src.run
+          python -m src.run --repo-path ${{ gitea.workspace }}
```
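The only functional change in this hunk is the new `--repo-path` flag passed to `src.run`, pointing the vectorizer at the workflow's checkout directory. How `src.run` consumes the flag is not shown in this compare; below is a minimal hedged sketch of what such an entry point might look like. Only the `--repo-path` name comes from the workflow above; everything else is an assumption.

```python
import argparse
from pathlib import Path

# Hypothetical sketch of VectorLoader's src/run.py entry point.
# Only the --repo-path flag is taken from the workflow; the rest is assumed.
def main():
    parser = argparse.ArgumentParser(description="Vectorize a repository")
    parser.add_argument("--repo-path", type=Path, default=Path("."),
                        help="Repository checkout to vectorize (e.g. the Gitea workspace)")
    args = parser.parse_args()

    # Placeholder for the real embedding/upload step.
    for path in args.repo_path.rglob("*.py"):
        print(f"would vectorize {path}")

if __name__ == "__main__":
    main()
```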
```diff
@@ -162,7 +162,6 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
             pass
 
     def _evaluate_memory_efficient(self):
         """Memory-efficient validation with smaller chunks"""
         if self.validation_loader is None:
             return 0.0
-
```
```diff
@@ -172,13 +171,15 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
         with torch.no_grad():
             for low_res, high_res in self.validation_loader:
                 # Mark step for validation too
                 if self.use_model_compilation:
                     torch.compiler.cudagraph_mark_step_begin()
 
                 low_res = low_res.to(self.device, non_blocking=True).to(memory_format=torch.channels_last)
                 high_res = high_res.to(self.device, non_blocking=True)
 
                 with autocast(device_type=self.device.type):
                     outputs = self.model(low_res)
-                    loss = self.criterion(outputs, high_res)
+                    outputs = outputs.clone()
+                    loss = self.criterion(outputs, high_res)
 
                 val_loss += loss.item()
                 num_batches += 1
```
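Both changes in this hunk target the same `torch.compile` + CUDAGraphs constraint: the CUDAGraph runtime reuses output buffers across iterations, so each iteration must be announced with `torch.compiler.cudagraph_mark_step_begin()`, and any output read after the step boundary must be `clone()`d out of the graph-owned buffer first. A minimal sketch of that pattern outside this trainer; `model`, `criterion`, `loader`, and `device` are placeholders:

```python
import torch
from torch.amp import autocast

# Minimal sketch: a validation loop that is safe under torch.compile with
# CUDAGraphs. All names are placeholders, not part of the PR.
def validate(model, criterion, loader, device):
    model.eval()
    total, batches = 0.0, 0
    with torch.no_grad():
        for low_res, high_res in loader:
            # Announce a new iteration so cached output buffers from the
            # previous step may be reused/overwritten.
            torch.compiler.cudagraph_mark_step_begin()
            low_res = low_res.to(device, non_blocking=True)
            high_res = high_res.to(device, non_blocking=True)
            with autocast(device_type=device.type):
                # clone() copies the result out of the CUDAGraph-owned buffer.
                out = model(low_res).clone()
                loss = criterion(out, high_res)
            total += loss.item()
            batches += 1
    model.train()
    return total / max(batches, 1)
```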
```diff
@@ -186,16 +187,54 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
                 # Immediate cleanup
                 del low_res, high_res, outputs, loss
 
                 # Aggressive cleanup every few batches
-                if num_batches % 10 == 0:
-                    self._aggressive_memory_cleanup()
+                if num_batches % 5 == 0:
+                    self._enhanced_memory_cleanup()
 
         self.model.train()
         # Final cleanup after validation
         self._aggressive_memory_cleanup()
 
         return val_loss / max(num_batches, 1)
 
+    def _enhanced_memory_cleanup(self):
+        """Enhanced memory cleanup for CUDAGraph compatibility and memory optimization"""
+        # Clear gradients properly - set to None for better memory efficiency
+        if hasattr(self, 'optimizer') and self.optimizer is not None:
+            self.optimizer.zero_grad(set_to_none=True)
+
+        # Clear model gradients explicitly
+        if hasattr(self, 'model') and self.model is not None:
+            for param in self.model.parameters():
+                if param.grad is not None:
+                    param.grad = None
+
+        # Force Python garbage collection
+        import gc
+        gc.collect()
+
+        # Clear PyTorch CUDA cache and synchronize
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+
+            # Reset memory stats periodically
+            if hasattr(torch.cuda, 'reset_peak_memory_stats'):
+                torch.cuda.reset_peak_memory_stats()
+
+        # Clear any lingering autograd computation graphs
+        with torch.no_grad():
+            pass
+
+        # Mark new step for CUDAGraphs to prevent tensor conflicts
+        if self.use_model_compilation and hasattr(torch.compiler, 'cudagraph_mark_step_begin'):
+            torch.compiler.cudagraph_mark_step_begin()
+
+        # Clear any cached compilation artifacts
+        if hasattr(torch, '_dynamo') and hasattr(torch._dynamo, 'reset'):
+            try:
+                torch._dynamo.reset()
+            except Exception:
+                pass  # Ignore if reset fails
+
     def finetune(self, output_path, epochs=10, lr=1e-4, patience=3, min_delta=0.001):
         """Enhanced training with memory optimizations"""
         if self.data_loader is None:
```
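Two details of `_enhanced_memory_cleanup` are worth flagging. `zero_grad(set_to_none=True)` releases gradient storage instead of filling it with zeros, which is where the memory saving comes from. `torch._dynamo.reset()`, by contrast, discards all compiled artifacts, so the next forward pass pays the full recompilation cost; calling it every five validation batches trades considerable speed for memory. A hedged sketch for checking what a cleanup pass actually releases; `report_cleanup_effect` is a hypothetical helper, not part of this change:

```python
import torch

# Hypothetical check (not in the PR): measure how much allocator memory a
# cleanup pass releases on a CUDA device.
def report_cleanup_effect(trainer):
    if not torch.cuda.is_available():
        return
    before = torch.cuda.memory_allocated()
    trainer._enhanced_memory_cleanup()
    after = torch.cuda.memory_allocated()
    print(f"allocated: {before / 1e9:.2f}GB -> {after / 1e9:.2f}GB")
```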
```diff
@@ -242,65 +281,44 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
                                 total=len(train_batches),
                                 desc=f"Epoch {epoch + 1}/{epochs}")
 
             for batch_idx, (low_res, high_res) in progress_bar:
                 # Mark step for CUDAGraphs
                 if self.use_model_compilation:
                     torch.compiler.cudagraph_mark_step_begin()
 
                 # Move data to device
                 low_res = low_res.to(self.device, non_blocking=True).to(memory_format=torch.channels_last)
                 high_res = high_res.to(self.device, non_blocking=True)
 
-                # Forward pass
+                # Forward pass with enhanced cloning
                 with autocast(device_type=self.device.type):
                     if hasattr(self, 'use_checkpointing') and self.use_checkpointing:
                         low_res.requires_grad_()
                         outputs = checkpoint(self.model, low_res)
-                        outputs = outputs.clone()
                     else:
                         outputs = self.model(low_res)
-                        outputs = outputs.clone()
 
+                    # CRITICAL: Clone outputs before loss computation
+                    outputs = outputs.detach().clone().requires_grad_(True)
                     loss = self.criterion(outputs, high_res)
 
                 # Scale loss for gradient accumulation
                 if self.use_gradient_accumulation:
                     loss = loss / self.accumulation_steps
 
-                # Backward pass
-                self.scaler.scale(loss).backward()
+                # Enhanced backward pass with proper cleanup
+                try:
+                    self.scaler.scale(loss).backward()
+                except RuntimeError as e:
+                    if "CUDAGraphs" in str(e):
+                        # Fallback: clear everything and retry
+                        del outputs, loss
+                        torch.cuda.empty_cache()
+                        if self.use_model_compilation:
+                            torch.compiler.cudagraph_mark_step_begin()
+                        continue
+                    else:
+                        raise e
 
                 accumulation_loss += loss.item()
 
                 # Update weights every accumulation_steps or at the end of epoch
                 should_step = (not self.use_gradient_accumulation or
                                (batch_idx + 1) % self.accumulation_steps == 0 or
                                batch_idx == len(train_batches) - 1)
 
                 if should_step:
                     self.scaler.step(self.optimizer)
                     self.scaler.update()
                     self.optimizer.zero_grad()
 
                     # Add accumulated loss to epoch loss
                     if self.use_gradient_accumulation:
                         epoch_loss += accumulation_loss
                         accumulation_loss = 0.0
                     else:
                         epoch_loss += loss.item()
 
                 # Update progress bar
                 current_loss = accumulation_loss if self.use_gradient_accumulation else loss.item()
                 progress_bar.set_postfix({
                     'loss': current_loss,
                     'peak_mem': f"{self.peak_memory:.1f}GB" if self.use_memory_profiling else "N/A"
                 })
 
                 # Immediate cleanup
                 del low_res, high_res, outputs, loss
 
                 # Handle checkpoints
                 self._handle_checkpoints(epoch + 1, batch_idx + 1, current_loss < self.best_loss)
 
                 # Periodic aggressive cleanup
                 if batch_idx % 20 == 0:
                     self._aggressive_memory_cleanup()
 
             # End of epoch processing
             avg_train_loss = epoch_loss / len(self.data_loader)
```
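One behavioral note on the new forward path: `outputs.detach().clone().requires_grad_(True)` cuts the autograd graph between the loss and the model, so `backward()` on that loss produces no gradients for the model weights and the optimizer step becomes a no-op. `clone()` alone is the CUDAGraph-safe variant that keeps gradient flow intact. A minimal sketch of AMP plus gradient accumulation using the non-detaching clone; all names are placeholders, not this trainer's API:

```python
import torch
from torch.amp import GradScaler, autocast

# Minimal sketch (assumed names throughout): mixed precision with gradient
# accumulation. Cloning the output stays CUDAGraph-safe, but detach() would
# stop gradients from reaching the model weights.
def train_epoch(model, criterion, optimizer, loader, device, accum_steps=4):
    scaler = GradScaler()
    optimizer.zero_grad(set_to_none=True)
    for i, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)
        with autocast(device_type=device.type):
            out = model(x).clone()          # clone, but do NOT detach
            loss = criterion(out, y) / accum_steps
        scaler.scale(loss).backward()       # gradients accumulate across steps
        if (i + 1) % accum_steps == 0 or i == len(loader) - 1:
            scaler.step(optimizer)          # unscale and apply optimizer step
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
```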
```diff
@@ -352,15 +370,6 @@ class MemoryOptimizedTrainer(aiuNNTrainer):
 
         return self.best_loss
 
-        # Stop memory monitoring
-        if self.use_memory_profiling:
-            self.stop_monitoring = True
-            if self.memory_monitor_thread:
-                self.memory_monitor_thread.join(timeout=1)
-            print(f"Training completed. Peak GPU memory usage: {self.peak_memory:.2f}GB")
-
-        return self.best_loss
-
     def get_memory_summary(self):
         """Get a summary of memory usage during training"""
```
```diff
@@ -11,19 +11,11 @@ class aiuNN(PreTrainedModel):
         super().__init__(config)
         self.config = config
 
-        # Copy base layers into aiuNN for self-containment and portability
+        # Build base layers without dynamic unpacking
         if base_model is not None:
-            if hasattr(base_model, 'cnn'):
-                self.base_layers = nn.Sequential(*[layer for layer in base_model.cnn])
-            elif hasattr(base_model, 'shared_layer') and hasattr(base_model, 'unique_layers'):
-                layers = [base_model.shared_layer, base_model.activation, base_model.max_pool]
-                for ul in base_model.unique_layers:
-                    layers.extend([ul, base_model.activation, base_model.max_pool])
-                self.base_layers = nn.Sequential(*layers)
-            else:
-                self.base_layers = self._build_base_layers_from_config(config)
+            self.base_layers = self._copy_base_model_safely(base_model)
         else:
-            self.base_layers = self._build_base_layers_from_config(config)
+            self.base_layers = self._build_static_base_layers(config)
 
         # Bilinear upsampling head
         self.upsample = nn.Upsample(
```
```diff
@@ -38,18 +30,46 @@ class aiuNN(PreTrainedModel):
             padding=1
         )
 
-    def _build_base_layers_from_config(self, config):
-        layers = []
+    def _build_static_base_layers(self, config):
+        """Build layers without dynamic unpacking to avoid CUDAGraph issues"""
+        from collections import OrderedDict
+
+        layers = OrderedDict()
         in_channels = config.num_channels
-        for _ in range(config.num_hidden_layers):
-            layers.extend([
-                nn.Conv2d(in_channels, config.hidden_size,
-                          kernel_size=config.kernel_size, padding=1),
-                getattr(nn, config.activation_function)(),
-                nn.MaxPool2d(kernel_size=1, stride=1)
-            ])
+        for i in range(config.num_hidden_layers):
+            layers[f'conv_{i}'] = nn.Conv2d(
+                in_channels, config.hidden_size,
+                kernel_size=config.kernel_size, padding=1
+            )
+            layers[f'activation_{i}'] = getattr(nn, config.activation_function)()
+            layers[f'maxpool_{i}'] = nn.MaxPool2d(kernel_size=1, stride=1)
             in_channels = config.hidden_size
-        return nn.Sequential(*layers)
+
+        return nn.Sequential(layers)
+
+    def _copy_base_model_safely(self, base_model):
+        """Copy base model layers without dynamic unpacking"""
+        from collections import OrderedDict
+
+        layers = OrderedDict()
+
+        if hasattr(base_model, 'cnn'):
+            # Copy layers individually instead of unpacking
+            for i, layer in enumerate(base_model.cnn):
+                layers[f'copied_layer_{i}'] = layer
+        elif hasattr(base_model, 'shared_layer') and hasattr(base_model, 'unique_layers'):
+            # Build explicitly instead of using extend and unpack
+            layers['shared_layer'] = base_model.shared_layer
+            layers['shared_activation'] = base_model.activation
+            layers['shared_maxpool'] = base_model.max_pool
+
+            for i, ul in enumerate(base_model.unique_layers):
+                layers[f'unique_layer_{i}'] = ul
+                layers[f'unique_activation_{i}'] = base_model.activation
+                layers[f'unique_maxpool_{i}'] = base_model.max_pool
+
+        return nn.Sequential(layers)
 
     def forward(self, x):
         x = self.base_layers(x)
```
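The `OrderedDict` construction both new methods share gives every submodule a stable, descriptive name and avoids the `nn.Sequential(*layers)` argument unpacking the old code used. A minimal self-contained sketch of the pattern; the channel counts and layer names here are illustrative only:

```python
import torch
from collections import OrderedDict
from torch import nn

# nn.Sequential accepts an OrderedDict, so each submodule gets a stable name
# instead of a positional index. Values below are illustrative.
layers = OrderedDict()
layers['conv_0'] = nn.Conv2d(3, 16, kernel_size=3, padding=1)
layers['activation_0'] = nn.ReLU()
layers['maxpool_0'] = nn.MaxPool2d(kernel_size=1, stride=1)  # no-op pool, as in the diff
model = nn.Sequential(layers)

print(model.conv_0)                      # submodules addressable by name
out = model(torch.randn(1, 3, 32, 32))   # forward works as with a plain Sequential
print(out.shape)                         # torch.Size([1, 16, 32, 32])
```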