improved pretraining

This commit is contained in:
Falko Victor Habel 2025-01-28 11:27:42 +01:00
parent 7de7eef081
commit a369c49f15
3 changed files with 71 additions and 22 deletions

View File

@@ -3,17 +3,28 @@
## Example Usage:
```Python
if __name__ == "__main__":
data_path1 = "/root/training_data/vision-dataset/images_checkpoint.parquet"
data_path2 = "/root/training_data/vision-dataset/vec_images_dataset.parquet"
from aiia.model import AIIABase
from aiia.model.config import AIIAConfig
from aiia.pretrain import Pretrainer
config = AIIAConfig(model_name="AIIA-Base-512x20k")
model = AIIABase(config)
pretrainer = Pretrainer(model, learning_rate=1e-4)
pretrainer.train(data_path1, data_path2, num_epochs=10)
from aiia.model import AIIABase
from aiia.model.config import AIIAConfig
from aiia.pretrain import Pretrainer
# Create your model
config = AIIAConfig(model_name="AIIA-Base-512x20k")
model = AIIABase(config)
# Initialize pretrainer with the model
pretrainer = Pretrainer(model, learning_rate=1e-4)
# List of dataset paths
dataset_paths = [
"/path/to/dataset1.parquet",
"/path/to/dataset2.parquet"
]
# Start training with multiple datasets
pretrainer.train(
dataset_paths=dataset_paths,
num_epochs=10,
batch_size=2,
sample_size=10000
)
```

27
example.py Normal file
View File

@@ -0,0 +1,27 @@
"""Example: pretrain a small AIIA base model on multiple parquet datasets."""
from aiia.model import AIIABase
from aiia.model.config import AIIAConfig
from aiia.pretrain import Pretrainer

# Parquet files holding the vision training data.
dataset_paths = [
    "/root/training_data/vision-dataset/images_checkpoint.parquet",
    "/root/training_data/vision-dataset/vec_images_dataset.parquet",
]

# Build a reduced-size base model from its configuration.
config = AIIAConfig(model_name="AIIA-Base-512x10k-small", num_hidden_layers=6, hidden_size=256)
model = AIIABase(config)

# Wrap the model in a pretrainer, reusing the learning rate from the config.
pretrainer = Pretrainer(model, learning_rate=config.learning_rate)

# Run pretraining over all listed datasets in one call.
pretrainer.train(
    dataset_paths=dataset_paths,
    num_epochs=10,
    batch_size=2,
    sample_size=10000,
)

View File

@@ -108,26 +108,37 @@ class Pretrainer:
return batch_loss
def train(self, data_path1, data_path2, num_epochs=3, batch_size=2, sample_size=10000):
def train(self, dataset_paths, column="image_bytes", num_epochs=3, batch_size=2, sample_size=10000):
"""
Train the model using the specified datasets.
Train the model using multiple specified datasets.
Args:
data_path1 (str): Path to first dataset
data_path2 (str): Path to second dataset
dataset_paths (list): List of paths to parquet datasets
num_epochs (int): Number of training epochs
batch_size (int): Batch size for training
sample_size (int): Number of samples to use from each dataset
"""
# Read and merge datasets
df1 = pd.read_parquet(data_path1).head(sample_size)
df2 = pd.read_parquet(data_path2).head(sample_size)
merged_df = pd.concat([df1, df2], ignore_index=True)
if not dataset_paths:
raise ValueError("No dataset paths provided")
# Read and merge all datasets
dataframes = []
for path in dataset_paths:
try:
df = pd.read_parquet(path).head(sample_size)
dataframes.append(df)
except Exception as e:
print(f"Error loading dataset {path}: {e}")
if not dataframes:
raise ValueError("No valid datasets could be loaded")
merged_df = pd.concat(dataframes, ignore_index=True)
# Initialize data loader
aiia_loader = AIIADataLoader(
merged_df,
column="image_bytes",
column=column,
batch_size=batch_size,
pretraining=True,
collate_fn=self.safe_collate