removed old generation model

2024-09-03 09:21:39 +02:00 · 2024-09-03 09:21:39 +02:00 · c47f54f452
parent 4af1d651da
commit c47f54f452
10 changed files with 69 additions and 119889 deletions
--- a/src/model/Inference.py
+++ b/src/model/Inference.py
@ -1,43 +0,0 @@
-from transformers import BertTokenizer, BertForSequenceClassification, AdamW
-import pyarrow.parquet as pq
-import torch
-
-
-class FakeNewsInference:
-    def __init__(self, model_path):
-        self.tokenizer = BertTokenizer.from_pretrained(model_path)
-        self.model = BertForSequenceClassification.from_pretrained(model_path)
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.model.to(self.device)
-        self.model.eval()
-
-    def predict(self, text):
-        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            outputs = self.model(**inputs)
-            probabilities = torch.softmax(outputs.logits, dim=1)
-            prediction = torch.argmax(probabilities, dim=1).item()
-
-        return prediction, probabilities[0][prediction].item()
-
-
-if __name__ == '__main__':
-    inference = FakeNewsInference('src/model/VeriMind')
-    # Beispiel von Postillion
-    sample_text = """München, Eindhoven (dpo) - Es ist ein wahrer Transfer-Hammer kurz vor Ablauf der offiziellen Frist: Der FC Bayern München verkauft seinen Ehrenpräsidenten Uli Hoeneß für 720.000 Euro an die PSV Eindhoven.
-
-"Der Vertrag ist unterzeichnet", bestätigte ein sichtlich erleichterter Bayern-Sportvorstand Max Eberl. "Da gibt's nichts mehr dran zu rütteln. Das ist fix. Wir danken Uli für die vielen Jahre der Loyalität und Treue und wünschen ihm einen super Start in den Niederlanden."
-
-In Eindhoven soll Hoeneß wie bereits bei den Münchnern das Amt des Ehrenpräsidenten übernehmen, der an allen Entscheidungen rumnörgelt, den Trainer vor der Boulevardpresse in die Pfanne haut und mit Fans streitet.
-
-Die Ablösesumme in Höhe von 720.000 Euro gilt als relativ gering für einen verdienten Fußballfunktionär wie Uli Hoeneß. Insidern zufolge sollen die Münchner der PSV finanziell stark entgegengekommen sein, damit der Deal auch ganz sicher zustande kommt.
-
-Auch der Zeitpunkt des Wechsels kurz vor Transferschluss gilt als ungewöhnlich, könnte aber das Ziel gehabt haben, Hoeneß zu überrumpeln und sicherzustellen, dass er auf keinen Fall wieder zurückwechselt.
-
-Angeblich befindet sich der Ex-Bayern-Präsident bereits auf dem Weg nach Eindhoven, wo er nach einem kurzen Medizincheck schon ab Montag zum ersten Mal das Training besuchen und dort Spieler und Trainer anmaulen könnte – vorerst noch mithilfe eines Dolmetschers, bis er die nötigen niederländischen Schimpfwörter gelernt hat.
-    """
-    prediction, confidence = inference.predict(sample_text)
-    print(f"Prediction: {'Real' if prediction == 1 else 'Fake'}")
-    print(f"Confidence: {confidence:.4f}")
--- a/src/model/VeriMind/config.json
+++ b/src/model/VeriMind/config.json
@ -1,32 +0,0 @@
-{
-  "_name_or_path": "google-bert/bert-base-multilingual-cased",
-  "architectures": [
-    "BertForSequenceClassification"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "classifier_dropout": null,
-  "directionality": "bidi",
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 0,
-  "pooler_fc_size": 768,
-  "pooler_num_attention_heads": 12,
-  "pooler_num_fc_layers": 3,
-  "pooler_size_per_head": 128,
-  "pooler_type": "first_token_transform",
-  "position_embedding_type": "absolute",
-  "problem_type": "single_label_classification",
-  "torch_dtype": "float32",
-  "transformers_version": "4.44.0",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 119547
-}
--- a/src/model/VeriMind/model.safetensors
+++ b/src/model/VeriMind/model.safetensors
--- a/src/model/VeriMind/special_tokens_map.json
+++ b/src/model/VeriMind/special_tokens_map.json
@ -1,7 +0,0 @@
-{
-  "cls_token": "[CLS]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
-}
--- a/src/model/VeriMind/tokenizer_config.json
+++ b/src/model/VeriMind/tokenizer_config.json
@ -1,57 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "100": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "101": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "102": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "103": {
-      "content": "[MASK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": false,
-  "mask_token": "[MASK]",
-  "model_max_length": 512,
-  "never_split": null,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
-}
--- a/src/model/VeriMind/vocab.txt
+++ b/src/model/VeriMind/vocab.txt
--- a/src/model/prompt.txt
+++ b/src/model/prompt.txt
@ -0,0 +1,14 @@
+I have 4 datasets:
+
+One English dataset that looks like this:
+The dataset contains four columns: Serial number (starting from 0); Title (the news headline); Text (the news content); and Label (0 = fake and 1 = real).
+
+Then 2 datasets with real German news articles that look like this:
+**Category | Text**
+One of them is for training and the other is for testing.
+
+And the last one, which contains only fake German news articles, looks like this:
+**id** | **url** | **Titel** | **Body** [...]
+
+They are all CSV files, and I want to train a multilingual fake news classification model with them.
+The model should tell me whether an article is real or fake.
--- a/src/model/text_preprocess.py
+++ b/src/model/text_preprocess.py
@ -0,0 +1,46 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+# Load the four source CSVs (paths are resolved relative to the current working directory)
+english_df = pd.read_csv('english_dataset.csv')
+german_real_train_df = pd.read_csv('german_real_train.csv')
+german_real_test_df = pd.read_csv('german_real_test.csv')
+german_fake_df = pd.read_csv('german_fake.csv')

+# English data already has Title/Text/Label; tag the language and keep only the shared schema
+english_df['language'] = 'en'
+english_df = english_df[['Title', 'Text', 'Label', 'language']]

+# German real news: merge the provided train/test files into one frame with a clean index
+german_real_df = pd.concat([german_real_train_df, german_real_test_df], ignore_index=True)
+german_real_df = german_real_df.assign(Label=1, language='de')  # Real news
+german_real_df = german_real_df.rename(columns={'Category': 'Title'})  # category stands in for the title
+german_real_df = german_real_df[['Title', 'Text', 'Label', 'language']]  # same schema as the other frames

+german_fake_df['Label'] = 0  # Fake news
+german_fake_df['language'] = 'de'
+german_fake_df = german_fake_df.rename(columns={'Titel': 'Title', 'Body': 'Text'})  # map German column names
+german_fake_df = german_fake_df[['Title', 'Text', 'Label', 'language']]

+# Combine all datasets
+combined_df = pd.concat([english_df, german_real_df, german_fake_df], ignore_index=True)

+# Shuffle with a fixed seed so the saved train/test files are identical across runs
+combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

+# Stratified 80/20 split keeps the fake/real ratio the same in both output files
+train_df, test_df = train_test_split(combined_df, test_size=0.2, stratify=combined_df['Label'], random_state=42)

+# Save processed datasets
+train_df.to_csv('multilingual_fake_news_train.csv', index=False)
+test_df.to_csv('multilingual_fake_news_test.csv', index=False)

+print("Dataset statistics:")
+print(f"Total samples: {len(combined_df)}")
+print(f"Training samples: {len(train_df)}")
+print(f"Test samples: {len(test_df)}")
+print("\nLabel distribution:")
+print(combined_df['Label'].value_counts(normalize=True))
+print("\nLanguage distribution:")
+print(combined_df['language'].value_counts(normalize=True))
--- a/src/model/train.py
+++ b/src/model/train.py
@ -1,182 +0,0 @@
-import pandas as pd
-import numpy as np
-from sklearn.model_selection import train_test_split
-from transformers import BertTokenizer, BertConfig, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
-from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
-import torch
-from tqdm import tqdm
-import pyarrow.parquet as pq
-from sklearn.metrics import classification_report, confusion_matrix
-
-class FakeNewsModelTrainer:
-    def __init__(self, model_name='google-bert/bert-base-multilingual-cased', max_length=512, size_factor=0.5):
-        self.model_name = model_name
-        self.max_length = max_length
-        self.tokenizer = BertTokenizer.from_pretrained(model_name)
-        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.model.to(self.device)
-
-        # Load the original config
-        original_config = BertConfig.from_pretrained(model_name)
-        
-        # Calculate new dimensions
-        new_num_attention_heads = max(int(original_config.num_attention_heads * size_factor ** 0.5), 1)
-        new_hidden_size = new_num_attention_heads * max(int((original_config.hidden_size // original_config.num_attention_heads) * size_factor ** 0.5), 1)
-        new_num_hidden_layers = max(int(original_config.num_hidden_layers * size_factor ** 0.5), 1)
-        
-        # Create a new config with reduced size
-        config = BertConfig(
-            vocab_size=original_config.vocab_size,
-            hidden_size=new_hidden_size,
-            num_hidden_layers=new_num_hidden_layers,
-            num_attention_heads=new_num_attention_heads,
-            intermediate_size=new_hidden_size * 4,
-            max_position_embeddings=original_config.max_position_embeddings,
-            num_labels=2
-        )
-        
-        # Initialize the model with the new config
-        self.model = BertForSequenceClassification(config)
-
-    def prepare_data(self, df):
-        texts = df.apply(lambda row: f"{row['title'] or ''} {row['text'] or ''}".strip(), axis=1).tolist()
-        labels = df['label'].tolist()
-
-        valid_texts = []
-        valid_labels = []
-        for text, label in zip(texts, labels):
-            if text.strip():
-                valid_texts.append(text)
-                valid_labels.append(label)
-
-        if not valid_texts:
-            raise ValueError("No valid texts found in the dataset after filtering empty entries.")
-
-        encoded_texts = self.tokenizer(valid_texts, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')
-        input_ids = encoded_texts['input_ids']
-        attention_mask = encoded_texts['attention_mask']
-        labels = torch.tensor(valid_labels)
-
-        # Create a weighted sampler for balanced batches
-        class_sample_count = np.array([len(np.where(valid_labels == t)[0]) for t in np.unique(valid_labels)])
-        weight = 1. / class_sample_count
-        samples_weight = np.array([weight[t] for t in valid_labels])
-        samples_weight = torch.from_numpy(samples_weight)
-        sampler = WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight))
-
-        return TensorDataset(input_ids, attention_mask, labels), sampler
-
-    def train(self, train_data, val_data, epochs=13, batch_size=16):
-        train_dataset, train_sampler = train_data
-        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
-        val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
-
-        optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
-        total_steps = len(train_dataloader) * epochs
-        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
-
-        self.model.to(self.device)  # Ensure model is on the correct device
-
-        for epoch in range(epochs):
-            self.model.train()
-            total_loss = 0
-
-            for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
-                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
-
-                self.model.zero_grad()
-                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
-                loss = outputs.loss
-                total_loss += loss.item()
-
-                loss.backward()
-                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
-                optimizer.step()
-                scheduler.step()
-
-            avg_train_loss = total_loss / len(train_dataloader)
-            print(f'Average training loss: {avg_train_loss:.4f}')
-
-            val_accuracy, val_report = self.evaluate(val_dataloader)
-            print(f'Validation accuracy: {val_accuracy:.4f}')
-            print('Validation Classification Report:')
-            print(val_report)
-
-    def evaluate(self, dataloader):
-        self.model.eval()
-        predictions = []
-        true_labels = []
-
-        with torch.no_grad():
-            for batch in dataloader:
-                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
-
-                outputs = self.model(input_ids, attention_mask=attention_mask)
-                _, preds = torch.max(outputs.logits, dim=1)
-
-                predictions.extend(preds.cpu().tolist())
-                true_labels.extend(labels.cpu().tolist())
-
-        accuracy = sum(1 for p, t in zip(predictions, true_labels) if p == t) / len(true_labels)
-        report = classification_report(true_labels, predictions, target_names=['Fake', 'Real'])
-        
-        print('Confusion Matrix:')
-        print(confusion_matrix(true_labels, predictions))
-
-        return accuracy, report
-
-    def save_model(self, path):
-        self.model.save_pretrained(path)
-        self.tokenizer.save_pretrained(path)
-
-
-class FakeNewsInference:
-    def __init__(self, model_path):
-        self.tokenizer = BertTokenizer.from_pretrained(model_path)
-        self.model = BertForSequenceClassification.from_pretrained(model_path)
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.model.to(self.device)
-        self.model.eval()
-
-    def predict(self, text):
-        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            outputs = self.model(**inputs)
-            probabilities = torch.softmax(outputs.logits, dim=1)
-            prediction = torch.argmax(probabilities, dim=1).item()
-
-        return prediction, probabilities[0][prediction].item()
-
-# Usage example
-if __name__ == '__main__':
-    # Load and preprocess the data
-    df = pq.read_table('dataset.parquet').to_pandas()
-    
-    # Split the data
-    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
-
-    # Initialize and train the model
-    trainer = FakeNewsModelTrainer()
-    train_data = trainer.prepare_data(train_df)
-    val_data = trainer.prepare_data(val_df)[0]
-    trainer.train(train_data, val_data)
-
-    # Save the model
-    trainer.save_model('VeriMind')
-
-    # Inference example
-    inference = FakeNewsInference('fake_news_detector_model')
-    sample_texts = [
-        "Breaking news: Scientists discover new planet in solar system",
-        "Celebrity secretly lizard person, unnamed sources claim",
-        "New study shows benefits of regular exercise",
-        "Government admits to hiding alien life, whistleblower reveals"
-    ]
-    for text in sample_texts:
-        prediction, confidence = inference.predict(text)
-        print(f"Text: {text}")
-        print(f"Prediction: {'Real' if prediction == 1 else 'Fake'}")
-        print(f"Confidence: {confidence:.4f}\n")
--- a/src/model/translate.py
+++ b/src/model/translate.py
@ -18,8 +18,8 @@ if 'Unnamed: 0' not in df.columns:
    print("'Unnamed: 0' column not found. Please check your CSV file.")
    exit(1)

-# Take a sample of 10 entries
-sample_size = 5000
+# Take a sample of 10,000 entries
+sample_size = 10000
 df_sample = df.sample(n=sample_size, random_state=42)

 # Load the translation model
@ -28,17 +28,20 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

 # Function to translate text
-def translate(text):
+def translate(text, max_length=512):
    if pd.isna(text) or text == '':
        return ''
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # Remove special characters and limit length
+    text = ''.join(char for char in text if char.isalnum() or char.isspace())
+    text = text[:max_length]
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

 # Translate 'text' and 'title' columns
 tqdm.pandas()
 df_sample['title_de'] = df_sample['title'].fillna('').progress_apply(translate)
-df_sample['text_de'] = df_sample['text'].fillna('').progress_apply(translate)
+df_sample['text_de'] = df_sample['text'].fillna('').progress_apply(lambda x: translate(x, max_length=1024))

 # Calculate the new serial numbers
 max_serial = df['Unnamed: 0'].max()
@ -47,19 +50,4 @@ df_sample['Unnamed: 0_de'] = df_sample['Unnamed: 0'].apply(lambda x: x + max_ser
 # Create new rows with translated content
 df_translated = df_sample.copy()
 df_translated['Unnamed: 0'] = df_translated['Unnamed: 0_de']
-df_translated['title'] = df_translated['title_de']
-df_translated['text'] = df_translated['text_de']
-
-# Drop the temporary columns
-df_translated = df_translated.drop(['Unnamed: 0_de', 'title_de', 'text_de'], axis=1)
-
-# Combine original and translated DataFrames
-df_combined = pd.concat([df, df_translated], ignore_index=True)
-
-# Sort by 'Unnamed: 0' (serial) number
-df_combined = df_combined.sort_values('Unnamed: 0').reset_index(drop=True)
-
-# Save as parquet
-df_combined.to_parquet('combined_with_translations_10_samples.parquet', index=False)
-
-print("Translation, combination, and saving completed.")
+df_translated['title'] = df_translated['title_de