diff --git a/src/model/train.py b/src/model/train.py deleted file mode 100644 index 48bf22c..0000000 --- a/src/model/train.py +++ /dev/null @@ -1,173 +0,0 @@ -import pandas as pd -import torch -from torch.utils.data import Dataset, DataLoader -from transformers import BertTokenizer, BertForSequenceClassification, AdamW -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, classification_report - -class NewsDataset(Dataset): - def __init__(self, texts, labels, tokenizer, max_len=512): - self.texts = texts - self.labels = labels - self.tokenizer = tokenizer - self.max_len = max_len - - def __len__(self): - return len(self.texts) - - def __getitem__(self, item): - text = str(self.texts[item]) - label = self.labels[item] - - encoding = self.tokenizer.encode_plus( - text, - add_special_tokens=True, - max_length=self.max_len, - return_token_type_ids=False, - padding='max_length', - truncation=True, - return_attention_mask=True, - return_tensors='pt', - ) - - return { - 'text': text, - 'input_ids': encoding['input_ids'].flatten(), - 'attention_mask': encoding['attention_mask'].flatten(), - 'labels': torch.tensor(label, dtype=torch.long) - } - -class FakeNewsTrainer: - def __init__(self, model, tokenizer, device): - self.model = model - self.tokenizer = tokenizer - self.device = device - self.model.to(self.device) - - def train(self, train_texts, train_labels, val_texts, val_labels, - batch_size=16, num_epochs=5, learning_rate=2e-5): - train_dataset = NewsDataset(train_texts, train_labels, self.tokenizer) - val_dataset = NewsDataset(val_texts, val_labels, self.tokenizer) - - train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) - val_loader = DataLoader(val_dataset, batch_size=batch_size) - - optimizer = AdamW(self.model.parameters(), lr=learning_rate) - - for epoch in range(num_epochs): - print(f'Epoch {epoch + 1}/{num_epochs}') - self._train_epoch(train_loader, optimizer) - accuracy, report = self._evaluate(val_loader) - print(f'Validation Accuracy: {accuracy}') - print(f'Classification Report:\n{report}') - - def _train_epoch(self, data_loader, optimizer): - self.model.train() - for batch in data_loader: - optimizer.zero_grad() - input_ids = batch['input_ids'].to(self.device) - attention_mask = batch['attention_mask'].to(self.device) - labels = batch['labels'].to(self.device) - outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels) - loss = outputs.loss - loss.backward() - optimizer.step() - - def _evaluate(self, data_loader): - self.model.eval() - predictions = [] - actual_labels = [] - with torch.no_grad(): - for batch in data_loader: - input_ids = batch['input_ids'].to(self.device) - attention_mask = batch['attention_mask'].to(self.device) - labels = batch['labels'].to(self.device) - outputs = self.model(input_ids, attention_mask=attention_mask) - _, preds = torch.max(outputs.logits, dim=1) - predictions.extend(preds.cpu().tolist()) - actual_labels.extend(labels.cpu().tolist()) - return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions) - - def save_model(self, path): - self.model.save_pretrained(path) - self.tokenizer.save_pretrained(path) - -class FakeNewsInference: - def __init__(self, model_path, device): - self.model = BertForSequenceClassification.from_pretrained(model_path) - self.tokenizer = BertTokenizer.from_pretrained(model_path) - self.model.to(device) - self.model.eval() - self.device = device - - def predict(self, text): - encoding = self.tokenizer.encode_plus( - text, - add_special_tokens=True, - max_length=512, - return_token_type_ids=False, - padding='max_length', - truncation=True, - return_attention_mask=True, - return_tensors='pt', - ) - input_ids = encoding['input_ids'].to(self.device) - attention_mask = encoding['attention_mask'].to(self.device) - with torch.no_grad(): - outputs = self.model(input_ids, attention_mask=attention_mask) - _, preds = torch.max(outputs.logits, dim=1) - return 'Real' if preds.item() == 1 else 'Fake' - -class FakeNewsModel: - def __init__(self, model_path=None): - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - if model_path: - self.inference = FakeNewsInference(model_path, self.device) - self.tokenizer = self.inference.tokenizer - else: - self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - self.inference = None - - def train(self, csv_path, model_save_path, test_size=0.2, **kwargs): - df = pd.read_csv(csv_path) - df['combined'] = df['Title'] + ' ' + df['Text'] - - train_texts, val_texts, train_labels, val_labels = train_test_split( - df['combined'].tolist(), df['Label'].tolist(), test_size=test_size, random_state=42 - ) - - model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2) - trainer = FakeNewsTrainer(model, self.tokenizer, self.device) - trainer.train(train_texts, train_labels, val_texts, val_labels, **kwargs) - trainer.save_model(model_save_path) - - self.inference = FakeNewsInference(model_save_path, self.device) - - def predict(self, text): - if self.inference is None: - raise ValueError("Model not trained or loaded. Call train() or load a pre-trained model.") - return self.inference.predict(text) - -# Example usage -if __name__ == "__main__": - # Initialize the model - fake_news_model = FakeNewsModel() - - # Train the model - fake_news_model.train( - csv_path='/root/schule/WELFake_Dataset.csv', - model_save_path='VeriMind', - batch_size=32, - num_epochs=13, - learning_rate=2e-5 - ) - - # Make a prediction - sample_text = "Your sample news article text here" - prediction = fake_news_model.predict(sample_text) - print(f"The article is predicted to be: {prediction}") - - # Load a pre-trained model - pretrained_model = FakeNewsModel('VeriMind') - prediction = pretrained_model.predict(sample_text) - print(f"Prediction from pre-trained model: {prediction}") diff --git a/src/model/translate.py b/src/model/translate.py deleted file mode 100644 index 4c58aec..0000000 --- a/src/model/translate.py +++ /dev/null @@ -1,51 +0,0 @@ -import pandas as pd -import torch -from transformers import AutoTokenizer, AutoModelForSeq2SeqLM -from tqdm import tqdm - -# Load the CSV file -df = pd.read_csv('/root/schule/WELFake_Dataset.csv') - -# Take a 10% sample -sample_size = int(len(df) * 0.1) -df_sample = df.sample(n=sample_size, random_state=42) - -# Load the translation model -model_name = "Helsinki-NLP/opus-mt-en-de" -tokenizer = AutoTokenizer.from_pretrained(model_name) -model = AutoModelForSeq2SeqLM.from_pretrained(model_name) - -# Function to translate text -def translate(text): - inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512) - translated = model.generate(**inputs) - return tokenizer.decode(translated[0], skip_special_tokens=True) - -# Translate 'text' and 'title' columns -tqdm.pandas() -df_sample['title_de'] = df_sample['title'].progress_apply(translate) -df_sample['text_de'] = df_sample['text'].progress_apply(translate) - -# Calculate the new serial numbers -max_serial = df['Serial'].max() -df_sample['Serial_de'] = df_sample['Serial'].apply(lambda x: x + max_serial + 1) - -# Create new rows with translated content -df_translated = df_sample.copy() -df_translated['Serial'] = df_translated['Serial_de'] -df_translated['title'] = df_translated['title_de'] -df_translated['text'] = df_translated['text_de'] - -# Drop the temporary columns -df_translated = df_translated.drop(['Serial_de', 'title_de', 'text_de'], axis=1) - -# Combine original and translated DataFrames -df_combined = pd.concat([df, df_translated], ignore_index=True) - -# Sort by Serial number -df_combined = df_combined.sort_values('Serial').reset_index(drop=True) - -# Save as parquet -df_combined.to_parquet('combined_with_translations.parquet', index=False) - -print("Translation, combination, and saving completed.")