removed wrong files

This commit is contained in:
Falko Victor Habel 2024-08-29 11:50:47 +02:00
parent 6a1467eea1
commit f23023f948
2 changed files with 0 additions and 224 deletions

View File

@@ -1,173 +0,0 @@
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
class NewsDataset(Dataset):
    """Torch dataset that tokenizes one news text per item on access.

    Args:
        texts: Sized, indexable collection of texts. Each entry is passed
            through ``str()`` before tokenization.
        labels: Sized, indexable collection of labels aligned with ``texts``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # Fail fast: a mismatch would otherwise only surface as an IndexError
        # somewhere in the middle of an epoch (or be silently ignored).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, its encoded tensors and its label for ``item``."""
        text = str(self.texts[item])
        label = self.labels[item]
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # The tokenizer output carries a leading batch axis of size one;
            # flatten it so the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }
class FakeNewsTrainer:
    """Fine-tunes a sequence-classification model and reports validation metrics.

    Args:
        model: Model that is moved to ``device`` on construction. It is called
            as ``model(input_ids, attention_mask=..., labels=...)`` and its
            output must expose ``.loss`` and ``.logits``.
        tokenizer: Tokenizer handed to ``NewsDataset`` and saved with the model.
        device: Torch device used for the model and every batch.
    """

    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.model.to(self.device)

    def train(self, train_texts, train_labels, val_texts, val_labels,
              batch_size=16, num_epochs=5, learning_rate=2e-5):
        """Train for ``num_epochs`` epochs, validating after each one.

        Accuracy and the classification report are printed after every epoch.

        Returns:
            A list with one dict per epoch holding ``'epoch'``,
            ``'train_loss'``, ``'val_accuracy'`` and ``'val_report'``.
        """
        train_loader = DataLoader(
            NewsDataset(train_texts, train_labels, self.tokenizer),
            batch_size=batch_size,
            shuffle=True,
        )
        val_loader = DataLoader(
            NewsDataset(val_texts, val_labels, self.tokenizer),
            batch_size=batch_size,
        )
        # transformers.AdamW is deprecated upstream in favour of the PyTorch
        # implementation. eps and weight_decay are pinned to the defaults of
        # the old class so the optimisation hyper-parameters do not change.
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=learning_rate,
            eps=1e-6,
            weight_decay=0.0,
        )

        history = []
        for epoch in range(num_epochs):
            print(f'Epoch {epoch + 1}/{num_epochs}')
            train_loss = self._train_epoch(train_loader, optimizer)
            accuracy, report = self._evaluate(val_loader)
            print(f'Validation Accuracy: {accuracy}')
            print(f'Classification Report:\n{report}')
            history.append({
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_accuracy': accuracy,
                'val_report': report,
            })
        return history

    def _train_epoch(self, data_loader, optimizer):
        """Run one optimisation pass over ``data_loader``.

        Returns:
            The mean batch loss as a float, or ``0.0`` when the loader yields
            no batches.
        """
        self.model.train()
        total_loss = 0.0
        num_batches = 0
        for batch in data_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)
            outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        return total_loss / num_batches if num_batches else 0.0

    def _evaluate(self, data_loader):
        """Predict every batch without gradients.

        Returns:
            Tuple ``(accuracy, report)`` as produced by ``accuracy_score`` and
            ``classification_report``.
        """
        self.model.eval()
        predictions = []
        actual_labels = []
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                outputs = self.model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)
                predictions.extend(preds.cpu().tolist())
                # Labels are only compared on the host, so they never need to
                # be copied to the device.
                actual_labels.extend(batch['labels'].tolist())
        return (
            accuracy_score(actual_labels, predictions),
            classification_report(actual_labels, predictions),
        )

    def save_model(self, path):
        """Save both the model and the tokenizer under ``path``."""
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)
class FakeNewsInference:
    """Loads a saved classifier and labels single texts as 'Real' or 'Fake'.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the model
            and the tokenizer.
        device: Torch device the model and the encoded inputs are moved to.
    """

    def __init__(self, model_path, device):
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()
        self.device = device

    def predict(self, text, max_length=512):
        """Classify ``text``.

        Args:
            text: Text to classify; converted with ``str()`` first, the same
                way ``NewsDataset`` treats its training texts.
            max_length: Length the encoded sequence is padded or truncated to.

        Returns:
            ``'Real'`` when the highest-scoring class index is 1, otherwise
            ``'Fake'``.
        """
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            str(text),
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
        predicted_class = int(torch.argmax(outputs.logits, dim=1).item())
        return 'Real' if predicted_class == 1 else 'Fake'
class FakeNewsModel:
    """Facade that trains a fake-news classifier from a CSV or loads a saved one.

    Args:
        model_path: Optional location of a previously saved model. When given,
            the model is loaded for inference immediately; otherwise only the
            base tokenizer is loaded and ``train()`` must be called before
            ``predict()``.
    """

    # Checkpoint used for both the tokenizer and the freshly initialised model.
    BASE_MODEL_NAME = 'bert-base-multilingual-cased'

    def __init__(self, model_path=None):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if model_path:
            self.inference = FakeNewsInference(model_path, self.device)
            self.tokenizer = self.inference.tokenizer
        else:
            self.tokenizer = BertTokenizer.from_pretrained(self.BASE_MODEL_NAME)
            self.inference = None

    @staticmethod
    def _resolve_column(df, name):
        """Return the column of ``df`` matching ``name``, ignoring case.

        An exact match wins; otherwise exactly one case-insensitive match is
        required, so both ``Title`` and ``title`` headers are accepted.

        Raises:
            KeyError: If no column, or more than one column, matches.
        """
        if name in df.columns:
            return name
        wanted = name.lower()
        matches = [col for col in df.columns if str(col).lower() == wanted]
        if len(matches) == 1:
            return matches[0]
        if matches:
            raise KeyError(f"Ambiguous column {name!r}: candidates {matches}")
        raise KeyError(f"Column {name!r} not found; available: {list(df.columns)}")

    @classmethod
    def _load_dataset(cls, csv_path):
        """Read ``csv_path`` and return ``(texts, labels)`` as plain lists.

        Each text is the title and the body joined by a single space.
        """
        df = pd.read_csv(csv_path)
        title_col = cls._resolve_column(df, 'Title')
        text_col = cls._resolve_column(df, 'Text')
        label_col = cls._resolve_column(df, 'Label')
        # NaN + str is NaN, so a row with a missing title (or text) would lose
        # its remaining content and be tokenized as the literal string "nan".
        titles = df[title_col].fillna('').astype(str)
        bodies = df[text_col].fillna('').astype(str)
        combined = titles + ' ' + bodies
        return combined.tolist(), df[label_col].tolist()

    def train(self, csv_path, model_save_path, test_size=0.2, **kwargs):
        """Fine-tune the base model on ``csv_path`` and save it.

        Args:
            csv_path: CSV with title, text and label columns (header names are
                matched case-insensitively).
            model_save_path: Directory the model and tokenizer are saved to.
            test_size: Share of the rows held out for validation.
            **kwargs: Forwarded to ``FakeNewsTrainer.train``.

        After training, the saved model is reloaded so ``predict()`` works.
        """
        texts, labels = self._load_dataset(csv_path)
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            texts, labels, test_size=test_size, random_state=42
        )
        model = BertForSequenceClassification.from_pretrained(
            self.BASE_MODEL_NAME, num_labels=2
        )
        trainer = FakeNewsTrainer(model, self.tokenizer, self.device)
        trainer.train(train_texts, train_labels, val_texts, val_labels, **kwargs)
        trainer.save_model(model_save_path)
        self.inference = FakeNewsInference(model_save_path, self.device)

    def predict(self, text):
        """Classify ``text`` with the loaded model.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.inference is None:
            raise ValueError("Model not trained or loaded. Call train() or load a pre-trained model.")
        return self.inference.predict(text)
def _run_example():
    """Train on the WELFake CSV, save to 'VeriMind', then classify a sample.

    The sample is classified twice: once by the wrapper that was just trained
    and once by a second wrapper that reloads the saved model from disk.
    """
    training_options = {
        'csv_path': '/root/schule/WELFake_Dataset.csv',
        'model_save_path': 'VeriMind',
        'batch_size': 32,
        'num_epochs': 13,
        'learning_rate': 2e-5,
    }
    article = "Your sample news article text here"

    # Train from scratch and predict with the fresh model.
    trained = FakeNewsModel()
    trained.train(**training_options)
    print(f"The article is predicted to be: {trained.predict(article)}")

    # Reload the saved model and predict again.
    reloaded = FakeNewsModel('VeriMind')
    print(f"Prediction from pre-trained model: {reloaded.predict(article)}")


if __name__ == "__main__":
    _run_example()

View File

@@ -1,51 +0,0 @@
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
# Read the full dataset from disk.
df = pd.read_csv('/root/schule/WELFake_Dataset.csv')

# Draw a reproducible sample of one tenth of the rows (rounded down).
sample_size = int(df.shape[0] * 0.1)
df_sample = df.sample(random_state=42, n=sample_size)

# Tokenizer and seq2seq model are both loaded from the same checkpoint.
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
def translate(text):
    """Translate ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to translate.

    Returns:
        The decoded translation. Values that are not strings (for example the
        NaN pandas uses for missing cells, or ``None``) and blank strings are
        returned unchanged without calling the model.
    """
    # Guard before tokenizing: this function is applied to whole DataFrame
    # columns, and a single missing cell would otherwise abort the run.
    if not isinstance(text, str) or not text.strip():
        return text
    # NOTE(review): truncation=True with max_length=512 means only the first
    # 512 tokens of a long article are translated; the rest is dropped.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)
# Register progress_apply on pandas objects so each pass shows a progress bar.
tqdm.pandas()

# Translate the title column first, then the text column, into temporary columns.
for source_col, temp_col in (('title', 'title_de'), ('text', 'text_de')):
    df_sample[temp_col] = df_sample[source_col].progress_apply(translate)

# Shift the sampled serial numbers past the largest one in the original data.
max_serial = df['Serial'].max()
df_sample['Serial_de'] = df_sample['Serial'] + max_serial + 1

# Build the translated rows: overwrite the three original columns with their
# temporary counterparts, then discard the temporaries.
temp_for = {'Serial': 'Serial_de', 'title': 'title_de', 'text': 'text_de'}
df_translated = df_sample.assign(
    **{target: df_sample[temp] for target, temp in temp_for.items()}
).drop(columns=list(temp_for.values()))

# Append the translated rows to the originals and order everything by Serial.
df_combined = (
    pd.concat([df, df_translated], ignore_index=True)
    .sort_values('Serial')
    .reset_index(drop=True)
)

# Persist the result as parquet.
df_combined.to_parquet('combined_with_translations.parquet', index=False)
print("Translation, combination, and saving completed.")