Compare commits
2 Commits
9592ab8140
...
de0699d6ba
Author | SHA1 | Date |
---|---|---|
Falko Victor Habel | de0699d6ba | |
Falko Victor Habel | cbfcad6088 |
|
@ -0,0 +1,32 @@
|
||||||
|
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
|
||||||
|
import pyarrow.parquet as pq
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
class FakeNewsInference:
|
||||||
|
def __init__(self, model_path):
|
||||||
|
self.tokenizer = BertTokenizer.from_pretrained(model_path)
|
||||||
|
self.model = BertForSequenceClassification.from_pretrained(model_path)
|
||||||
|
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||||
|
self.model.to(self.device)
|
||||||
|
self.model.eval()
|
||||||
|
|
||||||
|
def predict(self, text):
|
||||||
|
inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
|
||||||
|
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = self.model(**inputs)
|
||||||
|
probabilities = torch.softmax(outputs.logits, dim=1)
|
||||||
|
prediction = torch.argmax(probabilities, dim=1).item()
|
||||||
|
|
||||||
|
return prediction, probabilities[0][prediction].item()
|
||||||
|
|
||||||
|
# Usage example
|
||||||
|
if __name__ == '__main__':
|
||||||
|
# Inference example
|
||||||
|
inference = FakeNewsInference('VeriMind')
|
||||||
|
sample_text = "Breaking news: Scientists discover new planet in solar system"
|
||||||
|
prediction, confidence = inference.predict(sample_text)
|
||||||
|
print(f"Prediction: {'Real' if prediction == 1 else 'Fake'}")
|
||||||
|
print(f"Confidence: {confidence:.4f}")
|
|
@ -0,0 +1,95 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
|
||||||
|
from torch.utils.data import DataLoader, TensorDataset
|
||||||
|
import torch
|
||||||
|
from tqdm import tqdm
|
||||||
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
|
class FakeNewsModelTrainer:
|
||||||
|
def __init__(self, model_name='google-bert/bert-base-multilingual-cased', max_length=512):
|
||||||
|
self.model_name = model_name
|
||||||
|
self.max_length = max_length
|
||||||
|
self.tokenizer = BertTokenizer.from_pretrained(model_name)
|
||||||
|
self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
|
||||||
|
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||||
|
self.model.to(self.device)
|
||||||
|
|
||||||
|
def prepare_data(self, df):
|
||||||
|
texts = df['text'].tolist()
|
||||||
|
labels = df['label'].tolist()
|
||||||
|
|
||||||
|
encoded_texts = self.tokenizer(texts, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')
|
||||||
|
input_ids = encoded_texts['input_ids']
|
||||||
|
attention_mask = encoded_texts['attention_mask']
|
||||||
|
labels = torch.tensor(labels)
|
||||||
|
|
||||||
|
return TensorDataset(input_ids, attention_mask, labels)
|
||||||
|
|
||||||
|
def train(self, train_data, val_data, epochs=3, batch_size=16):
|
||||||
|
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
|
||||||
|
val_dataloader = DataLoader(val_data, batch_size=batch_size)
|
||||||
|
|
||||||
|
optimizer = AdamW(self.model.parameters(), lr=2e-5)
|
||||||
|
|
||||||
|
for epoch in range(epochs):
|
||||||
|
self.model.train()
|
||||||
|
total_loss = 0
|
||||||
|
|
||||||
|
for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
|
||||||
|
input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
|
||||||
|
|
||||||
|
outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
|
||||||
|
loss = outputs.loss
|
||||||
|
total_loss += loss.item()
|
||||||
|
|
||||||
|
loss.backward()
|
||||||
|
optimizer.step()
|
||||||
|
optimizer.zero_grad()
|
||||||
|
|
||||||
|
avg_train_loss = total_loss / len(train_dataloader)
|
||||||
|
print(f'Average training loss: {avg_train_loss:.4f}')
|
||||||
|
|
||||||
|
val_accuracy = self.evaluate(val_dataloader)
|
||||||
|
print(f'Validation accuracy: {val_accuracy:.4f}')
|
||||||
|
|
||||||
|
def evaluate(self, dataloader):
|
||||||
|
self.model.eval()
|
||||||
|
correct_predictions = 0
|
||||||
|
total_predictions = 0
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
for batch in dataloader:
|
||||||
|
input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
|
||||||
|
|
||||||
|
outputs = self.model(input_ids, attention_mask=attention_mask)
|
||||||
|
_, preds = torch.max(outputs.logits, dim=1)
|
||||||
|
|
||||||
|
correct_predictions += torch.sum(preds == labels)
|
||||||
|
total_predictions += labels.shape[0]
|
||||||
|
|
||||||
|
return correct_predictions.float() / total_predictions
|
||||||
|
|
||||||
|
def save_model(self, path):
|
||||||
|
self.model.save_pretrained(path)
|
||||||
|
self.tokenizer.save_pretrained(path)
|
||||||
|
|
||||||
|
|
||||||
|
# Usage example
|
||||||
|
if __name__ == '__main__':
|
||||||
|
# Load and preprocess the data
|
||||||
|
df = pq.read_table('your_dataset.parquet').to_pandas()
|
||||||
|
df['text'] = df['title'] + ' ' + df['text'] # Combine title and text
|
||||||
|
|
||||||
|
# Split the data
|
||||||
|
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)
|
||||||
|
|
||||||
|
# Initialize and train the model
|
||||||
|
trainer = FakeNewsModelTrainer()
|
||||||
|
train_data = trainer.prepare_data(train_df)
|
||||||
|
val_data = trainer.prepare_data(val_df)
|
||||||
|
trainer.train(train_data, val_data)
|
||||||
|
|
||||||
|
# Save the model
|
||||||
|
trainer.save_model('VeriMind')
|
Reference in New Issue