Compare commits

...

22 Commits

Author SHA1 Message Date
Falko Victor Habel f0a2e77701 Merge pull request 'test_model' (#13) from test_model into develop
Reviewed-on: #13
2024-09-03 08:53:14 +00:00
Falko Victor Habel 4af1d651da increased epoch 2024-09-01 16:08:50 +02:00
Falko Victor Habel da78ad357a removed model error when it comes to storage 2024-09-01 16:08:33 +02:00
Falko Victor Habel 5d8ec1a01f corrected bertconfig init 2024-09-01 11:06:08 +02:00
Falko Victor Habel 306cd5619d bert config added 2024-08-31 21:40:33 +02:00
Falko Victor Habel a824724854 included inference into training and added weighted sampler 2024-08-31 21:40:20 +02:00
Falko Victor Habel cd8e1857ea removed old model 2024-08-31 21:14:47 +02:00
Falko Victor Habel b55fc02ede switched to 50% Bert instead of 25% 2024-08-31 21:14:31 +02:00
Falko Habel beb9eedc1f VeriMindSmall added 2024-08-31 18:26:26 +00:00
Falko Victor Habel be17dd3d17 updated size 2024-08-31 14:14:05 +02:00
Falko Victor Habel 4327bbf05a updated inference and removed comments 2024-08-31 13:40:05 +02:00
Falko Habel 424ee27357 added VeriMind Model 2024-08-31 11:22:50 +00:00
Falko Victor Habel 7b07ad91a1 added validation of trainingsdata 2024-08-31 08:19:47 +02:00
Falko Victor Habel aa6c343c40 corrected dataset path, increased epoch and batch_size 2024-08-31 08:09:49 +02:00
Falko Victor Habel de0699d6ba inference class added 2024-08-31 08:06:28 +02:00
Falko Victor Habel cbfcad6088 training class added 2024-08-31 08:06:05 +02:00
Falko Victor Habel 9592ab8140 updated sample size 2024-08-30 08:06:33 +02:00
Falko Victor Habel dc3e8ba73c using unnamed for the translator 2024-08-30 08:04:47 +02:00
Falko Victor Habel 55a50276fa better integration 2024-08-30 08:02:51 +02:00
Falko Victor Habel 1ebe2e1212 updated translation 2024-08-29 11:48:58 +02:00
Falko Victor Habel 24b6a2a8a7 translation for model added 2024-08-29 11:11:59 +02:00
Falko Victor Habel 37d60e436f Merge pull request 'add webTextExtactor + projekt structure' (#10) from develop into main
Reviewed-on: #10
2024-08-29 08:46:31 +00:00
8 changed files with 119933 additions and 0 deletions

43
src/model/Inference.py Normal file
View File

@ -0,0 +1,43 @@
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import pyarrow.parquet as pq
import torch
class FakeNewsInference:
    """Run single-text predictions with a saved BERT sequence classifier.

    Both the tokenizer and the model are loaded from ``model_path``. The model
    is placed on CUDA when it is available (CPU otherwise) and switched to
    evaluation mode.
    """

    def __init__(self, model_path):
        """Load tokenizer and model from ``model_path`` and prepare the model for inference."""
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)
        self.model.to(self.device)
        self.model.eval()

    def predict(self, text):
        """Classify ``text``.

        The input is tokenized with truncation to at most 512 tokens.

        Returns:
            A ``(class_index, probability)`` tuple: the index of the highest
            scoring class and the softmax probability assigned to it.
        """
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        )
        # Every tensor has to live on the same device as the model.
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            logits = self.model(**encoded).logits

        class_probs = torch.softmax(logits, dim=1)
        best_class = torch.argmax(class_probs, dim=1).item()
        return best_class, class_probs[0][best_class].item()
if __name__ == '__main__':
    # Demo: classify one German sample article with the saved model.
    classifier = FakeNewsInference('src/model/VeriMind')

    # Example from Postillion
    article = """München, Eindhoven (dpo) - Es ist ein wahrer Transfer-Hammer kurz vor Ablauf der offiziellen Frist: Der FC Bayern München verkauft seinen Ehrenpräsidenten Uli Hoeneß für 720.000 Euro an die PSV Eindhoven.
"Der Vertrag ist unterzeichnet", bestätigte ein sichtlich erleichterter Bayern-Sportvorstand Max Eberl. "Da gibt's nichts mehr dran zu rütteln. Das ist fix. Wir danken Uli für die vielen Jahre der Loyalität und Treue und wünschen ihm einen super Start in den Niederlanden."
In Eindhoven soll Hoeneß wie bereits bei den Münchnern das Amt des Ehrenpräsidenten übernehmen, der an allen Entscheidungen rumnörgelt, den Trainer vor der Boulevardpresse in die Pfanne haut und mit Fans streitet.
Die Ablösesumme in Höhe von 720.000 Euro gilt als relativ gering für einen verdienten Fußballfunktionär wie Uli Hoeneß. Insidern zufolge sollen die Münchner der PSV finanziell stark entgegengekommen sein, damit der Deal auch ganz sicher zustande kommt.
Auch der Zeitpunkt des Wechsels kurz vor Transferschluss gilt als ungewöhnlich, könnte aber das Ziel gehabt haben, Hoeneß zu überrumpeln und sicherzustellen, dass er auf keinen Fall wieder zurückwechselt.
Angeblich befindet sich der Ex-Bayern-Präsident bereits auf dem Weg nach Eindhoven, wo er nach einem kurzen Medizincheck schon ab Montag zum ersten Mal das Training besuchen und dort Spieler und Trainer anmaulen könnte vorerst noch mithilfe eines Dolmetschers, bis er die nötigen niederländischen Schimpfwörter gelernt hat.
"""

    label_id, score = classifier.predict(article)
    # Class index 1 is reported as "Real"; any other index as "Fake".
    verdict = 'Real' if label_id == 1 else 'Fake'
    print(f"Prediction: {verdict}")
    print(f"Confidence: {score:.4f}")

View File

@ -0,0 +1,32 @@
{
"_name_or_path": "google-bert/bert-base-multilingual-cased",
"architectures": [
"BertForSequenceClassification"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"problem_type": "single_label_classification",
"torch_dtype": "float32",
"transformers_version": "4.44.0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 119547
}

Binary file not shown.

View File

@ -0,0 +1,7 @@
{
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"unk_token": "[UNK]"
}

View File

@ -0,0 +1,57 @@
{
"added_tokens_decoder": {
"0": {
"content": "[PAD]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"100": {
"content": "[UNK]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"101": {
"content": "[CLS]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"102": {
"content": "[SEP]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"103": {
"content": "[MASK]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"clean_up_tokenization_spaces": true,
"cls_token": "[CLS]",
"do_basic_tokenize": true,
"do_lower_case": false,
"mask_token": "[MASK]",
"model_max_length": 512,
"never_split": null,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}

119547
src/model/VeriMind/vocab.txt Normal file

File diff suppressed because it is too large Load Diff

182
src/model/train.py Normal file
View File

@ -0,0 +1,182 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
import torch
from tqdm import tqdm
import pyarrow.parquet as pq
from sklearn.metrics import classification_report, confusion_matrix
class FakeNewsModelTrainer:
    """Train a size-reduced BERT sequence classifier with two labels.

    The tokenizer and the reference configuration come from ``model_name``.
    The classifier itself is built from a *reduced* configuration (fewer
    heads, smaller hidden size, fewer layers), so no pretrained weights are
    loaded into it. Reports label index 0 as "Fake" and index 1 as "Real".
    """

    def __init__(self, model_name='google-bert/bert-base-multilingual-cased', max_length=512, size_factor=0.5):
        """Create the tokenizer and the reduced model.

        Args:
            model_name: Checkpoint whose tokenizer and config are used.
            max_length: Maximum token length used when encoding texts.
            size_factor: Target size relative to the reference model; each
                dimension is scaled by ``size_factor ** 0.5``.
        """
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Only the configuration of the reference checkpoint is needed. The
        # full pretrained model used to be loaded here as well, but it was
        # immediately replaced by the reduced model below, so that load was
        # pure overhead and has been dropped.
        original_config = BertConfig.from_pretrained(model_name)
        num_heads, hidden_size, num_layers = self._scaled_dimensions(
            original_config.num_attention_heads,
            original_config.hidden_size,
            original_config.num_hidden_layers,
            size_factor,
        )

        config = BertConfig(
            vocab_size=original_config.vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_layers,
            num_attention_heads=num_heads,
            intermediate_size=hidden_size * 4,
            max_position_embeddings=original_config.max_position_embeddings,
            num_labels=2,
        )

        self.model = BertForSequenceClassification(config)
        # Move the model that is actually used to the device, so evaluate()
        # also works when it is called before train().
        self.model.to(self.device)

    @staticmethod
    def _scaled_dimensions(num_attention_heads, hidden_size, num_hidden_layers, size_factor):
        """Return ``(heads, hidden_size, layers)`` scaled by ``size_factor ** 0.5``.

        The hidden size is rebuilt as ``heads * per_head_size`` so it stays
        divisible by the number of heads. Every value is at least 1.
        """
        scale = size_factor ** 0.5
        heads = max(int(num_attention_heads * scale), 1)
        per_head_size = max(int((hidden_size // num_attention_heads) * scale), 1)
        layers = max(int(num_hidden_layers * scale), 1)
        return heads, heads * per_head_size, layers

    @staticmethod
    def _text_or_empty(value):
        """Return ``value`` as text, mapping None/NaN/empty values to ''."""
        if pd.isna(value) or not value:
            return ''
        return str(value)

    def prepare_data(self, df):
        """Tokenize a DataFrame and build a class-balancing sampler.

        Args:
            df: DataFrame with ``title``, ``text`` and ``label`` columns.

        Returns:
            ``(TensorDataset(input_ids, attention_mask, labels), sampler)``
            where ``sampler`` is a ``WeightedRandomSampler`` that weights
            every sample by the inverse frequency of its class.

        Raises:
            ValueError: If no row has a non-empty title or text.
        """
        valid_texts = []
        valid_labels = []
        rows = zip(df['title'].tolist(), df['text'].tolist(), df['label'].tolist())
        for title, body, label in rows:
            # Missing values (None/NaN) must not end up as the literal "nan".
            text = f"{self._text_or_empty(title)} {self._text_or_empty(body)}".strip()
            if text:
                valid_texts.append(text)
                valid_labels.append(label)

        if not valid_texts:
            raise ValueError("No valid texts found in the dataset after filtering empty entries.")

        encoded_texts = self.tokenizer(
            valid_texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        labels = torch.tensor(valid_labels)

        # Inverse-frequency weights. `class_index` maps every sample to the
        # position of its class in the sorted unique labels, so this works
        # even if a class is missing or labels are not 0..k-1.
        _, class_index, class_counts = np.unique(
            np.asarray(valid_labels), return_inverse=True, return_counts=True
        )
        samples_weight = torch.from_numpy((1.0 / class_counts)[class_index.reshape(-1)]).double()
        sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

        dataset = TensorDataset(encoded_texts['input_ids'], encoded_texts['attention_mask'], labels)
        return dataset, sampler

    def train(self, train_data, val_data, epochs=13, batch_size=16):
        """Train the model and print validation metrics after every epoch.

        Args:
            train_data: ``(dataset, sampler)`` tuple as returned by
                ``prepare_data``.
            val_data: Dataset used for validation (no sampler).
            epochs: Number of passes over the training data.
            batch_size: Batch size for both data loaders.

        Raises:
            ValueError: If the training data yields no batches.
        """
        train_dataset, train_sampler = train_data
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
        val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

        num_batches = len(train_dataloader)
        if num_batches == 0:
            raise ValueError("Training data is empty; nothing to train on.")

        # NOTE(review): the AdamW exported by transformers is reportedly
        # deprecated upstream in favour of torch.optim.AdamW - confirm before
        # upgrading transformers. Kept to preserve the optimizer behaviour.
        optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
        total_steps = num_batches * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        self.model.to(self.device)  # Ensure model is on the correct device

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
                self.model.zero_grad()
                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                # Clip the gradient norm to 1.0 before the optimizer step.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

            avg_train_loss = total_loss / num_batches
            print(f'Average training loss: {avg_train_loss:.4f}')

            val_accuracy, val_report = self.evaluate(val_dataloader)
            print(f'Validation accuracy: {val_accuracy:.4f}')
            print('Validation Classification Report:')
            print(val_report)

    def evaluate(self, dataloader):
        """Evaluate the model on ``dataloader``.

        Prints the confusion matrix.

        Returns:
            ``(accuracy, report)`` where ``report`` is the text produced by
            ``classification_report``.

        Raises:
            ValueError: If ``dataloader`` yields no samples.
        """
        self.model.eval()
        predictions = []
        true_labels = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
                outputs = self.model(input_ids, attention_mask=attention_mask)
                _, preds = torch.max(outputs.logits, dim=1)
                predictions.extend(preds.cpu().tolist())
                true_labels.extend(labels.cpu().tolist())

        if not true_labels:
            raise ValueError("Cannot evaluate: the dataloader yielded no samples.")

        accuracy = sum(1 for p, t in zip(predictions, true_labels) if p == t) / len(true_labels)
        # Pin the label set so the report and matrix keep both classes even
        # when only one of them occurs in this split.
        report = classification_report(true_labels, predictions, labels=[0, 1], target_names=['Fake', 'Real'])
        print('Confusion Matrix:')
        print(confusion_matrix(true_labels, predictions, labels=[0, 1]))
        return accuracy, report

    def save_model(self, path):
        """Save the model and the tokenizer to ``path``."""
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)
class FakeNewsInference:
    """Wrap a saved BERT classifier and its tokenizer for single-text prediction."""

    def __init__(self, model_path):
        """Load tokenizer and model from ``model_path``.

        The model is moved to CUDA if available, otherwise to the CPU, and
        put into evaluation mode.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model.to(self.device)
        self.model.eval()

    def predict(self, text):
        """Return ``(predicted_class_index, softmax_probability)`` for ``text``.

        The text is truncated to at most 512 tokens before it is scored.
        """
        tokenizer_options = dict(return_tensors='pt', truncation=True, padding=True, max_length=512)
        model_inputs = {}
        for key, value in self.tokenizer(text, **tokenizer_options).items():
            # Inputs must be on the same device as the model.
            model_inputs[key] = value.to(self.device)

        with torch.no_grad():
            result = self.model(**model_inputs)

        probs = torch.softmax(result.logits, dim=1)
        winner = torch.argmax(probs, dim=1).item()
        winner_prob = probs[0][winner].item()
        return winner, winner_prob
# Usage example
if __name__ == '__main__':
    # One directory for both saving and loading. The inference example used
    # to load from a different path ('fake_news_detector_model') than the one
    # the model was saved to, so it never ran on the freshly trained model.
    model_dir = 'VeriMind'

    # Load and preprocess the data
    df = pq.read_table('dataset.parquet').to_pandas()

    # Split the data; stratify so both splits keep the label distribution
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

    # Initialize and train the model
    trainer = FakeNewsModelTrainer()
    train_data = trainer.prepare_data(train_df)
    # Validation runs in order, so only the dataset is kept (no sampler)
    val_data = trainer.prepare_data(val_df)[0]
    trainer.train(train_data, val_data)

    # Save the model
    trainer.save_model(model_dir)

    # Inference example, using the model that was just saved
    inference = FakeNewsInference(model_dir)
    sample_texts = [
        "Breaking news: Scientists discover new planet in solar system",
        "Celebrity secretly lizard person, unnamed sources claim",
        "New study shows benefits of regular exercise",
        "Government admits to hiding alien life, whistleblower reveals"
    ]
    for text in sample_texts:
        prediction, confidence = inference.predict(text)
        print(f"Text: {text}")
        print(f"Prediction: {'Real' if prediction == 1 else 'Fake'}")
        print(f"Confidence: {confidence:.4f}\n")

65
src/model/translate.py Normal file
View File

@ -0,0 +1,65 @@
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
# Load the CSV file
file_path = '/root/schule/WELFake_Dataset.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File not found: {file_path}")
    # Raise SystemExit directly: the bare `exit()` helper is injected by the
    # `site` module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1) from None

print("Columns in the DataFrame:", df.columns)

# Ensure the 'Unnamed: 0' column exists (it is used as the serial number later)
if 'Unnamed: 0' not in df.columns:
    print("'Unnamed: 0' column not found. Please check your CSV file.")
    raise SystemExit(1)

# Take a reproducible random sample of up to `sample_size` entries.
# Capping at len(df) avoids a ValueError on datasets smaller than the sample.
sample_size = 5000
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)

# Load the translation model (English -> German, per the model name)
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Function to translate text
def translate(text):
    """Translate ``text`` with the module-level ``tokenizer`` and ``model``.

    Missing (NaN/None) or empty input yields an empty string. The input is
    truncated to at most 512 tokens before generation, so longer texts are
    only partially translated.
    """
    nothing_to_translate = pd.isna(text) or text == ''
    if nothing_to_translate:
        return ''

    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    generated = model.generate(**encoded)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
# Translate the 'title' and then the 'text' column, with a tqdm progress bar
tqdm.pandas()
for source_col in ('title', 'text'):
    df_sample[f'{source_col}_de'] = df_sample[source_col].fillna('').progress_apply(translate)

# Give the translated rows serial numbers above every existing serial number
max_serial = df['Unnamed: 0'].max()
df_sample['Unnamed: 0_de'] = df_sample['Unnamed: 0'] + max_serial + 1

# Build the translated rows: copy the sample, overwrite the original columns
# with their translated/renumbered counterparts, then drop the temporaries
temp_to_final = {
    'Unnamed: 0_de': 'Unnamed: 0',
    'title_de': 'title',
    'text_de': 'text',
}
df_translated = df_sample.copy()
for temp_col, final_col in temp_to_final.items():
    df_translated[final_col] = df_translated[temp_col]
df_translated = df_translated.drop(columns=list(temp_to_final))

# Append the translated rows to the original data and order by serial number
df_combined = (
    pd.concat([df, df_translated], ignore_index=True)
    .sort_values('Unnamed: 0')
    .reset_index(drop=True)
)

# Save as parquet
# NOTE(review): the file name still says "10_samples" although the sample
# size is no longer 10 - kept unchanged; confirm what consumers expect.
df_combined.to_parquet('combined_with_translations_10_samples.parquet', index=False)
print("Translation, combination, and saving completed.")