updated translation

This commit is contained in:
Falko Victor Habel 2024-08-29 11:48:58 +02:00
parent 24b6a2a8a7
commit 1ebe2e1212
1 changed file with 6 additions and 2 deletions

View File

@@ -17,14 +17,18 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Function to translate text # Function to translate text
def translate(text): def translate(text):
if pd.isna(text) or text == '':
return '' # Return an empty string for NaN or empty string inputs
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512) inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
translated = model.generate(**inputs) translated = model.generate(**inputs)
return tokenizer.decode(translated[0], skip_special_tokens=True) return tokenizer.decode(translated[0], skip_special_tokens=True)
# Translate 'text' and 'title' columns # Translate 'text' and 'title' columns
tqdm.pandas() tqdm.pandas()
df_sample['title_de'] = df_sample['title'].progress_apply(translate) df_sample['title_de'] = df_sample['title'].fillna('').progress_apply(translate)
df_sample['text_de'] = df_sample['text'].progress_apply(translate) df_sample['text_de'] = df_sample['text'].fillna('').progress_apply(translate)
# Calculate the new serial numbers # Calculate the new serial numbers
max_serial = df['Serial'].max() max_serial = df['Serial'].max()