Use the 'Unnamed: 0' column instead of '#' as the serial number in the translator

This commit is contained in:
Falko Victor Habel 2024-08-30 08:04:47 +02:00
parent 55a50276fa
commit dc3e8ba73c
1 changed file with 9 additions and 9 deletions

View File

@@ -13,9 +13,9 @@ except FileNotFoundError:
print("Columns in the DataFrame:", df.columns) print("Columns in the DataFrame:", df.columns)
# Ensure the '#' column exists # Ensure the 'Unnamed: 0' column exists
if '#' not in df.columns: if 'Unnamed: 0' not in df.columns:
print("'#' column not found. Please check your CSV file.") print("'Unnamed: 0' column not found. Please check your CSV file.")
exit(1) exit(1)
# Take a sample of 10 entries # Take a sample of 10 entries
@@ -41,23 +41,23 @@ df_sample['title_de'] = df_sample['title'].fillna('').progress_apply(translate)
df_sample['text_de'] = df_sample['text'].fillna('').progress_apply(translate) df_sample['text_de'] = df_sample['text'].fillna('').progress_apply(translate)
# Calculate the new serial numbers # Calculate the new serial numbers
max_serial = df['#'].max() max_serial = df['Unnamed: 0'].max()
df_sample['#_de'] = df_sample['#'].apply(lambda x: x + max_serial + 1) df_sample['Unnamed: 0_de'] = df_sample['Unnamed: 0'].apply(lambda x: x + max_serial + 1)
# Create new rows with translated content # Create new rows with translated content
df_translated = df_sample.copy() df_translated = df_sample.copy()
df_translated['#'] = df_translated['#_de'] df_translated['Unnamed: 0'] = df_translated['Unnamed: 0_de']
df_translated['title'] = df_translated['title_de'] df_translated['title'] = df_translated['title_de']
df_translated['text'] = df_translated['text_de'] df_translated['text'] = df_translated['text_de']
# Drop the temporary columns # Drop the temporary columns
df_translated = df_translated.drop(['#_de', 'title_de', 'text_de'], axis=1) df_translated = df_translated.drop(['Unnamed: 0_de', 'title_de', 'text_de'], axis=1)
# Combine original and translated DataFrames # Combine original and translated DataFrames
df_combined = pd.concat([df, df_translated], ignore_index=True) df_combined = pd.concat([df, df_translated], ignore_index=True)
# Sort by '#' (serial) number # Sort by 'Unnamed: 0' (serial) number
df_combined = df_combined.sort_values('#').reset_index(drop=True) df_combined = df_combined.sort_values('Unnamed: 0').reset_index(drop=True)
# Save as parquet # Save as parquet
df_combined.to_parquet('combined_with_translations_10_samples.parquet', index=False) df_combined.to_parquet('combined_with_translations_10_samples.parquet', index=False)