Use the 'Unnamed: 0' column instead of '#' as the serial-number column in the translator
This commit is contained in:
parent
55a50276fa
commit
dc3e8ba73c
|
@ -13,9 +13,9 @@ except FileNotFoundError:
|
||||||
|
|
||||||
print("Columns in the DataFrame:", df.columns)
|
print("Columns in the DataFrame:", df.columns)
|
||||||
|
|
||||||
# Ensure the '#' column exists
|
# Ensure the 'Unnamed: 0' column exists
|
||||||
if '#' not in df.columns:
|
if 'Unnamed: 0' not in df.columns:
|
||||||
print("'#' column not found. Please check your CSV file.")
|
print("'Unnamed: 0' column not found. Please check your CSV file.")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
# Take a sample of 10 entries
|
# Take a sample of 10 entries
|
||||||
|
@ -41,23 +41,23 @@ df_sample['title_de'] = df_sample['title'].fillna('').progress_apply(translate)
|
||||||
df_sample['text_de'] = df_sample['text'].fillna('').progress_apply(translate)
|
df_sample['text_de'] = df_sample['text'].fillna('').progress_apply(translate)
|
||||||
|
|
||||||
# Calculate the new serial numbers
|
# Calculate the new serial numbers
|
||||||
max_serial = df['#'].max()
|
max_serial = df['Unnamed: 0'].max()
|
||||||
df_sample['#_de'] = df_sample['#'].apply(lambda x: x + max_serial + 1)
|
df_sample['Unnamed: 0_de'] = df_sample['Unnamed: 0'].apply(lambda x: x + max_serial + 1)
|
||||||
|
|
||||||
# Create new rows with translated content
|
# Create new rows with translated content
|
||||||
df_translated = df_sample.copy()
|
df_translated = df_sample.copy()
|
||||||
df_translated['#'] = df_translated['#_de']
|
df_translated['Unnamed: 0'] = df_translated['Unnamed: 0_de']
|
||||||
df_translated['title'] = df_translated['title_de']
|
df_translated['title'] = df_translated['title_de']
|
||||||
df_translated['text'] = df_translated['text_de']
|
df_translated['text'] = df_translated['text_de']
|
||||||
|
|
||||||
# Drop the temporary columns
|
# Drop the temporary columns
|
||||||
df_translated = df_translated.drop(['#_de', 'title_de', 'text_de'], axis=1)
|
df_translated = df_translated.drop(['Unnamed: 0_de', 'title_de', 'text_de'], axis=1)
|
||||||
|
|
||||||
# Combine original and translated DataFrames
|
# Combine original and translated DataFrames
|
||||||
df_combined = pd.concat([df, df_translated], ignore_index=True)
|
df_combined = pd.concat([df, df_translated], ignore_index=True)
|
||||||
|
|
||||||
# Sort by '#' (serial) number
|
# Sort by 'Unnamed: 0' (serial) number
|
||||||
df_combined = df_combined.sort_values('#').reset_index(drop=True)
|
df_combined = df_combined.sort_values('Unnamed: 0').reset_index(drop=True)
|
||||||
|
|
||||||
# Save as parquet
|
# Save as parquet
|
||||||
df_combined.to_parquet('combined_with_translations_10_samples.parquet', index=False)
|
df_combined.to_parquet('combined_with_translations_10_samples.parquet', index=False)
|
||||||
|
|
Reference in New Issue