diff --git a/src/model/translate.py b/src/model/translate.py
index c1da695..25089cd 100644
--- a/src/model/translate.py
+++ b/src/model/translate.py
@@ -13,9 +13,9 @@ except FileNotFoundError:
 
 print("Columns in the DataFrame:", df.columns)
 
-# Ensure the '#' column exists
-if '#' not in df.columns:
-    print("'#' column not found. Please check your CSV file.")
+# Ensure the 'Unnamed: 0' column exists
+if 'Unnamed: 0' not in df.columns:
+    print("'Unnamed: 0' column not found. Please check your CSV file.")
     exit(1)
 
 # Take a sample of 10 entries
@@ -41,23 +41,23 @@ df_sample['title_de'] = df_sample['title'].fillna('').progress_apply(translate)
 df_sample['text_de'] = df_sample['text'].fillna('').progress_apply(translate)
 
 # Calculate the new serial numbers
-max_serial = df['#'].max()
-df_sample['#_de'] = df_sample['#'].apply(lambda x: x + max_serial + 1)
+max_serial = df['Unnamed: 0'].max()
+df_sample['Unnamed: 0_de'] = df_sample['Unnamed: 0'].apply(lambda x: x + max_serial + 1)
 
 # Create new rows with translated content
 df_translated = df_sample.copy()
-df_translated['#'] = df_translated['#_de']
+df_translated['Unnamed: 0'] = df_translated['Unnamed: 0_de']
 df_translated['title'] = df_translated['title_de']
 df_translated['text'] = df_translated['text_de']
 
 # Drop the temporary columns
-df_translated = df_translated.drop(['#_de', 'title_de', 'text_de'], axis=1)
+df_translated = df_translated.drop(['Unnamed: 0_de', 'title_de', 'text_de'], axis=1)
 
 # Combine original and translated DataFrames
 df_combined = pd.concat([df, df_translated], ignore_index=True)
 
-# Sort by '#' (serial) number
-df_combined = df_combined.sort_values('#').reset_index(drop=True)
+# Sort by 'Unnamed: 0' (serial) number
+df_combined = df_combined.sort_values('Unnamed: 0').reset_index(drop=True)
 
 # Save as parquet
 df_combined.to_parquet('combined_with_translations_10_samples.parquet', index=False)