Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -89,7 +89,7 @@ def parse_range_specification(range_specification, file_length):
|
|
89 |
line_indices.append(single_line)
|
90 |
return line_indices
|
91 |
|
92 |
-
def translate_text(text, translator, tokenizer):
|
93 |
"""
|
94 |
Translates the given text from English to German using CTranslate2 and the WMT21 model,
|
95 |
with special handling for newlines and segmenting text longer than 500 characters.
|
@@ -131,7 +131,7 @@ def translate_text(text, translator, tokenizer):
|
|
131 |
translated_segments = []
|
132 |
for segment in segments:
|
133 |
source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
|
134 |
-
target_prefix = [tokenizer.lang_code_to_token["de"]]
|
135 |
results = translator.translate_batch([source], target_prefix=[target_prefix])
|
136 |
target = results[0].hypotheses[0][1:]
|
137 |
translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
|
@@ -150,7 +150,7 @@ def translate_text(text, translator, tokenizer):
|
|
150 |
logging.error(f"An error occurred during translation: {e}")
|
151 |
return None
|
152 |
|
153 |
-
def translate_item_ufb(item, raw_file_path, translator, tokenizer):
|
154 |
try:
|
155 |
# Translate the prompt directly since it's a string
|
156 |
translated_prompt = translate_text(item['prompt'], translator, tokenizer)
|
@@ -158,12 +158,12 @@ def translate_item_ufb(item, raw_file_path, translator, tokenizer):
|
|
158 |
# Translate the chosen and rejected contents
|
159 |
translated_chosen = []
|
160 |
for choice in item['chosen']:
|
161 |
-
translated_content = translate_text(choice['content'], translator, tokenizer)
|
162 |
translated_chosen.append({'content': translated_content, 'role': choice['role']})
|
163 |
|
164 |
translated_rejected = []
|
165 |
for choice in item['rejected']:
|
166 |
-
translated_content = translate_text(choice['content'], translator, tokenizer)
|
167 |
translated_rejected.append({'content': translated_content, 'role': choice['role']})
|
168 |
|
169 |
# Write the raw response to a backup file
|
@@ -211,7 +211,7 @@ def validate_item_ufb(item):
|
|
211 |
|
212 |
|
213 |
|
214 |
-
def translate_item_mix(item, raw_file_path, translator, tokenizer):
|
215 |
"""
|
216 |
Translates the relevant fields in the given item from English to German using CTranslate2 and the WMT21 model,
|
217 |
and saves the raw response to a backup file.
|
@@ -221,12 +221,12 @@ def translate_item_mix(item, raw_file_path, translator, tokenizer):
|
|
221 |
# Translate each part of the prompt separately and preserve the order
|
222 |
translated_prompts = []
|
223 |
for message in item['prompt']:
|
224 |
-
translated_content = translate_text(message['content'], translator, tokenizer)
|
225 |
translated_prompts.append({'content': translated_content, 'role': message['role']})
|
226 |
|
227 |
# Translate the chosen and rejected contents
|
228 |
-
translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer)
|
229 |
-
translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer)
|
230 |
|
231 |
# Write the raw response to a backup file
|
232 |
with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
|
@@ -276,13 +276,13 @@ def validate_item_mix(item):
|
|
276 |
|
277 |
return True
|
278 |
|
279 |
-
def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
|
280 |
try:
|
281 |
translated_texts = {} # Cache to store translated texts
|
282 |
|
283 |
# Translate the prompt if necessary (which is a user input and can appear again)
|
284 |
if item['prompt'] not in translated_texts:
|
285 |
-
translated_prompt = translate_text(item['prompt'], translator, tokenizer)
|
286 |
translated_texts[item['prompt']] = translated_prompt
|
287 |
else:
|
288 |
translated_prompt = translated_texts[item['prompt']]
|
@@ -290,7 +290,7 @@ def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
|
|
290 |
# Helper function to handle content translation with caching
|
291 |
def get_translated_content(content):
|
292 |
if content not in translated_texts:
|
293 |
-
translated_texts[content] = translate_text(content, translator, tokenizer)
|
294 |
return translated_texts[content]
|
295 |
|
296 |
# Process translations for chosen and rejected sections
|
@@ -349,7 +349,7 @@ def validate_item_ufb_cached(item):
|
|
349 |
|
350 |
return True
|
351 |
|
352 |
-
def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type):
|
353 |
try:
|
354 |
# Assigning validation and translation functions based on model_type
|
355 |
if model_type == "mix":
|
@@ -387,7 +387,7 @@ def process_file(input_file_path, output_file_path, raw_file_path, line_indices,
|
|
387 |
retry_count = 0
|
388 |
while translated_item is None and retry_count < 3:
|
389 |
print ("going to translate the item...")
|
390 |
-
translated_item = translate_item(item, raw_file_path, translator, tokenizer)
|
391 |
retry_count += 1
|
392 |
if translated_item is None:
|
393 |
logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
|
@@ -485,7 +485,7 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
|
|
485 |
print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
|
486 |
raise
|
487 |
|
488 |
-
def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer):
|
489 |
try:
|
490 |
# Download the Parquet file
|
491 |
download_parquet(train_url, local_parquet_path)
|
@@ -527,7 +527,7 @@ def translate_dataset(train_url, local_parquet_path, input_file_path, output_fil
|
|
527 |
|
528 |
try:
|
529 |
# Process the file with specified model type and line indices
|
530 |
-
process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type)
|
531 |
except Exception as e:
|
532 |
logging.error(f"Failed to process the file {input_file_path}: {e}")
|
533 |
return
|
|
|
89 |
line_indices.append(single_line)
|
90 |
return line_indices
|
91 |
|
92 |
+
def translate_text(text, translator, tokenizer, target_language):
|
93 |
"""
|
94 |
Translates the given text from English to the specified target language using CTranslate2 and the WMT21 model,
|
95 |
with special handling for newlines and segmenting text longer than 500 characters.
|
|
|
131 |
translated_segments = []
|
132 |
for segment in segments:
|
133 |
source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
|
134 |
+
target_prefix = [tokenizer.lang_code_to_token[target_language]]
|
135 |
results = translator.translate_batch([source], target_prefix=[target_prefix])
|
136 |
target = results[0].hypotheses[0][1:]
|
137 |
translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
|
|
|
150 |
logging.error(f"An error occurred during translation: {e}")
|
151 |
return None
|
152 |
|
153 |
+
def translate_item_ufb(item, raw_file_path, translator, tokenizer, target_language):
|
154 |
try:
|
155 |
# Translate the prompt directly since it's a string
|
156 |
translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language)
|
|
|
158 |
# Translate the chosen and rejected contents
|
159 |
translated_chosen = []
|
160 |
for choice in item['chosen']:
|
161 |
+
translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
|
162 |
translated_chosen.append({'content': translated_content, 'role': choice['role']})
|
163 |
|
164 |
translated_rejected = []
|
165 |
for choice in item['rejected']:
|
166 |
+
translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
|
167 |
translated_rejected.append({'content': translated_content, 'role': choice['role']})
|
168 |
|
169 |
# Write the raw response to a backup file
|
|
|
211 |
|
212 |
|
213 |
|
214 |
+
def translate_item_mix(item, raw_file_path, translator, tokenizer, target_language):
|
215 |
"""
|
216 |
Translates the relevant fields in the given item from English to the specified target language using CTranslate2 and the WMT21 model,
|
217 |
and saves the raw response to a backup file.
|
|
|
221 |
# Translate each part of the prompt separately and preserve the order
|
222 |
translated_prompts = []
|
223 |
for message in item['prompt']:
|
224 |
+
translated_content = translate_text(message['content'], translator, tokenizer, target_language)
|
225 |
translated_prompts.append({'content': translated_content, 'role': message['role']})
|
226 |
|
227 |
# Translate the chosen and rejected contents
|
228 |
+
translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer, target_language)
|
229 |
+
translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer, target_language)
|
230 |
|
231 |
# Write the raw response to a backup file
|
232 |
with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
|
|
|
276 |
|
277 |
return True
|
278 |
|
279 |
+
def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer, target_language):
|
280 |
try:
|
281 |
translated_texts = {} # Cache to store translated texts
|
282 |
|
283 |
# Translate the prompt if necessary (which is a user input and can appear again)
|
284 |
if item['prompt'] not in translated_texts:
|
285 |
+
translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language)
|
286 |
translated_texts[item['prompt']] = translated_prompt
|
287 |
else:
|
288 |
translated_prompt = translated_texts[item['prompt']]
|
|
|
290 |
# Helper function to handle content translation with caching
|
291 |
def get_translated_content(content):
|
292 |
if content not in translated_texts:
|
293 |
+
translated_texts[content] = translate_text(content, translator, tokenizer, target_language)
|
294 |
return translated_texts[content]
|
295 |
|
296 |
# Process translations for chosen and rejected sections
|
|
|
349 |
|
350 |
return True
|
351 |
|
352 |
+
def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language):
|
353 |
try:
|
354 |
# Assigning validation and translation functions based on model_type
|
355 |
if model_type == "mix":
|
|
|
387 |
retry_count = 0
|
388 |
while translated_item is None and retry_count < 3:
|
389 |
print ("going to translate the item...")
|
390 |
+
translated_item = translate_item(item, raw_file_path, translator, tokenizer, target_language)
|
391 |
retry_count += 1
|
392 |
if translated_item is None:
|
393 |
logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
|
|
|
485 |
print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
|
486 |
raise
|
487 |
|
488 |
+
def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer, target_language):
|
489 |
try:
|
490 |
# Download the Parquet file
|
491 |
download_parquet(train_url, local_parquet_path)
|
|
|
527 |
|
528 |
try:
|
529 |
# Process the file with specified model type and line indices
|
530 |
+
process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language)
|
531 |
except Exception as e:
|
532 |
logging.error(f"Failed to process the file {input_file_path}: {e}")
|
533 |
return
|