Spaces:

cstr
/

translate_datasets

Sleeping

App Files Community

cstr committed on May 17, 2024

Commit

964e0c7

verified ·

1 Parent(s): 3233c26

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -12

app.py CHANGED Viewed

@@ -276,7 +276,7 @@ def validate_item_mix(item):
     return True
-def translate_item_orpo(item, raw_file_path, translator, tokenizer):
     try:
         translated_texts = {}  # Cache to store translated texts
@@ -321,7 +321,7 @@ def translate_item_orpo(item, raw_file_path, translator, tokenizer):
         logging.error(f"An error occurred during translation: {e}")
         return None
-def validate_item_orpo(item):
     # Check basic required fields
     required_fields = ['source', 'prompt', 'chosen', 'rejected']
     for field in required_fields:
@@ -356,10 +356,10 @@ def process_file(input_file_path, output_file_path, raw_file_path, line_indices,
             print ("translating a mix-style model...")
             validate_item = validate_item_mix
             translate_item = translate_item_mix
-        elif model_type == "orpo":
-            print ("translating an orpo-style model...")
-            validate_item = validate_item_orpo
-            translate_item = translate_item_orpo # def translate_item_ufb(item, raw_file_path, translator, tokenizer):
         elif model_type == "ufb":
             print ("translating an ultrafeedback-style model...")
             validate_item = validate_item_ufb
@@ -458,10 +458,12 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
     # Check if the repository exists
     try:
         api.repo_info(repo_id=repo_name, repo_type="dataset", token=token)
     except Exception as e:
         if "404" in str(e):
             # Create the repository if it doesn't exist
             create_repo(repo_id=repo_name, repo_type="dataset", token=token)
             print(f"Created repository: {repo_name}")
         else:
@@ -470,6 +472,7 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
     # Upload the file to the repository
     try:
         upload_file(
             path_or_fileobj=output_file_path,
             path_in_repo=output_file_path,
@@ -619,14 +622,12 @@ datasets_desc = """## 📊 Dataset Types:
   - `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
   - `chosen`: Single dictionary with 'content' and 'role' fields.
   - `rejected`: Single dictionary with 'content' and 'role' fields.
-- **orpo**:
   - `prompt`: String (user input).
   - `chosen`: List of dictionaries with 'content' and 'role' fields.
   - `rejected`: List of dictionaries with 'content' and 'role' fields.
 - **ufb**:
-  - `prompt`: String (user input).
-  - `chosen`: List of dictionaries with 'content' and 'role' fields.
-  - `rejected`: List of dictionaries with 'content' and 'role' fields.
 ## 🛠️ Backend:
 The translation backend runs on the Hugging Face Hub API."""
@@ -646,8 +647,8 @@ with gr.Blocks(theme=theme) as demo:
     with gr.Row(equal_height=False):
         with gr.Column():
             dataset_url = gr.Textbox(label="Input Dataset URL", lines=2, placeholder = "https://huggingface.co/datasets/alvarobartt/dpo-mix-7k-simplified/resolve/main/data/train-00000-of-00001.parquet?download=true")
-            model_type = gr.Dropdown(choices=["mix", "orpo", "ufb"], label="Dataset Type")
-            output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "dataset_test_translations")
             range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100")
         with gr.Column():

     return True
+def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
     try:
         translated_texts = {}  # Cache to store translated texts
         logging.error(f"An error occurred during translation: {e}")
         return None
+def validate_item_ufb_cached(item):
     # Check basic required fields
     required_fields = ['source', 'prompt', 'chosen', 'rejected']
     for field in required_fields:
             print ("translating a mix-style model...")
             validate_item = validate_item_mix
             translate_item = translate_item_mix
+        elif model_type == "ufb_cached":
+            print ("translating an ufb_cached-style model...")
+            validate_item = validate_item_ufb_cached
+            translate_item = translate_item_ufb_cached # def translate_item_ufb(item, raw_file_path, translator, tokenizer):
         elif model_type == "ufb":
             print ("translating an ultrafeedback-style model...")
             validate_item = validate_item_ufb
     # Check if the repository exists
     try:
+        print ("checking repo:", repo_name)
         api.repo_info(repo_id=repo_name, repo_type="dataset", token=token)
     except Exception as e:
         if "404" in str(e):
             # Create the repository if it doesn't exist
+            print ("creating it...")
             create_repo(repo_id=repo_name, repo_type="dataset", token=token)
             print(f"Created repository: {repo_name}")
         else:
     # Upload the file to the repository
     try:
+        print ("starting dataset upload from:", output_file_path)
         upload_file(
             path_or_fileobj=output_file_path,
             path_in_repo=output_file_path,
   - `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
   - `chosen`: Single dictionary with 'content' and 'role' fields.
   - `rejected`: Single dictionary with 'content' and 'role' fields.
+- **ufb_cached**:
   - `prompt`: String (user input).
   - `chosen`: List of dictionaries with 'content' and 'role' fields.
   - `rejected`: List of dictionaries with 'content' and 'role' fields.
 - **ufb**:
+  - like ufb_cached, but we do not check for already translated strings
 ## 🛠️ Backend:
 The translation backend runs on the Hugging Face Hub API."""
     with gr.Row(equal_height=False):
         with gr.Column():
             dataset_url = gr.Textbox(label="Input Dataset URL", lines=2, placeholder = "https://huggingface.co/datasets/alvarobartt/dpo-mix-7k-simplified/resolve/main/data/train-00000-of-00001.parquet?download=true")
+            model_type = gr.Dropdown(choices=["mix", "ufb_cached", "ufb"], label="Dataset Type")
+            output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "cstr/translated_datasets")
             range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100")
         with gr.Column():