Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -276,7 +276,7 @@ def validate_item_mix(item):
|
|
276 |
|
277 |
return True
|
278 |
|
279 |
-
def translate_item_orpo(item, raw_file_path, translator, tokenizer):
|
280 |
try:
|
281 |
translated_texts = {} # Cache to store translated texts
|
282 |
|
@@ -321,7 +321,7 @@ def translate_item_orpo(item, raw_file_path, translator, tokenizer):
|
|
321 |
logging.error(f"An error occurred during translation: {e}")
|
322 |
return None
|
323 |
|
324 |
-
def validate_item_orpo(item):
|
325 |
# Check basic required fields
|
326 |
required_fields = ['source', 'prompt', 'chosen', 'rejected']
|
327 |
for field in required_fields:
|
@@ -356,10 +356,10 @@ def process_file(input_file_path, output_file_path, raw_file_path, line_indices,
|
|
356 |
print ("translating a mix-style model...")
|
357 |
validate_item = validate_item_mix
|
358 |
translate_item = translate_item_mix
|
359 |
-
elif model_type == "
|
360 |
-
print ("translating an
|
361 |
-
validate_item = validate_item_orpo
|
362 |
-
translate_item = translate_item_orpo
|
363 |
elif model_type == "ufb":
|
364 |
print ("translating an ultrafeedback-style model...")
|
365 |
validate_item = validate_item_ufb
|
@@ -458,10 +458,12 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
|
|
458 |
|
459 |
# Check if the repository exists
|
460 |
try:
|
|
|
461 |
api.repo_info(repo_id=repo_name, repo_type="dataset", token=token)
|
462 |
except Exception as e:
|
463 |
if "404" in str(e):
|
464 |
# Create the repository if it doesn't exist
|
|
|
465 |
create_repo(repo_id=repo_name, repo_type="dataset", token=token)
|
466 |
print(f"Created repository: {repo_name}")
|
467 |
else:
|
@@ -470,6 +472,7 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
|
|
470 |
|
471 |
# Upload the file to the repository
|
472 |
try:
|
|
|
473 |
upload_file(
|
474 |
path_or_fileobj=output_file_path,
|
475 |
path_in_repo=output_file_path,
|
@@ -619,14 +622,12 @@ datasets_desc = """## 📊 Dataset Types:
|
|
619 |
- `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
|
620 |
- `chosen`: Single dictionary with 'content' and 'role' fields.
|
621 |
- `rejected`: Single dictionary with 'content' and 'role' fields.
|
622 |
-
- **
|
623 |
- `prompt`: String (user input).
|
624 |
- `chosen`: List of dictionaries with 'content' and 'role' fields.
|
625 |
- `rejected`: List of dictionaries with 'content' and 'role' fields.
|
626 |
- **ufb**:
|
627 |
-
-
|
628 |
-
- `chosen`: List of dictionaries with 'content' and 'role' fields.
|
629 |
-
- `rejected`: List of dictionaries with 'content' and 'role' fields.
|
630 |
## 🛠️ Backend:
|
631 |
The translation backend runs on the Hugging Face Hub API."""
|
632 |
|
@@ -646,8 +647,8 @@ with gr.Blocks(theme=theme) as demo:
|
|
646 |
with gr.Row(equal_height=False):
|
647 |
with gr.Column():
|
648 |
dataset_url = gr.Textbox(label="Input Dataset URL", lines=2, placeholder = "https://huggingface.co/datasets/alvarobartt/dpo-mix-7k-simplified/resolve/main/data/train-00000-of-00001.parquet?download=true")
|
649 |
-
model_type = gr.Dropdown(choices=["mix", "
|
650 |
-
output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "
|
651 |
range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100")
|
652 |
|
653 |
with gr.Column():
|
|
|
276 |
|
277 |
return True
|
278 |
|
279 |
+
def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
|
280 |
try:
|
281 |
translated_texts = {} # Cache to store translated texts
|
282 |
|
|
|
321 |
logging.error(f"An error occurred during translation: {e}")
|
322 |
return None
|
323 |
|
324 |
+
def validate_item_ufb_cached(item):
|
325 |
# Check basic required fields
|
326 |
required_fields = ['source', 'prompt', 'chosen', 'rejected']
|
327 |
for field in required_fields:
|
|
|
356 |
print ("translating a mix-style model...")
|
357 |
validate_item = validate_item_mix
|
358 |
translate_item = translate_item_mix
|
359 |
+
elif model_type == "ufb_cached":
|
360 |
+
print ("translating an ufb_cached-style model...")
|
361 |
+
validate_item = validate_item_ufb_cached
|
362 |
+
translate_item = translate_item_ufb_cached # def translate_item_ufb(item, raw_file_path, translator, tokenizer):
|
363 |
elif model_type == "ufb":
|
364 |
print ("translating an ultrafeedback-style model...")
|
365 |
validate_item = validate_item_ufb
|
|
|
458 |
|
459 |
# Check if the repository exists
|
460 |
try:
|
461 |
+
print ("checking repo:", repo_name)
|
462 |
api.repo_info(repo_id=repo_name, repo_type="dataset", token=token)
|
463 |
except Exception as e:
|
464 |
if "404" in str(e):
|
465 |
# Create the repository if it doesn't exist
|
466 |
+
print ("creating it...")
|
467 |
create_repo(repo_id=repo_name, repo_type="dataset", token=token)
|
468 |
print(f"Created repository: {repo_name}")
|
469 |
else:
|
|
|
472 |
|
473 |
# Upload the file to the repository
|
474 |
try:
|
475 |
+
print ("starting dataset upload from:", output_file_path)
|
476 |
upload_file(
|
477 |
path_or_fileobj=output_file_path,
|
478 |
path_in_repo=output_file_path,
|
|
|
622 |
- `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
|
623 |
- `chosen`: Single dictionary with 'content' and 'role' fields.
|
624 |
- `rejected`: Single dictionary with 'content' and 'role' fields.
|
625 |
+
- **ufb_cached**:
|
626 |
- `prompt`: String (user input).
|
627 |
- `chosen`: List of dictionaries with 'content' and 'role' fields.
|
628 |
- `rejected`: List of dictionaries with 'content' and 'role' fields.
|
629 |
- **ufb**:
|
630 |
+
- like ufb_cached, but we do not check for already translated strings
|
|
|
|
|
631 |
## 🛠️ Backend:
|
632 |
The translation backend runs on the Hugging Face Hub API."""
|
633 |
|
|
|
647 |
with gr.Row(equal_height=False):
|
648 |
with gr.Column():
|
649 |
dataset_url = gr.Textbox(label="Input Dataset URL", lines=2, placeholder = "https://huggingface.co/datasets/alvarobartt/dpo-mix-7k-simplified/resolve/main/data/train-00000-of-00001.parquet?download=true")
|
650 |
+
model_type = gr.Dropdown(choices=["mix", "ufb_cached", "ufb"], label="Dataset Type")
|
651 |
+
output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "cstr/translated_datasets")
|
652 |
range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100")
|
653 |
|
654 |
with gr.Column():
|