Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -89,7 +89,7 @@ def parse_range_specification(range_specification, file_length):
|
|
89 |
line_indices.append(single_line)
|
90 |
return line_indices
|
91 |
|
92 |
-
def translate_text(text, translator, tokenizer):
|
93 |
"""
|
94 |
Translates the given text from English to German using CTranslate2 and the WMT21 model,
|
95 |
with special handling for newlines and segmenting text longer than 500 characters.
|
@@ -131,7 +131,7 @@ def translate_text(text, translator, tokenizer):
|
|
131 |
translated_segments = []
|
132 |
for segment in segments:
|
133 |
source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
|
134 |
-
target_prefix = [tokenizer.lang_code_to_token["de"]]
|
135 |
results = translator.translate_batch([source], target_prefix=[target_prefix])
|
136 |
target = results[0].hypotheses[0][1:]
|
137 |
translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
|
@@ -150,7 +150,7 @@ def translate_text(text, translator, tokenizer):
|
|
150 |
logging.error(f"An error occurred during translation: {e}")
|
151 |
return None
|
152 |
|
153 |
-
def translate_item_ufb(item, raw_file_path, translator, tokenizer):
|
154 |
try:
|
155 |
# Translate the prompt directly since it's a string
|
156 |
translated_prompt = translate_text(item['prompt'], translator, tokenizer)
|
@@ -158,12 +158,12 @@ def translate_item_ufb(item, raw_file_path, translator, tokenizer):
|
|
158 |
# Translate the chosen and rejected contents
|
159 |
translated_chosen = []
|
160 |
for choice in item['chosen']:
|
161 |
-
translated_content = translate_text(choice['content'], translator, tokenizer)
|
162 |
translated_chosen.append({'content': translated_content, 'role': choice['role']})
|
163 |
|
164 |
translated_rejected = []
|
165 |
for choice in item['rejected']:
|
166 |
-
translated_content = translate_text(choice['content'], translator, tokenizer)
|
167 |
translated_rejected.append({'content': translated_content, 'role': choice['role']})
|
168 |
|
169 |
# Write the raw response to a backup file
|
@@ -211,7 +211,7 @@ def validate_item_ufb(item):
|
|
211 |
|
212 |
|
213 |
|
214 |
-
def translate_item_mix(item, raw_file_path, translator, tokenizer):
|
215 |
"""
|
216 |
Translates the relevant fields in the given item from English to German using CTranslate2 and the WMT21 model,
|
217 |
and saves the raw response to a backup file.
|
@@ -221,12 +221,12 @@ def translate_item_mix(item, raw_file_path, translator, tokenizer):
|
|
221 |
# Translate each part of the prompt separately and preserve the order
|
222 |
translated_prompts = []
|
223 |
for message in item['prompt']:
|
224 |
-
translated_content = translate_text(message['content'], translator, tokenizer)
|
225 |
translated_prompts.append({'content': translated_content, 'role': message['role']})
|
226 |
|
227 |
# Translate the chosen and rejected contents
|
228 |
-
translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer)
|
229 |
-
translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer)
|
230 |
|
231 |
# Write the raw response to a backup file
|
232 |
with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
|
@@ -276,13 +276,13 @@ def validate_item_mix(item):
|
|
276 |
|
277 |
return True
|
278 |
|
279 |
-
def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
|
280 |
try:
|
281 |
translated_texts = {} # Cache to store translated texts
|
282 |
|
283 |
# Translate the prompt if necessary (which is a user input and can appear again)
|
284 |
if item['prompt'] not in translated_texts:
|
285 |
-
translated_prompt = translate_text(item['prompt'], translator, tokenizer)
|
286 |
translated_texts[item['prompt']] = translated_prompt
|
287 |
else:
|
288 |
translated_prompt = translated_texts[item['prompt']]
|
@@ -290,7 +290,7 @@ def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer):
|
|
290 |
# Helper function to handle content translation with caching
|
291 |
def get_translated_content(content):
|
292 |
if content not in translated_texts:
|
293 |
-
translated_texts[content] = translate_text(content, translator, tokenizer)
|
294 |
return translated_texts[content]
|
295 |
|
296 |
# Process translations for chosen and rejected sections
|
@@ -349,7 +349,7 @@ def validate_item_ufb_cached(item):
|
|
349 |
|
350 |
return True
|
351 |
|
352 |
-
def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type):
|
353 |
try:
|
354 |
# Assigning validation and translation functions based on model_type
|
355 |
if model_type == "mix":
|
@@ -387,7 +387,7 @@ def process_file(input_file_path, output_file_path, raw_file_path, line_indices,
|
|
387 |
retry_count = 0
|
388 |
while translated_item is None and retry_count < 3:
|
389 |
print ("going to translate the item...")
|
390 |
-
translated_item = translate_item(item, raw_file_path, translator, tokenizer)
|
391 |
retry_count += 1
|
392 |
if translated_item is None:
|
393 |
logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
|
@@ -485,7 +485,7 @@ def upload_output_to_huggingface(output_file_path, repo_name, token):
|
|
485 |
print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
|
486 |
raise
|
487 |
|
488 |
-
def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer):
|
489 |
try:
|
490 |
# Download the Parquet file
|
491 |
download_parquet(train_url, local_parquet_path)
|
@@ -527,7 +527,7 @@ def translate_dataset(train_url, local_parquet_path, input_file_path, output_fil
|
|
527 |
|
528 |
try:
|
529 |
# Process the file with specified model type and line indices
|
530 |
-
process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type)
|
531 |
except Exception as e:
|
532 |
logging.error(f"Failed to process the file {input_file_path}: {e}")
|
533 |
return
|
|
|
89 |
line_indices.append(single_line)
|
90 |
return line_indices
|
91 |
|
92 |
+
def translate_text(text, translator, tokenizer, target_language):
|
93 |
"""
|
94 |
Translates the given text from English to the specified target language using CTranslate2 and the WMT21 model,
|
95 |
with special handling for newlines and segmenting text longer than 500 characters.
|
|
|
131 |
translated_segments = []
|
132 |
for segment in segments:
|
133 |
source = tokenizer.convert_ids_to_tokens(tokenizer.encode(segment))
|
134 |
+
target_prefix = [tokenizer.lang_code_to_token[target_language]]
|
135 |
results = translator.translate_batch([source], target_prefix=[target_prefix])
|
136 |
target = results[0].hypotheses[0][1:]
|
137 |
translated_segment = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
|
|
|
150 |
logging.error(f"An error occurred during translation: {e}")
|
151 |
return None
|
152 |
|
153 |
+
def translate_item_ufb(item, raw_file_path, translator, tokenizer, target_language):
|
154 |
try:
|
155 |
# Translate the prompt directly since it's a string
|
156 |
translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language)
|
|
|
158 |
# Translate the chosen and rejected contents
|
159 |
translated_chosen = []
|
160 |
for choice in item['chosen']:
|
161 |
+
translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
|
162 |
translated_chosen.append({'content': translated_content, 'role': choice['role']})
|
163 |
|
164 |
translated_rejected = []
|
165 |
for choice in item['rejected']:
|
166 |
+
translated_content = translate_text(choice['content'], translator, tokenizer, target_language)
|
167 |
translated_rejected.append({'content': translated_content, 'role': choice['role']})
|
168 |
|
169 |
# Write the raw response to a backup file
|
|
|
211 |
|
212 |
|
213 |
|
214 |
+
def translate_item_mix(item, raw_file_path, translator, tokenizer, target_language):
|
215 |
"""
|
216 |
Translates the relevant fields in the given item from English to the specified target language using CTranslate2 and the WMT21 model,
|
217 |
and saves the raw response to a backup file.
|
|
|
221 |
# Translate each part of the prompt separately and preserve the order
|
222 |
translated_prompts = []
|
223 |
for message in item['prompt']:
|
224 |
+
translated_content = translate_text(message['content'], translator, tokenizer, target_language)
|
225 |
translated_prompts.append({'content': translated_content, 'role': message['role']})
|
226 |
|
227 |
# Translate the chosen and rejected contents
|
228 |
+
translated_chosen_content = translate_text(item['chosen'][0]['content'], translator, tokenizer, target_language)
|
229 |
+
translated_rejected_content = translate_text(item['rejected'][0]['content'], translator, tokenizer, target_language)
|
230 |
|
231 |
# Write the raw response to a backup file
|
232 |
with open(raw_file_path, 'a', encoding='utf-8') as raw_file:
|
|
|
276 |
|
277 |
return True
|
278 |
|
279 |
+
def translate_item_ufb_cached(item, raw_file_path, translator, tokenizer, target_language):
|
280 |
try:
|
281 |
translated_texts = {} # Cache to store translated texts
|
282 |
|
283 |
# Translate the prompt if necessary (which is a user input and can appear again)
|
284 |
if item['prompt'] not in translated_texts:
|
285 |
+
translated_prompt = translate_text(item['prompt'], translator, tokenizer, target_language)
|
286 |
translated_texts[item['prompt']] = translated_prompt
|
287 |
else:
|
288 |
translated_prompt = translated_texts[item['prompt']]
|
|
|
290 |
# Helper function to handle content translation with caching
|
291 |
def get_translated_content(content):
|
292 |
if content not in translated_texts:
|
293 |
+
translated_texts[content] = translate_text(content, translator, tokenizer, target_language)
|
294 |
return translated_texts[content]
|
295 |
|
296 |
# Process translations for chosen and rejected sections
|
|
|
349 |
|
350 |
return True
|
351 |
|
352 |
+
def process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language):
|
353 |
try:
|
354 |
# Assigning validation and translation functions based on model_type
|
355 |
if model_type == "mix":
|
|
|
387 |
retry_count = 0
|
388 |
while translated_item is None and retry_count < 3:
|
389 |
print ("going to translate the item...")
|
390 |
+
translated_item = translate_item(item, raw_file_path, translator, tokenizer, target_language)
|
391 |
retry_count += 1
|
392 |
if translated_item is None:
|
393 |
logging.warning(f"Translation failed for item. Retry attempt: {retry_count}")
|
|
|
485 |
print(f"Failed to upload {output_file_path} to Hugging Face: {e}")
|
486 |
raise
|
487 |
|
488 |
+
def translate_dataset(train_url, local_parquet_path, input_file_path, output_file_path, raw_file_path, range_specification, model_type, output_dir, output_repo_name, token, translator, tokenizer, target_language):
|
489 |
try:
|
490 |
# Download the Parquet file
|
491 |
download_parquet(train_url, local_parquet_path)
|
|
|
527 |
|
528 |
try:
|
529 |
# Process the file with specified model type and line indices
|
530 |
+
process_file(input_file_path, output_file_path, raw_file_path, line_indices, translator, tokenizer, model_type, target_language)
|
531 |
except Exception as e:
|
532 |
logging.error(f"Failed to process the file {input_file_path}: {e}")
|
533 |
return
|