Update app.py
app.py
CHANGED
@@ -85,24 +85,32 @@ def load_and_train():
 
     logging.info("Dataset combinado columnas: %s", combined_dataset.column_names)
 
-    # Function
+    # Function to create a standardized 'text' field
+    def concatenate_text_fields(examples):
+        texts = []
+        for i in range(len(examples['dialog']) if 'dialog' in examples else 0):
+            if 'dialog' in examples and examples['dialog'][i]:
+                texts.append(examples['dialog'][i])
+            elif 'whole_func_string' in examples and examples['whole_func_string'][i]:
+                texts.append(examples['whole_func_string'][i])
+            elif 'func_documentation_string' in examples and examples['func_documentation_string'][i]:
+                texts.append(examples['func_documentation_string'][i])
+            else:
+                texts.append('')
+        examples['text'] = texts
+        return examples
+
+    # Create the 'text' field
+    combined_dataset = combined_dataset.map(concatenate_text_fields, batched=True)
+
+    # Tokenization function based on the 'text' field
     def tokenize_function(examples):
-        text =
-
-
-        elif 'docstring' in examples:
-            text = examples['docstring']
-        elif 'code' in examples:
-            text = examples['code']
-        if text:
-            return tokenizer(text, truncation=True, padding='max_length', max_length=512)
-        return {}
-
-    # Tokenize and keep everything in RAM
+        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
+
+    # Tokenize the dataset
     tokenized_dataset = combined_dataset.map(
         tokenize_function,
-        batched=True
-        cache_dir=cache_dir
+        batched=True
     )
 
     # Configure training arguments
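For reference, a minimal standalone sketch of the new preprocessing flow. The GPT-2 tokenizer and the toy in-memory dataset below are hypothetical stand-ins for the tokenizer and combined_dataset built earlier in load_and_train(); only the two map() passes mirror the committed change.

from datasets import Dataset
from transformers import AutoTokenizer

# Hypothetical stand-ins: any tokenizer and any dataset with these columns behave the same way.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; padding='max_length' needs one

combined_dataset = Dataset.from_dict({
    "dialog": ["Hola, ¿cómo estás?", ""],
    "whole_func_string": ["", "def add(a, b):\n    return a + b"],
    "func_documentation_string": ["", "Add two numbers."],
})

def concatenate_text_fields(examples):
    # Pick the first non-empty source column per row and store it in 'text'.
    texts = []
    for i in range(len(examples["dialog"]) if "dialog" in examples else 0):
        if "dialog" in examples and examples["dialog"][i]:
            texts.append(examples["dialog"][i])
        elif "whole_func_string" in examples and examples["whole_func_string"][i]:
            texts.append(examples["whole_func_string"][i])
        elif "func_documentation_string" in examples and examples["func_documentation_string"][i]:
            texts.append(examples["func_documentation_string"][i])
        else:
            texts.append("")
    examples["text"] = texts
    return examples

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

combined_dataset = combined_dataset.map(concatenate_text_fields, batched=True)
tokenized_dataset = combined_dataset.map(tokenize_function, batched=True)
print(tokenized_dataset.column_names)  # now also includes 'text', 'input_ids', 'attention_mask'

With batched=True, each call to the mapped function receives columns as equal-length lists, which is why the helper indexes every column by row position i instead of reading scalar values.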