Yhhxhfh committed on
Commit
62e192e
Parent: 042e1e1

Update app.py

Files changed (1): app.py +23 -15
app.py CHANGED
@@ -85,24 +85,32 @@ def load_and_train():
 
     logging.info("Dataset combinado columnas: %s", combined_dataset.column_names)
 
-    # Tokenization function in RAM
+    # Function to build a standardized 'text' field
+    def concatenate_text_fields(examples):
+        texts = []
+        for i in range(len(examples['dialog']) if 'dialog' in examples else 0):
+            if 'dialog' in examples and examples['dialog'][i]:
+                texts.append(examples['dialog'][i])
+            elif 'whole_func_string' in examples and examples['whole_func_string'][i]:
+                texts.append(examples['whole_func_string'][i])
+            elif 'func_documentation_string' in examples and examples['func_documentation_string'][i]:
+                texts.append(examples['func_documentation_string'][i])
+            else:
+                texts.append('')
+        examples['text'] = texts
+        return examples
+
+    # Build the 'text' field
+    combined_dataset = combined_dataset.map(concatenate_text_fields, batched=True)
+
+    # Tokenization function based on the 'text' field
     def tokenize_function(examples):
-        text = ""
-        if 'dialog' in examples:
-            text = examples['dialog']
-        elif 'docstring' in examples:
-            text = examples['docstring']
-        elif 'code' in examples:
-            text = examples['code']
-        if text:
-            return tokenizer(text, truncation=True, padding='max_length', max_length=512)
-        return {}
-
-    # Tokenize and keep everything in RAM
+        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
+
+    # Tokenize the dataset
     tokenized_dataset = combined_dataset.map(
         tokenize_function,
-        batched=True,
-        cache_dir=cache_dir
+        batched=True
     )
 
     # Set up training arguments
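
For reference, here is a minimal runnable sketch of the pipeline this commit introduces, exercised on a toy in-memory dataset. It is an illustration under assumptions, not the app's actual setup: the datasets and transformers libraries are assumed, "gpt2" is a stand-in for whichever tokenizer app.py really loads (not visible in this hunk), and the sample rows are invented. It also reflects why dropping cache_dir=cache_dir matters: per the datasets API, Dataset.map() takes cache_file_name rather than cache_dir, so the old keyword would raise a TypeError.

    # Minimal sketch (assumptions: datasets + transformers installed;
    # "gpt2" is a stand-in tokenizer, not necessarily the one app.py loads).
    from datasets import Dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

    # Toy stand-in for combined_dataset: each row fills exactly one source column.
    combined_dataset = Dataset.from_dict({
        "dialog": ["Hi there!", "", ""],
        "whole_func_string": ["", "def add(a, b):\n    return a + b", ""],
        "func_documentation_string": ["", "", "Adds two numbers."],
    })

    def concatenate_text_fields(examples):
        # Same logic as the commit: pick the first non-empty field per row.
        texts = []
        for i in range(len(examples["dialog"]) if "dialog" in examples else 0):
            if "dialog" in examples and examples["dialog"][i]:
                texts.append(examples["dialog"][i])
            elif "whole_func_string" in examples and examples["whole_func_string"][i]:
                texts.append(examples["whole_func_string"][i])
            elif "func_documentation_string" in examples and examples["func_documentation_string"][i]:
                texts.append(examples["func_documentation_string"][i])
            else:
                texts.append("")
        examples["text"] = texts
        return examples

    combined_dataset = combined_dataset.map(concatenate_text_fields, batched=True)

    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

    tokenized_dataset = combined_dataset.map(tokenize_function, batched=True)
    print(tokenized_dataset.column_names)
    # ['dialog', 'whole_func_string', 'func_documentation_string', 'text',
    #  'input_ids', 'attention_mask']

Design note: because map(batched=True) passes each column as a list, the per-row loop in concatenate_text_fields lets rows coming from different source datasets fall back to whichever column is actually populated. The pre-commit version only checked whether a column existed, which cannot distinguish rows once the datasets are concatenated and every row carries all columns.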