Fix some bugs
- src/run.sh +3 -3
- src/run_clm_flax.py +2 -0
src/run.sh
CHANGED
@@ -9,9 +9,9 @@ export OUTPUT_DIR=/home/m3hrdadfi/code/gpt2-medium-persian
 # export CONFIG_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
 # export TOKENIZER_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
 
-export TRAIN_FILE=/home/m3hrdadfi/data/train.csv
-export VALIDATION_FILE=/home/m3hrdadfi/data/test.csv
-export TEST_FILE=/home/m3hrdadfi/code/data/test.csv
+export TRAIN_FILE=/home/m3hrdadfi/data/train-fixed.csv
+export VALIDATION_FILE=/home/m3hrdadfi/data/test-fixed.csv
+export TEST_FILE=/home/m3hrdadfi/code/data/test-fixed.csv
 # export DATASET_NAME=oscar
 # export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
 export MAX_SEQUENCE_LENGTH=512
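For context, a minimal sketch of how run.sh might hand these variables to the training script. The flag names (--output_dir, --train_file, --validation_file, --block_size, --do_train, --do_eval) come from the upstream Hugging Face run_clm_flax.py example, so this is an assumption about this fork; the stock script has no --test_file flag, so TEST_FILE is left out here.

# Hypothetical launch command, assuming upstream run_clm_flax.py flags
python src/run_clm_flax.py \
    --output_dir="$OUTPUT_DIR" \
    --train_file="$TRAIN_FILE" \
    --validation_file="$VALIDATION_FILE" \
    --block_size="$MAX_SEQUENCE_LENGTH" \
    --do_train \
    --do_eval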
src/run_clm_flax.py
CHANGED
@@ -368,6 +368,7 @@ def main():
     # dataset = dataset.map(normalizer)
     # logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
     dataset = raw_dataset
+    logger.info(f"dataset: {dataset}")
 
     # Load pretrained model and tokenizer
 
@@ -421,6 +422,7 @@ def main():
     else:
         column_names = dataset["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]
+    logger.info(f"text_column_name: {text_column_name}")
 
     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
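The two added logger.info calls are debugging aids: the first prints the dataset object (splits, column names, row counts) right after the normalizer step is bypassed, and the second prints which column will be tokenized. A self-contained sketch of the logic they instrument, using a toy datasets.DatasetDict in place of the real CSV-backed data (the toy rows are an assumption for illustration):

from datasets import Dataset, DatasetDict

# Toy stand-in for the dataset the script loads from TRAIN_FILE / VALIDATION_FILE.
raw_dataset = DatasetDict({
    "train": Dataset.from_dict({"text": ["salam", "donya"]}),
    "validation": Dataset.from_dict({"text": ["in yek test ast"]}),
})

dataset = raw_dataset
print(f"dataset: {dataset}")  # shows splits, column names, and row counts

column_names = dataset["validation"].column_names
# Fall back to the first column when no "text" column exists.
text_column_name = "text" if "text" in column_names else column_names[0]
print(f"text_column_name: {text_column_name}")  # -> text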