aapot
committed on
Commit
·
e496b51
1
Parent(s):
7f835a3
Saving weights and logs of step 10000
Browse files- flax_model.msgpack +3 -0
- merges.txt +0 -0
- run_mlm_flax.py +1 -1
- special_tokens_map.json +0 -0
- start_train.sh +2 -1
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- vocab.json +0 -0
flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:551fdc978da02b8bb2d4c532488e94af3ed1bb471671edc02611cd5ef07f1e00
|
3 |
+
size 711588089
|
merges.txt
CHANGED
File without changes
|
run_mlm_flax.py
CHANGED
@@ -31,7 +31,7 @@ from pathlib import Path
|
|
31 |
from typing import Dict, List, Optional, Tuple
|
32 |
|
33 |
import numpy as np
|
34 |
-
from datasets import load_dataset
|
35 |
from tqdm import tqdm
|
36 |
|
37 |
import flax
|
|
|
31 |
from typing import Dict, List, Optional, Tuple
|
32 |
|
33 |
import numpy as np
|
34 |
+
from datasets import load_dataset, load_from_disk
|
35 |
from tqdm import tqdm
|
36 |
|
37 |
import flax
|
special_tokens_map.json
CHANGED
File without changes
|
start_train.sh
CHANGED
@@ -6,7 +6,7 @@ python3 run_mlm_flax.py \
|
|
6 |
--config_name="./" \
|
7 |
--tokenizer_name="./" \
|
8 |
--dataset_filepath="/researchdisk1/data/training_data_full" \
|
9 |
-
--
|
10 |
--max_seq_length="128" \
|
11 |
--pad_to_max_length \
|
12 |
--preprocessing_num_workers="96" \
|
@@ -22,4 +22,5 @@ python3 run_mlm_flax.py \
|
|
22 |
--eval_steps="10000" \
|
23 |
--logging_steps="1000" \
|
24 |
--dtype="bfloat16" \
|
|
|
25 |
--push_to_hub
|
|
|
6 |
--config_name="./" \
|
7 |
--tokenizer_name="./" \
|
8 |
--dataset_filepath="/researchdisk1/data/training_data_full" \
|
9 |
+
--tokenized_dataset_filepath="/researchdisk1/data/training_data_full_tokenized_128" \
|
10 |
--max_seq_length="128" \
|
11 |
--pad_to_max_length \
|
12 |
--preprocessing_num_workers="96" \
|
|
|
22 |
--eval_steps="10000" \
|
23 |
--logging_steps="1000" \
|
24 |
--dtype="bfloat16" \
|
25 |
+
--adafactor \
|
26 |
--push_to_hub
|
tokenizer.json
CHANGED
File without changes
|
tokenizer_config.json
CHANGED
File without changes
|
vocab.json
CHANGED
File without changes
|