chitanda committed
Commit 9dd9a2a
1 Parent(s): 4ac06b6

Upload folder using huggingface_hub

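The commit message says the folder was pushed with huggingface_hub. For reference, a minimal sketch of such an upload (the repo id and local path below are placeholders, not values taken from this commit):

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` by default
api.upload_folder(
    folder_path="path/to/checkpoint",  # placeholder: local folder containing the files below
    repo_id="user/repo-name",          # placeholder: target model repo
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)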
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "/tmp/llama2.13b.wudao.sft.combine.v1.0.seq2k.w16.adamw.NA100.0803.ds/checkpoint-1750",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 5120,
+   "initializer_range": 0.02,
+   "intermediate_size": 13824,
+   "max_position_embeddings": 2048,
+   "model_type": "llama",
+   "num_attention_heads": 40,
+   "num_hidden_layers": 40,
+   "num_key_value_heads": 40,
+   "pad_token_id": 0,
+   "pretraining_tp": 2,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.31.0",
+   "use_cache": false,
+   "vocab_size": 79458
+ }
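config.json describes a LLaMA-2 13B architecture (hidden_size 5120, 40 layers, 40 heads, 2k context) with the vocabulary extended to 79458 entries. A minimal loading sketch, assuming the files in this commit live in a repo or local directory referred to here by the placeholder `repo`:

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo = "user/this-repo"  # placeholder: repo id or local path holding these files

config = AutoConfig.from_pretrained(repo)
print(config.hidden_size, config.num_hidden_layers, config.vocab_size)  # expected: 5120 40 79458

# config.json stores torch_dtype=bfloat16; loading in that dtype matches the checkpoint
model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(repo)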
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "temperature": 0.9,
+   "top_p": 0.6,
+   "transformers_version": "4.31.0"
+ }
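generation_config.json only sets sampling defaults (temperature 0.9, top_p 0.6); these take effect when sampling is enabled. A hedged usage sketch, continuing from the loading example above:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained(repo)  # temperature=0.9, top_p=0.6
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
# do_sample=True is required for temperature/top_p to apply; greedy decoding ignores them
out = model.generate(**inputs, generation_config=gen_cfg, do_sample=True, max_new_tokens=128)
print(tokenizer.decode(out[0], skip_special_tokens=True))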
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec8bca7e83bce757be0c5c86dde381c7f6c069c288da15d17522041833b9ff65
+ size 27003737780
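pytorch_model.bin is stored through Git LFS; the pointer above records only its sha256 and size (about 27 GB). `from_pretrained` resolves it automatically, or it can be fetched explicitly (repo id again a placeholder):

from huggingface_hub import hf_hub_download

local_path = hf_hub_download(repo_id="user/this-repo", filename="pytorch_model.bin")  # downloads the ~27 GB weights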
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<unk>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c6776001191a59ce6be0d949adb3b524f37fc1fbd387e48d2ab7b1310912bdf
+ size 1268131
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": false,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": null,
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
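The tokenizer files declare a LlamaTokenizer with the standard <s>/</s>/<unk> special tokens; special_tokens_map.json additionally maps pad_token to "<unk>" (id 0, matching pad_token_id in config.json), while tokenizer_config.json leaves pad_token null. A quick inspection sketch under the same placeholder repo id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(repo)
print(len(tok))                                     # expected to match vocab_size in config.json (79458)
print(tok.bos_token, tok.eos_token, tok.unk_token)  # <s> </s> <unk>
print(tok.pad_token, tok.pad_token_id)              # pad handling follows special_tokens_map.json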
training_config.yaml ADDED
@@ -0,0 +1,123 @@
+ aws_output_bucket: s3://panda-us-west-2/experiments/llama2.13b.wudao.sft.combine.legal.v1.0.seq2k.w16.adamw.NA100.0921.ds
+ data_dir: null
+ dist_load_data_barrier: false
+ train_file: data/files/tmp.json
+ dev_file: null
+ test_file: null
+ model:
+   _target_: models.llama.LlamaForConditionalGeneration.from_pretrained
+   use_peft: false
+   gradient_checkpointing: true
+   enable_flash_attention: true
+   flash_attention_vanilla_torch: true
+ read_tensor:
+   _target_: data.collators.zh_instruct.TextDatasetUnifyV3
+   pair_file_list: null
+ extended_vocab: null
+ collator:
+   _target_: data.collators.flan.FlanCollatorOverCollator
+   collator: null
+   max_seq_length: 2048
+   tokenizer: ${model_name_or_path}
+   decoder_only: true
+   padding: longest
+   padding_side: right
+ num_workers: 4
+ prefetch_factor: 2
+ do_preprocess: false
+ model_name_or_path: /tmp/llama2.13b.wudao.sft.combine.v1.0.seq2k.w16.adamw.NA100.0803.ds/checkpoint-1750
+ pretrain: null
+ exp_name: llama2.13b.wudao.sft.combine.legal.v1.0.seq2k.w16.adamw.NA100.0921.ds
+ exp_notes: null
+ output_dir: /tmp/${exp_name}
+ resume: null
+ do_train: true
+ evaluate_during_training: false
+ do_eval: false
+ eval_sub_path: checkpoint-*
+ per_gpu_train_batch_size: 2
+ per_gpu_eval_batch_size: 1
+ learning_rate: 1.0e-06
+ gradient_accumulation_steps: 8
+ weight_decay: 0.01
+ adam_epsilon: 1.0e-06
+ adam_betas: (0.9, 0.99)
+ max_grad_norm: 1.0
+ num_train_epochs: 1
+ total_dataset_len: -1
+ max_steps: 0
+ warmup_proportion: 0
+ warmup_steps: 0
+ optimizer: null
+ use_nvlamb: null
+ bit_training: null
+ logging_steps: 1
+ save_best: false
+ save_steps: 250
+ eval_steps: 250
+ ddp_eval: true
+ no_cuda: false
+ seed: 42
+ local_rank: 0
+ fp16: true
+ fp16_opt_level: O1
+ fp16_bfloat16: true
+ prediction_cfg:
+   metric: acc
+   measure: 1
+   best_checkpoint: null
+   best_result: null
+ eval_forward_fn:
+   _target_: general_util.evaluator.DiscriminatorForwardFn
+ post_process: null
+ fairscale_config:
+   _target_: general_util.fsdp_utils.default_initialize
+   fp16: ${fp16}
+   move_grads_to_cpu: false
+   move_params_to_cpu: false
+   flatten_parameters: false
+ with_lightseq: false
+ load_lr_scheduler_states: false
+ ds_cfg:
+   train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
+   gradient_accumulation_steps: ${gradient_accumulation_steps}
+   optimizer:
+     type: AdamW
+     params:
+       lr: ${learning_rate}
+       betas:
+       - 0.9
+       - 0.96
+       eps: ${adam_epsilon}
+       weight_decay: ${weight_decay}
+   scheduler:
+     type: WarmupDecayLR
+     params:
+       total_num_steps: 1474
+       warmup_max_lr: ${learning_rate}
+       warmup_num_steps: 0
+       warmup_type: linear
+   gradient_clipping: ${max_grad_norm}
+   bf16:
+     enabled: ${fp16}
+   zero_optimization:
+     stage: 1
+     contiguous_gradients: true
+     overlap_comm: true
+     reduce_scatter: true
+     reduce_bucket_size: 500000000.0
+     allgather_bucket_size: 500000000.0
+     offload_optimizer:
+       device: cpu
+       pin_memory: true
+   steps_per_print: 1
+ summary_helper:
+   _target_: general_util.tensorboard_helper.WandbWriter
+   batch_index_or_keys: null
+   outputs_index_or_keys: null
+ n_gpu: 1
+ device: cuda:0
+ train_batch_size: 2
+ eval_batch_size: null
+ world_size: 16
+ world_rank: null
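training_config.yaml is a Hydra/OmegaConf-style dump (note the `_target_` entries and `${...}` interpolations) wrapping a DeepSpeed setup: ZeRO stage 1 with the AdamW optimizer offloaded to CPU, a WarmupDecayLR schedule over 1474 steps, and bf16 enabled. A small inspection sketch, assuming the file parses as regular OmegaConf YAML:

from omegaconf import OmegaConf

cfg = OmegaConf.load("training_config.yaml")

# ${...} references resolve against root-level keys of this same file,
# e.g. ds_cfg.optimizer.params.lr -> learning_rate (1.0e-06)
print(OmegaConf.to_yaml(cfg, resolve=True))

# Sequences consumed per optimizer step:
# per_gpu_train_batch_size * gradient_accumulation_steps * world_size = 2 * 8 * 16 = 256
print(cfg.per_gpu_train_batch_size * cfg.gradient_accumulation_steps * cfg.world_size)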