Ogamon committed on
Commit c8324f0
1 Parent(s): 860958e

second commit

all_results.json CHANGED
@@ -1,9 +1,10 @@
  {
- "epoch": 4.98793242156074,
- "num_input_tokens_seen": 5132288,
- "total_flos": 2.3110461174474342e+17,
- "train_loss": 0.14078616270634925,
- "train_runtime": 10317.9615,
- "train_samples_per_second": 9.634,
- "train_steps_per_second": 0.075
+ "predict_bleu-4": 86.52117906250001,
+ "predict_model_preparation_time": 0.005,
+ "predict_rouge-1": 94.453125,
+ "predict_rouge-2": 0.0,
+ "predict_rouge-l": 94.453125,
+ "predict_runtime": 15.8809,
+ "predict_samples_per_second": 160.822,
+ "predict_steps_per_second": 10.075
  }
generated_predictions.jsonl ADDED
The diff for this file is too large to render. See raw diff
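generated_predictions.jsonl is too large for the diff viewer, but it can be inspected locally. A minimal sketch, assuming each line is a JSON object with "label" and "predict" fields (the layout LLaMA-Factory usually writes; adjust the field names if the schema differs):

import json

# Sketch: peek at the first few generated predictions and count exact matches.
# The field names ("label", "predict") are assumptions about the jsonl schema.
exact = total = 0
with open("generated_predictions.jsonl", encoding="utf-8") as f:
    for i, line in enumerate(f):
        record = json.loads(line)
        label = record.get("label", "").strip()
        predict = record.get("predict", "").strip()
        exact += label == predict
        total += 1
        if i < 3:
            print(f"label:   {label!r}")
            print(f"predict: {predict!r}\n")
print(f"exact-match rate: {exact / total:.3f} over {total} samples")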
 
llamaboard_config.yaml CHANGED
@@ -1,5 +1,16 @@
+ eval.batch_size: 2
+ eval.cutoff_len: 1024
+ eval.dataset:
+ - truth_dev
+ eval.dataset_dir: data
+ eval.max_new_tokens: 512
+ eval.max_samples: '100000'
+ eval.output_dir: eval_2024-07-29-16-36-04_llama3.1_reeval
+ eval.predict: true
+ eval.temperature: 0.95
+ eval.top_p: 0.7
  top.booster: auto
- top.checkpoint_path: null
+ top.checkpoint_path: train_2024-07-29-16-36-04_llama3.1_reeval
  top.finetuning_type: full
  top.model_name: LLaMA3.1-8B-Chat
  top.quantization_bit: none
@@ -7,61 +18,3 @@ top.quantization_method: bitsandbytes
  top.rope_scaling: none
  top.template: llama3
  top.visual_inputs: false
- train.additional_target: ''
- train.badam_mode: layer
- train.badam_switch_interval: 50
- train.badam_switch_mode: ascending
- train.badam_update_ratio: 0.05
- train.batch_size: 2
- train.compute_type: bf16
- train.create_new_adapter: false
- train.cutoff_len: 1024
- train.dataset:
- - truth_train
- train.dataset_dir: data
- train.ds_offload: false
- train.ds_stage: '2'
- train.freeze_extra_modules: ''
- train.freeze_trainable_layers: 2
- train.freeze_trainable_modules: all
- train.galore_rank: 16
- train.galore_scale: 0.25
- train.galore_target: all
- train.galore_update_interval: 200
- train.gradient_accumulation_steps: 8
- train.learning_rate: 5e-6
- train.logging_steps: 1
- train.lora_alpha: 16
- train.lora_dropout: 0
- train.lora_rank: 8
- train.lora_target: ''
- train.loraplus_lr_ratio: 0
- train.lr_scheduler_type: cosine
- train.mask_history: false
- train.max_grad_norm: '1.0'
- train.max_samples: '100000'
- train.neat_packing: false
- train.neftune_alpha: 0
- train.num_train_epochs: '5.0'
- train.optim: adamw_torch
- train.packing: false
- train.ppo_score_norm: false
- train.ppo_whiten_rewards: false
- train.pref_beta: 0.1
- train.pref_ftx: 0
- train.pref_loss: sigmoid
- train.report_to: false
- train.resize_vocab: false
- train.reward_model: null
- train.save_steps: 5000
- train.shift_attn: false
- train.train_on_prompt: false
- train.training_stage: Supervised Fine-Tuning
- train.use_badam: false
- train.use_dora: false
- train.use_galore: false
- train.use_llama_pro: false
- train.use_pissa: false
- train.use_rslora: false
- train.val_size: 0
- train.warmup_steps: 10
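The updated llamaboard_config.yaml records the LLaMA Board (WebUI) state for this run as flat dotted keys: the eval.* entries describe the prediction pass over truth_dev, and top.checkpoint_path now points at the trained checkpoint. A minimal sketch of grouping those flat keys by prefix, assuming the file is read locally with PyYAML:

import yaml  # PyYAML
from collections import defaultdict

# Sketch: group the WebUI's flat dotted keys (top.*, eval.*, train.*) by prefix.
with open("llamaboard_config.yaml", encoding="utf-8") as f:
    flat = yaml.safe_load(f)

grouped = defaultdict(dict)
for dotted_key, value in flat.items():
    prefix, _, name = dotted_key.partition(".")
    grouped[prefix][name] = value

print(sorted(grouped))                       # e.g. ['eval', 'top'] after this commit
print(grouped["eval"].get("dataset"))        # ['truth_dev'] in this commit
print(grouped["top"].get("checkpoint_path"))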
predict_results.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "predict_bleu-4": 86.52117906250001,
+ "predict_model_preparation_time": 0.005,
+ "predict_rouge-1": 94.453125,
+ "predict_rouge-2": 0.0,
+ "predict_rouge-l": 94.453125,
+ "predict_runtime": 15.8809,
+ "predict_samples_per_second": 160.822,
+ "predict_steps_per_second": 10.075
+ }
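predict_results.json (the same metrics that now overwrite all_results.json above) holds the aggregate generation scores for the run. A minimal sketch for reading them back, assuming the file sits in the working directory:

import json

# Sketch: print the prediction metrics written by the evaluation run.
with open("predict_results.json", encoding="utf-8") as f:
    metrics = json.load(f)

for name in ("predict_bleu-4", "predict_rouge-1", "predict_rouge-2", "predict_rouge-l"):
    print(f"{name:>24}: {metrics[name]:.2f}")
print(f"{'predict_runtime':>24}: {metrics['predict_runtime']:.1f}s "
      f"({metrics['predict_samples_per_second']:.1f} samples/s)")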
running_log.txt CHANGED
The diff for this file is too large to render. See raw diff
 
trainer_log.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.yaml CHANGED
@@ -1,29 +1,18 @@
- bf16: true
  cutoff_len: 1024
- dataset: truth_train
  dataset_dir: data
- ddp_timeout: 180000000
- deepspeed: cache/ds_z2_config.json
- do_train: true
+ do_predict: true
+ eval_dataset: truth_dev
  finetuning_type: full
  flash_attn: auto
- gradient_accumulation_steps: 8
- include_num_input_tokens_seen: true
- learning_rate: 5.0e-06
- logging_steps: 1
- lr_scheduler_type: cosine
- max_grad_norm: 1.0
+ max_new_tokens: 512
  max_samples: 100000
- model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
- num_train_epochs: 5.0
- optim: adamw_torch
- output_dir: saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval
- packing: false
- per_device_train_batch_size: 2
- plot_loss: true
+ model_name_or_path: saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval
+ output_dir: saves/LLaMA3.1-8B-Chat/full/eval_2024-07-29-16-36-04_llama3.1_reeval
+ per_device_eval_batch_size: 2
+ predict_with_generate: true
  preprocessing_num_workers: 16
- report_to: none
- save_steps: 5000
+ quantization_method: bitsandbytes
  stage: sft
+ temperature: 0.95
  template: llama3
- warmup_steps: 10
+ top_p: 0.7
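The rewritten training_args.yaml now describes a generation-based prediction pass (do_predict, predict_with_generate, eval_dataset: truth_dev) against the checkpoint produced by the earlier training run, rather than the training run itself. A minimal sanity-check sketch; the llamafactory-cli invocation mentioned in the comment is an assumption about how LLaMA-Factory consumes such a YAML:

import yaml  # PyYAML

# Sketch: verify the YAML describes a predict/eval run, not a training run.
# (LLaMA-Factory runs are typically launched with something like
#  `llamafactory-cli train training_args.yaml`; treat that as an assumption.)
with open("training_args.yaml", encoding="utf-8") as f:
    args = yaml.safe_load(f)

assert args.get("do_predict") is True
assert args.get("predict_with_generate") is True
assert "do_train" not in args
print("model:", args["model_name_or_path"])
print("eval dataset:", args["eval_dataset"])
print("output dir:", args["output_dir"])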