willtensora committed
Commit 31980b7 · verified · 1 Parent(s): 272ce4c

Training in progress, step 40

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
00000000-0000-0000-0000-000000000000.yml ADDED
@@ -0,0 +1,50 @@
+ base_model: peft-internal-testing/tiny-dummy-qwen2
+ batch_size: 8
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - format: custom
+   path: argilla/databricks-dolly-15k-curated-en
+   type:
+     field_input: original-instruction
+     field_instruction: original-instruction
+     field_output: original-response
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/test-repo
+ hub_strategy: checkpoint
+ learning_rate: 0.002
+ load_best_model_at_end: true
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 1
+ micro_batch_size: 1
+ model_type: AutoModelForCausalLM
+ num_epochs: 100
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 8
+ tokenizer_type: Qwen2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.001
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: peft-internal-testing/tiny-dummy-qwen2-argilla/databricks-dolly-15k-curated-en
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
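Editor's note: the configs added in this commit are plain axolotl-style YAML. A minimal sketch of loading and inspecting one of them with PyYAML (assumes the file above sits in the current directory and PyYAML is installed; illustrative only, not part of the commit):

```python
import yaml

# Load the training config added above and read a few of the fields axolotl consumes.
with open("00000000-0000-0000-0000-000000000000.yml") as f:
    cfg = yaml.safe_load(f)

print(cfg["base_model"])            # peft-internal-testing/tiny-dummy-qwen2
print(cfg["learning_rate"])         # 0.002
print(cfg["datasets"][0]["path"])   # argilla/databricks-dolly-15k-curated-en
```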
01b60291-41f3-4631-b7e8-f7c60c2ca163.yml ADDED
@@ -0,0 +1,53 @@
+ base_model: huggyllama/llama-7b
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - ccd32583f980ebf0_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/ccd32583f980ebf0_train_data.json
+   type:
+     field_input: ''
+     field_instruction: problem
+     field_output: solution
+     format: '{instruction}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/fa36bf4c-34a6-4e51-ae14-a8372bf92b39
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ special_tokens:
+   pad_token: </s>
+ tokenizer_type: LlamaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: huggyllama/llama-7b-/workspace/input_data/ccd32583f980ebf0_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
03a659ff-e350-4bb9-8ff3-8c658a5d0dff.yml ADDED
@@ -0,0 +1,52 @@
+ base_model: fxmarty/tiny-llama-fast-tokenizer
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - fc6136aac03f618a_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/fc6136aac03f618a_train_data.json
+   type:
+     field_instruction: text
+     field_output: title
+     format: '{instruction}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/b1c9c4ec-ffa2-429d-9c5b-90b5979c502d
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ special_tokens:
+   pad_token: </s>
+ tokenizer_type: LlamaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: fxmarty/tiny-llama-fast-tokenizer-/workspace/input_data/fc6136aac03f618a_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
077fd330-87f9-4bc4-b449-7713fbdaf1b0.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: unsloth/mistral-7b-v0.3
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - ca0152973425c947_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/ca0152973425c947_train_data.json
+   type:
+     field_input: code
+     field_instruction: func_name
+     field_output: docstring
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/5a2f5ce6-446b-4282-bb4d-9ee4e970231f
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: LlamaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/mistral-7b-v0.3-/tmp/ca0152973425c947_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
1dc178e8-8f66-48ae-8ebb-825428c168d0.yml ADDED
@@ -0,0 +1,53 @@
+ base_model: NousResearch/Yarn-Mistral-7b-64k
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - bccab6bcbcb6fc03_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/bccab6bcbcb6fc03_train_data.json
+   type:
+     field_input: choices
+     field_instruction: full_prompt
+     field_output: example
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/93085afc-6d0b-49ca-ac4a-839ea57462a9
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ special_tokens:
+   pad_token: </s>
+ tokenizer_type: LlamaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: NousResearch/Yarn-Mistral-7b-64k-/workspace/input_data/bccab6bcbcb6fc03_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
21315ae5-16ee-43cd-9612-743524060933.yml ADDED
@@ -0,0 +1,50 @@
+ base_model: unsloth/Meta-Llama-3.1-8B
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 562fa3aeea07046a_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/562fa3aeea07046a_train_data.json
+   type:
+     field_instruction: prompt
+     field_output: text
+     format: '{instruction}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/c4596edc-efad-4776-86a1-caa06bffcada
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: PreTrainedTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/Meta-Llama-3.1-8B-/workspace/input_data/562fa3aeea07046a_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
284c3982-7bc3-4e42-a78c-849f03798c5f.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: unsloth/Qwen2-7B-Instruct
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 3e306f9221b79797_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/3e306f9221b79797_train_data.json
+   type:
+     field_input: dialogue
+     field_instruction: rendered_input
+     field_output: summary
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/f3c9c6cc-5806-45b4-aab9-d03de6022b3a
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: Qwen2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/Qwen2-7B-Instruct-/workspace/input_data/3e306f9221b79797_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
2eaa630f-7785-4ca3-b46f-be41dcf74f78.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: katuni4ka/tiny-random-qwen1.5-moe
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 95544452e61c7393_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/95544452e61c7393_train_data.json
+   type:
+     field_input: input
+     field_instruction: instruction
+     field_output: output
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/e61e89f0-854a-4922-8d25-dae435e91af0
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: Qwen2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: katuni4ka/tiny-random-qwen1.5-moe-/workspace/input_data/95544452e61c7393_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
40f27435-f59d-488f-b2d6-01e356d79c48.yml ADDED
@@ -0,0 +1,50 @@
+ base_model: Qwen/Qwen2-1.5B-Instruct
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - df925134bb2c32b8_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/df925134bb2c32b8_train_data.json
+   type:
+     field_instruction: prompt
+     field_output: amoral
+     format: '{instruction}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/ba640bbe-3257-40d8-88fe-26152f412bb7
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: Qwen2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: Qwen/Qwen2-1.5B-Instruct-/tmp/df925134bb2c32b8_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
427d02be-6008-4556-9a5e-9c7cb7503058.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: unsloth/Phi-3.5-mini-instruct
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 7e5b54272524b996_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/7e5b54272524b996_train_data.json
+   type:
+     field_input: input
+     field_instruction: instruction
+     field_output: output
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/ae26a9e9-089e-4d4a-b592-d8935df7c18d
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: LlamaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/Phi-3.5-mini-instruct-/workspace/input_data/7e5b54272524b996_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
54c39bbc-809b-4c67-a254-0e03a4884b4e.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: unsloth/codegemma-7b-it
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 2ebe89763cb3150d_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/2ebe89763cb3150d_train_data.json
+   type:
+     field_input: input
+     field_instruction: instruction
+     field_output: output
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/a0bc38f8-fcd3-4d7e-9a3f-3aa2e8a4204f
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: GemmaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/codegemma-7b-it-/tmp/2ebe89763cb3150d_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
5ff7bf5f-96dc-43dd-aeeb-560c0ab78db8.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: NousResearch/Hermes-3-Llama-3.1-8B
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 69447058613b41d8_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/69447058613b41d8_train_data.json
+   type:
+     field_input: sectionParentTitre
+     field_instruction: title_main
+     field_output: texte
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/942aa5fc-b540-46ce-b482-e38c4f637264
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: PreTrainedTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: NousResearch/Hermes-3-Llama-3.1-8B-/workspace/input_data/69447058613b41d8_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
63345f8a-4ec9-47f0-9956-6eaa52b2c2a6.yml ADDED
@@ -0,0 +1,52 @@
+ base_model: heegyu/WizardVicuna-open-llama-3b-v2
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - fe9267419ea75ad2_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/fe9267419ea75ad2_train_data.json
+   type:
+     field_instruction: ca_topic
+     field_output: article
+     format: '{instruction}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/7114c34f-852f-43da-b985-b7f0b6d6d724
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ special_tokens:
+   pad_token: </s>
+ tokenizer_type: LlamaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: heegyu/WizardVicuna-open-llama-3b-v2-/tmp/fe9267419ea75ad2_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
6c7ae056-3b4d-460b-ba7b-a4000f32b3f1.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: unsloth/gemma-2-2b
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - b98d5b59c20c6595_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/b98d5b59c20c6595_train_data.json
+   type:
+     field_input: metadata
+     field_instruction: text
+     field_output: tags_str
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/429ee307-6dd2-4dd7-9e1d-7384d807a3df
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: GemmaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/gemma-2-2b-/tmp/b98d5b59c20c6595_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
75b21ca4-feab-4bdd-92b0-ea6d90dfa18f.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - c6adcdcb593a3ee4_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/c6adcdcb593a3ee4_train_data.json
+   type:
+     field_input: abstract
+     field_instruction: question_en_origin
+     field_output: answer_en_origin
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/2faf844e-4a0a-4d23-95f4-a055e4864133
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: Qwen2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: Qwen/Qwen2.5-1.5B-Instruct-/workspace/input_data/c6adcdcb593a3ee4_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
7a4a0d08-b201-4939-999e-8cad606c5cdd.yml ADDED
@@ -0,0 +1,50 @@
+ base_model: NousResearch/GPT4-x-Vicuna-13b-fp16
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - dcf32f9d35bdd1f9_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/dcf32f9d35bdd1f9_train_data.json
+   type:
+     field_instruction: doc_text
+     field_output: summary_text
+     format: '{instruction}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/009ef170-2771-4ab8-8e1b-9a9d2a2e1e2b
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 2
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: LlamaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: NousResearch/GPT4-x-Vicuna-13b-fp16-/workspace/input_data/dcf32f9d35bdd1f9_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
879db250-c3f5-4d43-a7c5-c5a456ae5803.yml ADDED
@@ -0,0 +1,50 @@
+ base_model: unsloth/Qwen2.5-Coder-1.5B-Instruct
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 4d85b564dafa38db_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/4d85b564dafa38db_train_data.json
+   type:
+     field_instruction: prompt
+     field_output: response
+     format: '{instruction}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/876ff803-5357-4240-8766-c54166515403
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: Qwen2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/Qwen2.5-Coder-1.5B-Instruct-/workspace/input_data/4d85b564dafa38db_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
8910478d-79cf-499e-8fed-7a2142f7ee60.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: unsloth/Phi-3-medium-4k-instruct
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - f6199f34ade98809_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/f6199f34ade98809_train_data.json
+   type:
+     field_input: choices
+     field_instruction: question
+     field_output: answer
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/2d37ba50-cd70-4895-be62-3477f5193e86
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: LlamaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/Phi-3-medium-4k-instruct-/tmp/f6199f34ade98809_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
README.md ADDED
@@ -0,0 +1,126 @@
+ ---
+ library_name: transformers
+ base_model: katuni4ka/tiny-random-qwen1.5-moe
+ tags:
+ - axolotl
+ - generated_from_trainer
+ model-index:
+ - name: e61e89f0-854a-4922-8d25-dae435e91af0
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.4.1`
+ ```yaml
+ base_model: katuni4ka/tiny-random-qwen1.5-moe
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 95544452e61c7393_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/95544452e61c7393_train_data.json
+   type:
+     field_input: input
+     field_instruction: instruction
+     field_output: output
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/e61e89f0-854a-4922-8d25-dae435e91af0
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: Qwen2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: katuni4ka/tiny-random-qwen1.5-moe-/workspace/input_data/95544452e61c7393_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
+
+ ```
+
+ </details><br>
+
+ # e61e89f0-854a-4922-8d25-dae435e91af0
+
+ This model is a fine-tuned version of [katuni4ka/tiny-random-qwen1.5-moe](https://huggingface.co/katuni4ka/tiny-random-qwen1.5-moe) on the None dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 11.6281
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - total_train_batch_size: 32
+ - total_eval_batch_size: 32
+ - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 2
+ - training_steps: 40
+
+ ### Training results
+
+ | Training Loss | Epoch  | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | No log        | 0.0031 | 1    | 11.9223         |
+ | 11.7325       | 0.0629 | 20   | 11.6783         |
+ | 11.6304       | 0.1258 | 40   | 11.6281         |
+
+
+ ### Framework versions
+
+ - Transformers 4.46.0
+ - Pytorch 2.5.0+cu124
+ - Datasets 3.0.1
+ - Tokenizers 0.20.1
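+
+ ## How to use (sketch)
+
+ A minimal, untested sketch of loading this checkpoint with the Transformers library. The repo id is taken from `hub_model_id` in the config above, and the Alpaca-style prompt is an assumption based on the `### Instruction:` chat template shipped in this repo's `tokenizer_config.json`; adjust both as needed.
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ repo_id = "willtensora/e61e89f0-854a-4922-8d25-dae435e91af0"  # hub_model_id from the config above
+ tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
+
+ # Alpaca-style prompt (assumed from the chat template); generate a short continuation.
+ inputs = tokenizer("### Instruction: Say hello\n\n", return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=20)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```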
added_tokens.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "<|PAD_TOKEN|>": 151646,
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644
+ }
ba646963-47d5-4d28-bb73-74fd1aef7feb.yml ADDED
@@ -0,0 +1,50 @@
+ base_model: unsloth/Qwen2-0.5B
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 1c60ef1fa1ddd4a9_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/1c60ef1fa1ddd4a9_train_data.json
+   type:
+     field_instruction: en
+     field_output: es
+     format: '{instruction}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/054d8bb5-59eb-4c69-9472-ab1b71a92df6
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: Qwen2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/Qwen2-0.5B-/workspace/input_data/1c60ef1fa1ddd4a9_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_name_or_path": "fxmarty/tiny-random-GemmaForCausalLM",
+   "architectures": [
+     "GemmaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 2,
+   "eos_token_id": 1,
+   "head_dim": 8,
+   "hidden_act": "gelu",
+   "hidden_activation": "gelu_pytorch_tanh",
+   "hidden_size": 32,
+   "initializer_range": 0.02,
+   "intermediate_size": 2,
+   "max_position_embeddings": 2048,
+   "model_type": "gemma",
+   "num_attention_heads": 2,
+   "num_hidden_layers": 1,
+   "num_key_value_heads": 1,
+   "pad_token_id": 0,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.0",
+   "use_cache": false,
+   "vocab_size": 256000
+ }
da9e44b3-e4fb-4905-9c7c-6b03aad6b593.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: unsloth/SmolLM2-360M-Instruct
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - f1ccd02a885008e6_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/f1ccd02a885008e6_train_data.json
+   type:
+     field_input: target
+     field_instruction: user
+     field_output: assistant
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/3da0a03a-adbb-42e3-8fd7-bd7c0b1d3e9f
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: GPT2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/SmolLM2-360M-Instruct-/tmp/f1ccd02a885008e6_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
e51d7c64-39d2-4079-b777-8892db299c2a.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: fxmarty/tiny-random-GemmaForCausalLM
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - b7c2a4a781c93416_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/b7c2a4a781c93416_train_data.json
+   type:
+     field_input: context
+     field_instruction: question
+     field_output: answer
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/fd1980a0-7e71-4e52-addb-318dca5991d5
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: GemmaTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: fxmarty/tiny-random-GemmaForCausalLM-/workspace/input_data/b7c2a4a781c93416_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
ee62f35d-1a99-4f1c-a69c-c91bc444b71f.yml ADDED
@@ -0,0 +1,53 @@
+ base_model: EleutherAI/pythia-1b
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - b2a4966d9a5c880e_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/b2a4966d9a5c880e_train_data.json
+   type:
+     field_input: input
+     field_instruction: instruction
+     field_output: output
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/ee937811-31d0-4e11-944a-f4f8e06309d2
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ special_tokens:
+   pad_token: <|endoftext|>
+ tokenizer_type: GPTNeoXTokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: EleutherAI/pythia-1b-/workspace/input_data/b2a4966d9a5c880e_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
ef61f40b-eca8-4670-964b-fdd3d1d0f066.yml ADDED
@@ -0,0 +1,51 @@
+ base_model: unsloth/SmolLM-135M
+ batch_size: 32
+ bf16: true
+ chat_template: tokenizer_default_fallback_alpaca
+ datasets:
+ - data_files:
+   - 658988857b0a29c9_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/658988857b0a29c9_train_data.json
+   type:
+     field_input: choices
+     field_instruction: subject
+     field_output: question
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ eval_steps: 20
+ flash_attention: true
+ gpu_memory_limit: 80GiB
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: willtensora/09370687-f28e-45e5-91f6-f87011850a94
+ hub_strategy: checkpoint
+ learning_rate: 0.0002
+ logging_steps: 10
+ lr_scheduler: cosine
+ max_steps: 2500
+ micro_batch_size: 4
+ model_type: AutoModelForCausalLM
+ optimizer: adamw_bnb_8bit
+ output_dir: /workspace/axolotl/configs
+ pad_to_sequence_len: true
+ resize_token_embeddings_to_32x: false
+ sample_packing: false
+ save_steps: 40
+ save_total_limit: 1
+ sequence_len: 2048
+ tokenizer_type: GPT2TokenizerFast
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 0.1
+ wandb_entity: ''
+ wandb_mode: online
+ wandb_name: unsloth/SmolLM-135M-/workspace/input_data/658988857b0a29c9_train_data.json
+ wandb_project: Gradients-On-Demand
+ wandb_run: your_name
+ wandb_runid: default
+ warmup_ratio: 0.05
+ xformers_attention: true
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": 151643,
+   "transformers_version": "4.46.0"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e11886517a082333f99a75ace51f18d2ba9ac990b21393d6fc072e02aa4186f
+ size 16388832
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:244c30fce0d5c4892e3b25d25e50c952fa49cb08493bb32684f850179545a7e3
+ size 19817334
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a63c485bbbab0efcfc1ffe32fd177108a9f70be1875ea3aacad3c4f064a5974b
+ size 34315097
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
+ size 4241003
tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<eos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<bos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<bos>",
+   "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<eos>",
+   "legacy": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "GemmaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false,
+   "use_fast": true
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:031c6655ddce296cb6a3d94bf34f1b61cb6caff63f630e0535dae289ad4b1be8
+ size 6584
vocab.json ADDED
The diff for this file is too large to render. See raw diff