End of training
README.md CHANGED
````diff
@@ -33,9 +33,9 @@ datasets:
 
 dataset_prepared_path: Allama3dataset
 val_set_size: 0
-output_dir: models/Allama370b
-
-
+output_dir: models/Allama370b
+lora_model_dir: models/Allama370b/checkpoint-36
+auto_resume_from_checkpoints: true
 sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
````
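The substantive change in this first hunk is checkpoint resumption: the run appears to have been restarted from `checkpoint-36` rather than from scratch. Below is a minimal annotated sketch of the new keys, assuming standard axolotl semantics; verify against the axolotl docs for the version used here.

```yaml
# Sketch only; the comments reflect my reading of axolotl's options.
output_dir: models/Allama370b                    # where trainer checkpoints are written
lora_model_dir: models/Allama370b/checkpoint-36  # load existing LoRA adapter weights from this checkpoint
auto_resume_from_checkpoints: true               # resume from the most recent checkpoint found in output_dir
```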
````diff
@@ -59,7 +59,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 1
-num_epochs:
+num_epochs: 8
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 0.0002
````
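For scale: with `micro_batch_size: 1` and `gradient_accumulation_steps: 4`, each optimizer step accumulates 4 forward passes per GPU, i.e. 4 packed windows of 4,096 tokens each under `sample_packing`, multiplied by however many GPUs the run used; the GPU count is not recoverable from this diff.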
````diff
@@ -71,33 +71,31 @@ fp16:
 tf32: false
 hub_model_id: afrias5/Allama370b
 gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 s2_attention:
+logging_steps: 1
 warmup_steps: 10
 # eval_steps: 300
 saves_per_epoch: 1
 save_total_limit: 12
 debug:
-deepspeed:
-weight_decay: 0.0
+deepspeed:
+weight_decay: 0.0
 fsdp:
+deepspeed: deepspeed_configs/zero3_bf16.json
 fsdp_config:
 special_tokens:
 pad_token: <|end_of_text|>
-
-
-
-
-
-
 ```
 
 </details><br>
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/afrias5/llama3run/runs/
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/afrias5/llama3run/runs/9o5mcasc)
 # Allama370b
 
 This model is a fine-tuned version of [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) on the None dataset.
````
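Beyond filling in the W&B run link, the hunk above switches multi-GPU sharding to DeepSpeed ZeRO-3 via `deepspeed: deepspeed_configs/zero3_bf16.json`. Note that the resulting config also retains an earlier empty `deepspeed:` key and now lists `logging_steps: 1` twice; YAML loaders such as PyYAML keep the last occurrence of a duplicate key, so the ZeRO-3 path wins and the duplicates are harmless. The referenced JSON file is not part of this commit; for orientation, a stock ZeRO-3 + bf16 DeepSpeed config looks roughly like the sketch below. Every key here is an assumption (JSON has no comment syntax to flag them inline), so check the actual `deepspeed_configs/zero3_bf16.json` shipped with axolotl.

```json
{
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": true
  },
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto"
}
```

ZeRO stage 3 shards parameters, gradients, and optimizer states across GPUs, which is the usual way to make a 70B base model fit for LoRA-style fine-tuning.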
````diff
@@ -131,7 +129,7 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
-- num_epochs:
+- num_epochs: 8
 
 ### Training results
 
````