model initial

Files changed (6) hide show

out/pretrain/initial/config.json ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:94e73e086a5ed14149cee99a1aa3e2563ec7ab536c1653ff332999afa3520694
+size 546

out/pretrain/initial/hyperparameters.yaml ADDED Viewed

+model_name: tiny-llama-1.1b
+model_config:
+  name: ''
+  hf_config: {}
+  scale_embeddings: false
+  block_size: 32768
+  vocab_size: 32768
+  padding_multiple: 512
+  padded_vocab_size: 32768
+  n_layer: 10
+  n_head: 12
+  n_embd: 312
+  rotary_percentage: 1.0
+  parallel_residual: false
+  bias: false
+  lm_head_bias: false
+  n_query_groups: 4
+  shared_attention_norm: false
+  norm_class_name: RMSNorm
+  post_attention_norm: false
+  post_mlp_norm: false
+  norm_eps: 1.0e-05
+  mlp_class_name: LLaMAMLP
+  gelu_approximate: none
+  intermediate_size: 1092
+  rope_condense_ratio: 1
+  rope_base: 500000
+  n_expert: 0
+  n_expert_per_token: 0
+out_dir: ../out/pretrain
+precision: bf16-true
+resume: auto
+data:
+  class_path: litgpt.data.LitData
+  init_args:
+    data_path: ../data/
+    seed: 42
+    num_workers: 16
+train:
+  save_interval: 1000
+  log_interval: 1
+  global_batch_size: 512
+  micro_batch_size: 16
+  lr_warmup_steps: 2000
+  max_tokens: 9782206713
+  max_seq_length: 2048
+  max_norm: 1.0
+  min_lr: 4.0e-05
+eval:
+  interval: 1000
+  max_iters: 100
+  initial_validation: false
+  final_validation: false
+optimizer:
+  class_path: grokadamw.GrokAdamW
+  init_args:
+    lr: 5.0e-05
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+devices: auto
+num_nodes: 1
+tokenizer_dir: ..
+logger_name: wandb
+seed: 42

out/pretrain/initial/lit_model.pth ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:19358c65b29f92cd191e18609d9eb8ad27872d96a90c6acfed76b541648d5aa8
+size 266269738

out/pretrain/initial/model_config.yaml ADDED Viewed

+attention_logit_softcapping: null
+attention_scores_scalar: null
+bias: false
+block_size: 32768
+final_logit_softcapping: null
+gelu_approximate: none
+head_size: 26
+hf_config: {}
+intermediate_size: 1092
+lm_head_bias: false
+mlp_class_name: LLaMAMLP
+n_embd: 312
+n_expert: 0
+n_expert_per_token: 0
+n_head: 12
+n_layer: 10
+n_query_groups: 4
+name: ''
+norm_class_name: RMSNorm
+norm_eps: 1.0e-05
+padded_vocab_size: 32768
+padding_multiple: 512
+parallel_residual: false
+post_attention_norm: false
+post_mlp_norm: false
+rope_base: 500000
+rope_condense_ratio: 1
+rotary_percentage: 1.0
+scale_embeddings: false
+shared_attention_norm: false
+sliding_window_layer_placing: null
+sliding_window_size: null
+vocab_size: 32768

out/pretrain/initial/tokenizer.json ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b496a30dc268bcb8adfd551f693e68e9eadd06b81cab385c088a61e7663649c
+size 1368561

out/pretrain/initial/tokenizer_config.json ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6333d68c3280be6081b795cc160fd5872707562021f9889b2e2bd3ae508fa62
+size 23043