diff --git a/comment.txt b/comment.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fef8a2a8c8f96c20969774fd9fd2804c2ec1c84
--- /dev/null
+++ b/comment.txt
@@ -0,0 +1,7 @@
+Job ID: 2498282
+
+Git commit: 10e3e0a update alpaca eval gen
+
+Git branch: * main
+
+Comment: llama_moe_four_mix_freeze_gate_100
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d09b73212fa0756af8af70819e28e4363bf2e1da
--- /dev/null
+++ b/config.json
@@ -0,0 +1,373 @@
+{
+ "_name_or_path": "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new",
+ "add_weight_norm": false,
+ "architectures": [
+ "LlamaMoEForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_llama_moe.LlamaMoEConfig",
+ "AutoModel": "modeling_llama_moe_hf.LlamaMoEModel",
+ "AutoModelForCausalLM": "modeling_llama_moe_hf.LlamaMoEForCausalLM"
+ },
+ "bos_token_id": 1,
+ "calculator_type": "UniversalCalculator",
+ "capacity_factor": 1.25,
+ "drop_tokens": true,
+ "dropped_padding": "zero",
+ "eos_token_id": 2,
+ "gate_add_noise": true,
+ "gate_balance_loss_weight": 0.01,
+ "gate_network": "mlp",
+ "gate_noise_epsilon": 0.01,
+ "gate_type": "TopKBalancedNoisyGate",
+ "gate_use_balance": true,
+ "gate_use_softmax": true,
+ "gates": "mlp",
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 4096,
+ "model_type": "llama_moe",
+ "multiply_gate_scores": true,
+ "num_attention_heads": 32,
+ "num_experts": 8,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 32,
+ "num_selects": 2,
+ "pad_token_id": 0,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "score_scale_factor": 4.0,
+ "size_experts": [
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ],
+ [
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376,
+ 1376
+ ]
+ ],
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.36.2",
+ "use_cache": true,
+ "vocab_size": 32000
+}
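A quick consistency note on the expert sizing above: every layer's eight experts of width 1376 sum to the dense `intermediate_size` of 11008 (8 × 1376 = 11008), and there is one size list per hidden layer. A minimal sanity-check sketch (not part of the commit), assuming it is run from the repo root:

```python
import json

# Load the config shipped in this commit.
with open("config.json") as f:
    cfg = json.load(f)

# 8 experts of width 1376 partition the dense FFN width: 8 * 1376 == 11008.
assert all(sum(layer) == cfg["intermediate_size"] for layer in cfg["size_experts"])
# One expert-size list per transformer layer.
assert len(cfg["size_experts"]) == cfg["num_hidden_layers"]  # 32
# The "2_8" in the checkpoint name: top-2 routing over 8 experts.
assert cfg["num_selects"] == 2 and cfg["num_experts"] == 8
```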
diff --git a/configuration_llama_moe.py b/configuration_llama_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..333a778e26d30eb6e79dbc118eefe0d83889afd6
--- /dev/null
+++ b/configuration_llama_moe.py
@@ -0,0 +1,130 @@
+from transformers.configuration_utils import PretrainedConfig
+
+
+class LlamaMoEConfig(PretrainedConfig):
+ model_type = "llama_moe"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ attention_bias=False,
+ attention_dropout=0.0,
+ # -------- moe expert configs --------
+ num_experts=16,
+ num_selects=4,
+ size_experts=None,
+ # -------- moe gate configs --------
+ gate_type="TopKBalancedNoisyGate",
+ gate_network="mlp",
+ gate_use_softmax=True,
+ gate_use_balance=True,
+ gate_balance_loss_weight=1e-2,
+ gate_add_noise=True,
+ # TopKBalancedNoisyGate
+ gate_noise_epsilon=1e-2,
+ # -------- moe calculator configs --------
+ calculator_type="UniversalCalculator",
+ multiply_gate_scores=True,
+ score_scale_factor=1.0,
+ add_weight_norm=False,
+ # SwitchDropTokenCalculator
+ drop_tokens=True,
+ dropped_padding="zero",
+ capacity_factor=1.25,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+ self.attention_bias = attention_bias
+ self.attention_dropout = attention_dropout
+
+ self.num_experts = num_experts
+ self.num_selects = num_selects
+ self.size_experts = size_experts
+
+ self.gate_type = gate_type
+ self.gate_network = gate_network
+ self.gate_use_softmax = gate_use_softmax
+ self.gate_use_balance = gate_use_balance
+ self.gate_balance_loss_weight = gate_balance_loss_weight
+ self.gate_add_noise = gate_add_noise
+ self.gate_noise_epsilon = gate_noise_epsilon
+
+ self.calculator_type = calculator_type
+ self.multiply_gate_scores = multiply_gate_scores
+ self.score_scale_factor = score_scale_factor
+ self.add_weight_norm = add_weight_norm
+ self.drop_tokens = drop_tokens
+ self.dropped_padding = dropped_padding
+ self.capacity_factor = capacity_factor
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+ if (
+ rope_scaling_factor is None
+ or not isinstance(rope_scaling_factor, float)
+ or rope_scaling_factor <= 1.0
+ ):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}"
+            )
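`configuration_llama_moe.py` only declares the config class; the model itself is resolved through the `auto_map` entries in `config.json` (`modeling_llama_moe_hf.LlamaMoEForCausalLM`). A minimal usage sketch mirroring the values recorded in `config.json` (the commented `from_pretrained` call is illustrative and assumes the modeling file is present alongside the config):

```python
from configuration_llama_moe import LlamaMoEConfig

# Rebuild the checkpoint's configuration from the values in config.json.
config = LlamaMoEConfig(
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    max_position_embeddings=4096,
    num_experts=8,                   # experts per MoE layer
    num_selects=2,                   # top-2 routing
    size_experts=[[1376] * 8] * 32,  # even split of the FFN width
    gate_type="TopKBalancedNoisyGate",
    gate_network="mlp",
    score_scale_factor=4.0,
)
# num_key_value_heads defaults to num_attention_heads (plain MHA, no GQA).
assert config.num_key_value_heads == 32

# Loading the weights would go through auto_map and trust_remote_code, e.g.:
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained(".", trust_remote_code=True)
```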
diff --git a/diff.patch b/diff.patch
new file mode 100644
index 0000000000000000000000000000000000000000..6b948889322c57f8580d4b80b5c99d8a3d906a11
--- /dev/null
+++ b/diff.patch
@@ -0,0 +1,863 @@
+diff --git a/.gitignore b/.gitignore
+index c243024..8c28ce3 100644
+--- a/.gitignore
++++ b/.gitignore
+@@ -175,6 +175,7 @@ debug.py
+ wandb/
+ nohup.out
+ lm-evaluation-harness/
++bigcode-evaluation-harness/
+ results/**/*.json
+ results/**/*.jsonl
+ results/**/*.db
+diff --git a/README.md b/README.md
+index 8813a32..b276a78 100644
+--- a/README.md
++++ b/README.md
+@@ -26,6 +26,11 @@ bash scripts/data.sh
+ git clone https://github.com/EleutherAI/lm-evaluation-harness.git
+ cd lm-evaluation-harness
+ pip install -e .
++# commit: 9cfa52b
++git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git
++cd bigcode-evaluation-harness
++# pin `pyext==0.5` in `bigcode-evaluation-harness/requirements.txt`, ref: https://github.com/bigcode-project/bigcode-evaluation-harness/pull/181
++pip install -e .
+ ```
+
+ ## 📃 TODO
+diff --git a/scripts/eval.sh b/scripts/eval.sh
+deleted file mode 100644
+index 4f41b37..0000000
+--- a/scripts/eval.sh
++++ /dev/null
+@@ -1,96 +0,0 @@
+-# nohup srun -p MoE --gres gpu:1 bash scripts/eval.sh all /mnt/petrelfs/share_data/quxiaoye/models/Sheared-LLaMA-2.7B True results/Sheared-LLaMA-2.7B 1>logs/eval-all-Sheared-LLaMA-2.7B.log 2>&1 &
+-
+-mmlu() {
+- # MMLU: https://github.com/princeton-nlp/LLM-Shearing/blob/20ebd2645a8ff5fa65874e1347f9891b80e01805/icl_eval/run_eval.sh#L18
+- MODEL=$1
+- TRUST_REMOTE_CODE=$2
+- RESULT_DIR=$3
+- mkdir -p $RESULT_DIR
+-
+- lm_eval \
+- --model hf \
+- --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \
+- --tasks mmlu_computer_security,mmlu_high_school_chemistry,mmlu_philosophy,mmlu_elementary_mathematics,mmlu_prehistory,mmlu_formal_logic,mmlu_high_school_mathematics,mmlu_econometrics,mmlu_moral_scenarios,mmlu_college_mathematics,mmlu_high_school_government_and_politics,mmlu_us_foreign_policy,mmlu_high_school_world_history,mmlu_conceptual_physics,mmlu_college_medicine,mmlu_international_law,mmlu_abstract_algebra,mmlu_logical_fallacies,mmlu_machine_learning,mmlu_medical_genetics,mmlu_public_relations,mmlu_college_biology,mmlu_marketing,mmlu_electrical_engineering,mmlu_anatomy,mmlu_high_school_us_history,mmlu_high_school_biology,mmlu_miscellaneous,mmlu_high_school_psychology,mmlu_sociology,mmlu_business_ethics,mmlu_high_school_geography,mmlu_human_aging,mmlu_high_school_statistics,mmlu_moral_disputes,mmlu_professional_psychology,mmlu_global_facts,mmlu_college_physics,mmlu_nutrition,mmlu_high_school_macroeconomics,mmlu_world_religions,mmlu_professional_medicine,mmlu_high_school_computer_science,mmlu_college_chemistry,mmlu_human_sexuality,mmlu_high_school_microeconomics,mmlu_astronomy,mmlu_professional_accounting,mmlu_high_school_european_history,mmlu_jurisprudence,mmlu_professional_law,mmlu_high_school_physics,mmlu_virology,mmlu_management,mmlu_college_computer_science,mmlu_clinical_knowledge,mmlu_security_studies \
+- --num_fewshot 5 \
+- --device cuda:0 \
+- --batch_size auto \
+- --verbosity DEBUG \
+- --output_path $RESULT_DIR/mmlu.json
+-}
+-
+-bbh() {
+- # Big Bench Hard (BBH): https://arxiv.org/pdf/2210.09261.pdf
+- MODEL=$1
+- TRUST_REMOTE_CODE=$2
+- RESULT_DIR=$3
+- mkdir -p $RESULT_DIR
+-
+- lm_eval \
+- --log_samples \
+- --model hf \
+- --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \
+- --tasks bbh_fewshot_boolean_expressions,bbh_fewshot_causal_judgement,bbh_fewshot_date_understanding,bbh_fewshot_disambiguation_qa,bbh_fewshot_dyck_languages,bbh_fewshot_formal_fallacies,bbh_fewshot_geometric_shapes,bbh_fewshot_hyperbaton,bbh_fewshot_logical_deduction_five_objects,bbh_fewshot_logical_deduction_seven_objects,bbh_fewshot_logical_deduction_three_objects,bbh_fewshot_movie_recommendation,bbh_fewshot_multistep_arithmetic_two,bbh_fewshot_navigate,bbh_fewshot_object_counting,bbh_fewshot_penguins_in_a_table,bbh_fewshot_reasoning_about_colored_objects,bbh_fewshot_ruin_names,bbh_fewshot_salient_translation_error_detection,bbh_fewshot_snarks,bbh_fewshot_sports_understanding,bbh_fewshot_temporal_sequences,bbh_fewshot_tracking_shuffled_objects_five_objects,bbh_fewshot_tracking_shuffled_objects_seven_objects,bbh_fewshot_tracking_shuffled_objects_three_objects,bbh_fewshot_web_of_lies,bbh_fewshot_word_sorting \
+- --device cuda:0 \
+- --batch_size auto \
+- --verbosity DEBUG \
+- --output_path $RESULT_DIR/bbh.json
+-}
+-
+-reasoning() {
+- MODEL=$1
+- TRUST_REMOTE_CODE=$2
+- RESULT_DIR=$3
+- mkdir -p $RESULT_DIR
+-
+- lm_eval \
+- --log_samples \
+- --model hf \
+- --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \
+- --tasks gsm8k_cot \
+- --device cuda:0 \
+- --batch_size auto \
+- --verbosity DEBUG \
+- --output_path $RESULT_DIR/reasoning.json
+-}
+-
+-qa() {
+- MODEL=$1
+- TRUST_REMOTE_CODE=$2
+- RESULT_DIR=$3
+- mkdir -p $RESULT_DIR
+-
+- lm_eval \
+- --log_samples \
+- --model hf \
+- --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \
+- --tasks arc_easy,arc_challenge,boolq \
+- --num_fewshot 0 \
+- --device cuda:0 \
+- --batch_size auto \
+- --verbosity DEBUG \
+- --output_path $RESULT_DIR/qa.json
+-}
+-
+-EVAL_TASK=$1
+-shift 1
+-start=$(date +%s)
+-case $EVAL_TASK in
+- mmlu)
+- mmlu $* ;;
+- bbh)
+- bbh $* ;;
+- reasoning)
+- reasoning $* ;;
+- qa)
+- qa $* ;;
+- all)
+- mmlu $*
+- bbh $*
+- reasoning $*
+- qa $*
+- ;;
+- *)
+- echo "$EVAL_TASK not recognized!";;
+-esac
+-end=$(date +%s)
+-echo "Elapsed Time: $(($end-$start)) seconds"
+diff --git a/scripts/four_mix/freeze_gate.sh b/scripts/four_mix/freeze_gate.sh
+index d94d78c..70afb8e 100644
+--- a/scripts/four_mix/freeze_gate.sh
++++ b/scripts/four_mix/freeze_gate.sh
+@@ -83,8 +83,11 @@ num_gpus=4
+
+ python -m src.eval.gen_mt_ans \
+ --model-path $output_dir \
+- --model-id $task_name \
+- --num-gpus-total $num_gpus
++ --model-id $task_name
++
++ python -m src.eval.gen_alpaca_eval_ans \
++ --model-path $output_dir \
++ --model-id $task_name
+ }
+
+ # nohup srun -p MoE --ntasks-per-node=1 --cpus-per-task=16 --mem=128G --nodes=1 --gres=gpu:4 bash "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/scripts/one_data_steps_dynamic.sh" "llama_moe_orca_epochs_cluster_4" "auto" "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new" "data/open_orca_clustered/4" "data/open_orca_clustered_eval/4" 1>logs/llama_moe_orca_cluster_4_dynamic.log 2>&1 &
+diff --git a/scripts/gen_mt_bench_ans.sh b/scripts/gen_mt_bench_ans.sh
+deleted file mode 100644
+index f251644..0000000
+--- a/scripts/gen_mt_bench_ans.sh
++++ /dev/null
+@@ -1,32 +0,0 @@
+-#!/usr/bin/bash
+-
+-#SBATCH --job-name=moe_gen
+-#SBATCH --output=logs/%x-%j.log
+-#SBATCH --error=logs/%x-%j.log
+-
+-#SBATCH --partition=MoE
+-#SBATCH --ntasks-per-node=1
+-#SBATCH --cpus-per-task=16
+-#SBATCH --mem=64G
+-
+-#SBATCH --nodes=1
+-#SBATCH --gres=gpu:1
+-#SBATCH --quotatype=auto
+-
+-{
+- # python -m fastchat.llm_judge.gen_model_answer \
+- # --model-path outputs/sheared_llama_sharegpt/moe_sft-2411306 \
+- # --model-id sheared_llama_sharegpt
+-
+- # python -m fastchat.llm_judge.gen_model_answer \
+- # --model-path outputs/sheared_llama_uniform_mix/moe_sft-2421072 \
+- # --model-id sheared_llama_uniform_mix
+-
+- bash scripts/cp_model_files.sh outputs/llama_moe/moe_sft-2409782
+- python -m fastchat.llm_judge.gen_model_answer \
+- --model-path outputs/llama_moe/moe_sft-2409782 \
+- --model-id llama_moe_uniform_mix
+-}
+-
+-# nohup srun -p MoE -n1 -N1 --gres=gpu:1 --quotatype spot python -m fastchat.llm_judge.gen_model_answer --model-path outputs/sheared_llama_sharegpt/moe_sft-2411306 --model-id sheared_llama_sharegpt 1>logs/mt_bench_gen_sheared_llama_sharegpt.log 2>&1 &
+-# nohup srun -p MoE -n1 -N1 --gres=gpu:1 --quotatype spot python -m fastchat.llm_judge.gen_model_answer --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/llama_moe_sharegpt/moe_sft-2411309 --model-id llama_moe_sharegpt 1>logs/mt_bench_gen_llama_moe_sharegpt.log 2>&1 &
+diff --git a/scripts/multi.sh b/scripts/multi.sh
+index bcd83b8..e399761 100644
+--- a/scripts/multi.sh
++++ b/scripts/multi.sh
+@@ -100,5 +100,8 @@ nohup srun -p MoE --ntasks-per-node=1 --cpus-per-task=16 --mem=128G --nodes=1 --
+ nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_mt_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_mt_ans-llama_moe_four_mix_uniform.log 2>&1 &
+ nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_mt_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_mt_ans-sheared_four_mix_uniform.log 2>&1 &
+
+-nohup srun -p MoE --gres gpu:1 python -m src.eval.get_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_alpaca_eval-llama_moe_four_mix_uniform.log 2>&1 &
+-nohup srun -p MoE --gres gpu:1 python -m src.eval.get_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_alpaca_eval-sheared_four_mix_uniform.log 2>&1 &
++nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_alpaca_eval-llama_moe_four_mix_uniform.log 2>&1 &
++nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_alpaca_eval-sheared_four_mix_uniform.log 2>&1 &
++
++nohup srun -p MoE --gres gpu:1 bash scripts/eval/eval.sh reasoning /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_wo_gate_noise/moe_sft-2492650 True results/llama_moe_four_mix_wo_pad_wo_gate_noise 1>logs/eval-reasoning-llama_moe_four_mix_wo_pad_wo_gate_noise.log 2>&1 &
++nohup srun -p MoE --gres gpu:1 bash scripts/eval/eval.sh reasoning /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad/moe_sft-2491633 True results/llama_moe_four_mix_wo_pad 1>logs/eval-reasoning-llama_moe_four_mix_wo_pad.log 2>&1 &
+diff --git a/src/callbacks.py b/src/callbacks.py
+index a750f69..e9d0c04 100644
+--- a/src/callbacks.py
++++ b/src/callbacks.py
+@@ -6,6 +6,7 @@ import torch
+ import numpy as np
+ from loguru import logger
+ from transformers.trainer_callback import TrainerCallback, TrainerState, TrainerControl
++from transformers.utils import is_flash_attn_2_available
+
+ from src.utils.config import TrainingArguments
+ from src.utils.io import append_jsonlines
+@@ -22,6 +23,7 @@ class AdaptiveSamplingCallback(TrainerCallback):
+ criterion: Optional[Literal["min", "max", "mean"]] = "mean",
+ sim_type: Optional[Literal["cos", "l2"]] = "cos",
+ ):
++ assert is_flash_attn_2_available(), "Make sure you have flash-attn installed"
+ self.criterion = criterion
+ self.sim_type = sim_type
+ self.prob_map = {}
+@@ -74,8 +76,8 @@ class AdaptiveSamplingCallback(TrainerCallback):
+ cls,
+ ori_weights: np.ndarray,
+ delta: np.ndarray,
+- eta: float = 1.0,
+- c: float = 1e-4,
++ eta: float = 10.0,
++ c: float = 5e-2,
+ ) -> np.ndarray:
+ def _softmax(vec: np.ndarray) -> np.ndarray:
+ exps = np.exp(vec - np.max(vec))
+diff --git a/src/core/train.py b/src/core/train.py
+index 2be5558..9b1f694 100644
+--- a/src/core/train.py
++++ b/src/core/train.py
+@@ -7,13 +7,12 @@ from loguru import logger
+ from src.utils.config import ModelArguments, DataArguments, TrainingArguments
+ from src.data import (
+ SubDirWeightedPackedJsonlDataset,
+- get_uniform_sampling_ratio,
+ fault_tolerance_data_collator,
+ CachedJsonlDataset,
+ get_cached_datasets_from_dir,
+ )
+ from src.utils.io import trainer_save_model_safe
+-from src.models import LlamaMoEForCausalLM, LlamaMoEConfig
++from src.models import LlamaMoEForCausalLM, LlamaMoEConfig, DeepseekConfig, DeepseekForCausalLM
+ from src.trainer import GateLoadRecordingTrainer
+ from src.callbacks import AdaptiveSamplingCallback
+
+@@ -36,6 +35,9 @@ def get_model_and_tokenizer(
+ elif model_type == "llama_moe":
+ ConfigClass = LlamaMoEConfig
+ ModelClass = LlamaMoEForCausalLM
++ elif model_type == "deepseek":
++ ConfigClass = DeepseekConfig
++ ModelClass = DeepseekForCausalLM
+ else:
+ raise ValueError(f"Unknown model type: {model_type}")
+
+@@ -54,6 +56,21 @@ def get_model_and_tokenizer(
+ config.update(additional_config)
+ logger.info("Config ready")
+
++ tokenizer = transformers.AutoTokenizer.from_pretrained(
++ model_name_or_path,
++ cache_dir=cache_dir,
++ model_max_length=model_max_length,
++ padding_side=padding_side,
++ use_fast=False,
++ trust_remote_code=trust_remote_code,
++ )
++ if tokenizer.pad_token is None:
++ if tokenizer.unk_token is not None:
++ tokenizer.pad_token = tokenizer.unk_token
++ else:
++ tokenizer.pad_token = tokenizer.eos_token
++ logger.info(f"tokenizer ready, pad_token: {tokenizer.pad_token}")
++
+ # Load model and tokenizer
+ model = ModelClass.from_pretrained(
+ model_name_or_path,
+@@ -65,18 +82,6 @@ def get_model_and_tokenizer(
+ )
+ logger.info("model ready")
+
+- tokenizer = transformers.AutoTokenizer.from_pretrained(
+- model_name_or_path,
+- cache_dir=cache_dir,
+- model_max_length=model_max_length,
+- padding_side=padding_side,
+- use_fast=False,
+- trust_remote_code=trust_remote_code,
+- )
+- if tokenizer.pad_token != tokenizer.unk_token:
+- tokenizer.pad_token = tokenizer.unk_token
+- logger.info("tokenizer ready")
+-
+ return model, tokenizer
+
+
+@@ -117,7 +122,9 @@ def train():
+ train_dataset = SubDirWeightedPackedJsonlDataset(
+ data_args.dataset_dir_or_path,
+ tokenizer,
+- prob_map=get_uniform_sampling_ratio(data_args.dataset_dir_or_path),
++ # prob_map=get_uniform_sampling_ratio(data_args.dataset_dir_or_path),
++ # prob_map={"code": 0.25119094959816823, "math": 0.2674581878910902, "orca": 0.243050776175138, "sharegpt": 0.23830008633560357},
++ prob_map=data_args.prob_map,
+ seed=training_args.seed,
+ )
+ elif datapath.is_file():
+diff --git a/src/data.py b/src/data.py
+index d783a21..a1a8ff7 100644
+--- a/src/data.py
++++ b/src/data.py
+@@ -20,6 +20,7 @@ def preprocess(
+ instances,
+ tokenizer: transformers.PreTrainedTokenizer,
+ ) -> Dict:
++ tokenizer_legacy = getattr(tokenizer, "legacy", None)
+ conv = Conversation()
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+@@ -72,7 +73,7 @@ def preprocess(
+ # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
+
+- if i != 0 and not tokenizer.legacy:
++ if i != 0 and not tokenizer_legacy:
+ # The legacy and non-legacy modes handle special tokens differently
+ instruction_len -= 1
+
+@@ -80,7 +81,7 @@ def preprocess(
+ target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
+ cur_len += turn_len
+
+- if i != 0 and not tokenizer.legacy:
++ if i != 0 and not tokenizer_legacy:
+ # The legacy and non-legacy modes handle special tokens differently
+ cur_len -= 1
+
+diff --git a/src/eval/get_alpaca_eval_ans.py b/src/eval/get_alpaca_eval_ans.py
+deleted file mode 100644
+index 1ff3e5e..0000000
+--- a/src/eval/get_alpaca_eval_ans.py
++++ /dev/null
+@@ -1,113 +0,0 @@
+-import argparse
+-from pathlib import Path
+-
+-import torch
+-import datasets
+-from tqdm import tqdm
+-
+-from src.core.train import get_model_and_tokenizer
+-from src.utils.conversation import Conversation
+-from src.utils.io import dump_json
+-
+-
+-@torch.inference_mode()
+-def run_eval(model_path, model_id, max_new_tokens):
+- model, tokenizer = get_model_and_tokenizer(
+- "auto",
+- model_path,
+- torch_dtype=torch.bfloat16,
+- trust_remote_code=True,
+- )
+- model.cuda()
+- model.eval()
+-
+- conv = Conversation()
+- outputs = []
+- eval_set = datasets.load_dataset("tatsu-lab/alpaca_eval", "alpaca_eval")["eval"]
+- for example in tqdm(eval_set, desc="Eval"):
+- conv.append_message(conv.roles[0], example["instruction"])
+- conv.append_message(conv.roles[1], None)
+- prompt = conv.get_prompt()
+- input_ids = tokenizer([prompt], return_tensors="pt").input_ids
+- conv.clear_msg()
+- # generate here is a placeholder for your models generations
+- output_ids = model.generate(
+- input_ids.cuda(),
+- do_sample=False,
+- temperature=0.0,
+- max_new_tokens=max_new_tokens,
+- )
+- if model.config.is_encoder_decoder:
+- output_ids = output_ids[0]
+- else:
+- output_ids = output_ids[0][len(input_ids[0]) :] # noqa: E203
+- # be consistent with the template's stop_token_ids
+- if conv.stop_token_ids:
+- stop_token_ids_index = [
+- i
+- for i, id in enumerate(output_ids)
+- if id in conv.stop_token_ids
+- ]
+- if len(stop_token_ids_index) > 0:
+- output_ids = output_ids[: stop_token_ids_index[0]]
+-
+- output = tokenizer.decode(
+- output_ids,
+- spaces_between_special_tokens=False,
+- )
+- if conv.stop_str and isinstance(conv.stop_str, list):
+- stop_str_indices = sorted(
+- [
+- output.find(stop_str)
+- for stop_str in conv.stop_str
+- if output.find(stop_str) > 0
+- ]
+- )
+- if len(stop_str_indices) > 0:
+- output = output[: stop_str_indices[0]]
+- elif conv.stop_str and output.find(conv.stop_str) > 0:
+- output = output[: output.find(conv.stop_str)]
+-
+- for special_token in tokenizer.special_tokens_map.values():
+- if isinstance(special_token, list):
+- for special_tok in special_token:
+- output = output.replace(special_tok, "")
+- else:
+- output = output.replace(special_token, "")
+- output = output.strip()
+-
+- if conv.name == "xgen" and output.startswith("Assistant:"):
+- output = output.replace("Assistant:", "", 1).strip()
+-
+- example["output"] = output
+- outputs.append(example)
+-
+- outpath = Path("results/alpaca_eval") / f"{model_id}.json"
+- dump_json(outputs, outpath, indent=2)
+-
+-
+-if __name__ == "__main__":
+- parser = argparse.ArgumentParser()
+- parser.add_argument(
+- "--model-path",
+- type=str,
+- required=True,
+- help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
+- )
+- parser.add_argument(
+- "--model-id", type=str, required=True, help="A custom name for the model."
+- )
+- parser.add_argument(
+- "--max-new-token",
+- type=int,
+- default=1024,
+- help="The maximum number of new generated tokens.",
+- )
+-
+- args = parser.parse_args()
+-
+- run_eval(
+- model_path=args.model_path,
+- model_id=args.model_id,
+- max_new_tokens=args.max_new_token,
+- )
+diff --git a/src/eval/show.py b/src/eval/show.py
+index d500054..ea0c210 100644
+--- a/src/eval/show.py
++++ b/src/eval/show.py
+@@ -55,13 +55,13 @@ def collect_results(result_dir: str, verbose: bool = True) -> dict:
+ avg = sum(vals) / len(vals)
+ tot_vals.append(avg)
+ if verbose:
+- logger.info(f"task: {name}, num: {len(tasks.split(','))}, avg: {avg:.3%}")
++ logger.info(f"task: {name}, num: {len(tasks.split(','))}, avg: {100 * avg:.3f} %")
+
+ if len(tot_vals) == 0:
+ tot_avg = 0.0
+ else:
+ tot_avg = sum(tot_vals) / len(tot_vals)
+- logger.info(f"total avg: {tot_avg:.3%}")
++ logger.info(f"total avg: {100 * tot_avg:.3f} %")
+
+
+ if __name__ == "__main__":
+diff --git a/src/models/deepseek/modeling_deepseek.py b/src/models/deepseek/modeling_deepseek.py
+index 1dae56e..20498b2 100644
+--- a/src/models/deepseek/modeling_deepseek.py
++++ b/src/models/deepseek/modeling_deepseek.py
+@@ -20,6 +20,7 @@
+ """ PyTorch DeepSeek model."""
+ import math
+ import warnings
++from dataclasses import dataclass
+ from typing import List, Optional, Tuple, Union
+
+ import torch
+@@ -297,7 +298,7 @@ class DeepseekMLP(nn.Module):
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+- def forward(self, x):
++ def forward(self, x, **kwargs):
+ if self.config.pretraining_tp > 1:
+ slice = self.intermediate_size // self.config.pretraining_tp
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+@@ -328,7 +329,9 @@ class DeepseekMLP(nn.Module):
+ else:
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+- return down_proj
++ bsz, seq_len, _ = x.shape
++ load = torch.zeros(bsz * seq_len, self.config.n_routed_experts)
++ return down_proj, load
+
+
+ class MoEGate(nn.Module):
+@@ -356,7 +359,10 @@ class MoEGate(nn.Module):
+ init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+
+ def forward(self, hidden_states):
+- bsz, seq_len, h = hidden_states.shape
++ if len(hidden_states.shape) == 2:
++ bsz, h = hidden_states.shape
++ else:
++ bsz, seq_len, h = hidden_states.shape
+ ### compute gating score
+ hidden_states = hidden_states.view(-1, h)
+ logits = F.linear(hidden_states, self.weight, None)
+@@ -404,7 +410,10 @@ class MoEGate(nn.Module):
+ aux_loss = (Pi * fi).sum() * self.alpha
+ else:
+ aux_loss = None
+- return topk_idx, topk_weight, aux_loss
++ _zeros = torch.zeros_like(logits)
++ _scores_filtered = _zeros.scatter(dim=1, index=topk_idx, src=topk_weight)
++ load = (_scores_filtered > 0).sum(0)
++ return topk_idx, topk_weight, aux_loss, load
+
+
+ class AddAuxiliaryLoss(torch.autograd.Function):
+@@ -450,10 +459,19 @@ class DeepseekMoE(nn.Module):
+ config=config, intermediate_size=intermediate_size
+ )
+
+- def forward(self, hidden_states):
++ def forward(self, hidden_states, attention_mask=None):
++ bsz, seq_len, hsz = hidden_states.shape
++ hidden_states = hidden_states.reshape(-1, hsz)
++ flattened_mask = None
++ flattened_shape = None
++ if attention_mask is not None and len(attention_mask.shape) == 2:
++ flattened_mask = attention_mask.flatten()
++ flattened_shape = flattened_mask.shape
++ hidden_states = hidden_states[flattened_mask.bool()]
++
+ identity = hidden_states
+ orig_shape = hidden_states.shape
+- topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
++ topk_idx, topk_weight, aux_loss, load = self.gate(hidden_states)
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+ flat_topk_idx = topk_idx.view(-1)
+ if self.training:
+@@ -472,7 +490,15 @@ class DeepseekMoE(nn.Module):
+ ).view(*orig_shape)
+ if self.config.n_shared_experts is not None:
+ y = y + self.shared_experts(identity)
+- return y
++
++ if flattened_mask is not None:
++ _y = torch.zeros(flattened_shape + (hsz,), dtype=y.dtype, device=y.device)
++ _y[flattened_mask.bool()] = y
++ y = _y
++
++ y = y.reshape(bsz, seq_len, hsz)
++
++ return y, load
+
+ @torch.no_grad()
+ def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
+@@ -1163,7 +1189,7 @@ class DeepseekDecoderLayer(nn.Module):
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+- hidden_states = self.mlp(hidden_states)
++ hidden_states, load = self.mlp(hidden_states, attention_mask=attention_mask)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+@@ -1174,6 +1200,8 @@ class DeepseekDecoderLayer(nn.Module):
+ if use_cache:
+ outputs += (present_key_value,)
+
++ outputs += (load,)
++
+ return outputs
+
+
+@@ -1220,6 +1248,11 @@ class DeepseekPreTrainedModel(PreTrainedModel):
+ module.weight.data[module.padding_idx].zero_()
+
+
++@dataclass
++class BaseMoEModelOutputWithPast(BaseModelOutputWithPast):
++ gate_load: Optional[torch.Tensor] = None
++
++
+ Deepseek_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+@@ -1429,6 +1462,7 @@ class DeepseekModel(DeepseekPreTrainedModel):
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
++ gate_load = ()
+ next_decoder_cache = None
+
+ for decoder_layer in self.layers:
+@@ -1463,6 +1497,8 @@ class DeepseekModel(DeepseekPreTrainedModel):
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
++ gate_load += (layer_outputs[-1],)
++
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+@@ -1482,14 +1518,20 @@ class DeepseekModel(DeepseekPreTrainedModel):
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+ if v is not None
+ )
+- return BaseModelOutputWithPast(
++ return BaseMoEModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
++ gate_load=gate_load,
+ )
+
+
++@dataclass
++class MoECausalLMOutputWithPast(CausalLMOutputWithPast):
++ gate_load: Optional[torch.Tensor] = None
++
++
+ class DeepseekForCausalLM(DeepseekPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+@@ -1620,12 +1662,13 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+- return CausalLMOutputWithPast(
++ return MoECausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
++ gate_load=outputs.gate_load,
+ )
+
+ def prepare_inputs_for_generation(
+diff --git a/src/utils/config.py b/src/utils/config.py
+index 3ea5283..d4060d9 100644
+--- a/src/utils/config.py
++++ b/src/utils/config.py
+@@ -6,6 +6,7 @@ import torch
+ import transformers
+
+ from src.utils.io import load_json
++from src.data import get_uniform_sampling_ratio
+
+
+ @dataclass
+@@ -33,7 +34,9 @@ class ModelArguments:
+ )
+ attn_impl: str = field(
+ default="flash_attention_2",
+- metadata={"help": "attention implementation, choice from [eager, flash_attention_2, sdpa] (default: `flash_attention_2`)"}
++ metadata={
++ "help": "attention implementation, choice from [eager, flash_attention_2, sdpa] (default: `flash_attention_2`)"
++ },
+ )
+
+ def __post_init__(self):
+@@ -56,6 +59,18 @@ class DataArguments:
+ default="data/merged",
+ metadata={"help": "Path to dataset directory or a single jsonl file"},
+ )
++ prob_map: str = field(
++ default=None,
++ metadata={"help": "Path to the probability map file"},
++ )
++
++ def __post_init__(self):
++ if self.prob_map is not None:
++ if not pathlib.Path(self.prob_map).exists():
++ raise ValueError(f"Probability map file {self.prob_map} not found")
++ self.prob_map = load_json(self.prob_map)
++ else:
++ self.prob_map = get_uniform_sampling_ratio(self.dataset_dir_or_path)
+
+
+ @dataclass
+@@ -70,9 +85,7 @@ class TrainingArguments(transformers.TrainingArguments):
+ )
+ max_eval_steps_per_type: int = field(
+ default=10,
+- metadata={
+- "help": "Maximum number of steps to perform during evaluation."
+- },
++ metadata={"help": "Maximum number of steps to perform during evaluation."},
+ )
+ dynamic_sampling_sim_type: Literal["cos", "l2"] = field(
+ default="l2",
+@@ -88,7 +101,5 @@ class TrainingArguments(transformers.TrainingArguments):
+ )
+ freeze_gate: bool = field(
+ default=False,
+- metadata={
+- "help": "Whether to freeze the gate during training."
+- },
++ metadata={"help": "Whether to freeze the gate during training."},
+ )
+diff --git a/src/utils/visualization.py b/src/utils/visualization.py
+index 794f6c8..02bd236 100644
+--- a/src/utils/visualization.py
++++ b/src/utils/visualization.py
+@@ -180,6 +180,86 @@ def gate_load_stats(model_dir, data_dir, result_dir, update_strategy: str = "cos
+ )
+
+
++def sampling_info_stats(filepath: str, data_type: str, output_dir: str):
++ from pathlib import Path
++ import numpy as np
++ from src.utils.io import load_jsonlines
++
++ Path(output_dir).mkdir(exist_ok=True, parents=True)
++
++ data = load_jsonlines(filepath)
++ step2data = {ins["step"]: ins for ins in data}
++
++ data_types = sorted(data[0]["old_prob_map"].keys())
++ data_type_idx = data_types.index(data_type)
++
++ probs = []
++ loads = []
++ sims = []
++ steps = sorted(step2data.keys())
++ for step in steps:
++ ins = step2data[step]
++ probs.append(ins["old_prob_map"][data_type])
++ loads.append(ins["name2load"][data_type])
++ sims.append(ins["sim"][data_type_idx])
++
++ # probs
++ fig = plt.figure()
++ ax = fig.add_subplot(111)
++ ax.plot(steps, probs)
++ ax.set_title(f"Sampling Probability of {data_type}")
++ ax.set_xlabel("step")
++ fig.savefig(f"{output_dir}/prob-{data_type}.png")
++
++ # loads
++ def cv_square(data):
++ return np.var(data, axis=1) / (np.mean(data, axis=1)**2 + 1e-10)
++
++ fig = plt.figure()
++ ax = fig.add_subplot(111)
++ ax.plot(steps, cv_square(loads))
++ ax.set_title(f"cv(load)^2 of {data_type}")
++ ax.set_xlabel("step")
++ fig.savefig(f"{output_dir}/load_cv-{data_type}.png")
++
++ # sims
++ fig = plt.figure()
++ ax = fig.add_subplot(111)
++ ax.plot(steps, np.mean(sims, axis=1))
++ ax.set_title(f"Mean Similarities with {data_type}")
++ ax.set_xlabel("step")
++ fig.savefig(f"{output_dir}/sim-{data_type}.png")
++
++
++def test_sampling_convergence():
++ from collections import defaultdict
++ from src.callbacks import AdaptiveSamplingCallback
++
++ # freeze gate
++ name2load = {"code": [0.1359794776119403, 0.1333115671641791, 0.12858208955223882, 0.10330223880597016, 0.12544776119402984, 0.12625932835820897, 0.12761194029850748, 0.11950559701492537], "orca": [0.1509941502743006, 0.11721425756978752, 0.1232988815809414, 0.12714439426545024, 0.11256554420634679, 0.14008274482465977, 0.11819552632376563, 0.11050450095474797], "math": [0.15956486572028086, 0.10727138452881943, 0.11506675888262392, 0.10958069091633744, 0.11805010139847842, 0.11915200393871546, 0.13648938539627462, 0.13482480921846976], "sharegpt": [0.15337086599959998, 0.11428233411553493, 0.12873151621889287, 0.1177436980734424, 0.11538123789498336, 0.13793986642403783, 0.12419686111124664, 0.10835362016226212]} # fmt: skip
++ # # dynamic
++ # name2load = {"code": [0.14031716417910448, 0.1310634328358209, 0.12651119402985075, 0.10993470149253731, 0.12196828358208955, 0.12552238805970148, 0.12791977611940297, 0.11676305970149255], "orca": [0.15106234655836084, 0.11803640166095838, 0.12349968175067437, 0.12884551268450883, 0.11344072985178673, 0.1383778377231534, 0.11733170672566907, 0.1094057830448883], "math": [0.16001617686708006, 0.10756444371505268, 0.11391210568886491, 0.114803005615014, 0.11676650216277679, 0.1177863481308685, 0.13630182751708533, 0.13284959030325763], "sharegpt": [0.15440024978412215, 0.113654214863131, 0.12914741653941664, 0.12104040941178769, 0.11470799162832905, 0.13593110446537907, 0.12316259873058931, 0.10795601457724527]} # fmt: skip
++ names = sorted(name2load.keys())
++ callback = AdaptiveSamplingCallback()
++ callback.prob_map = {"code": 0.25, "math": 0.25, "orca": 0.25, "sharegpt": 0.25}
++ name2probs = defaultdict(list)
++ for _ in range(100):
++ for name in names:
++ name2probs[name].append(callback.prob_map[name])
++ new_name2prob, _ = callback._update_prob_map(name2load)
++ callback.prob_map = new_name2prob
++ print(f"final prob_map: {callback.prob_map}")
++
++ fig = plt.figure()
++ ax = fig.add_subplot(111)
++ for name in names:
++ ax.plot(name2probs[name], label=name)
++ ax.legend()
++ ax.set_title("Sampling Probability")
++ ax.set_xlabel("step")
++ fig.savefig("results/sampling_convergence.png")
++
++
+ if __name__ == "__main__":
+ # gate_load_stats(
+ # "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new",
+@@ -195,12 +275,12 @@ if __name__ == "__main__":
+ # "results/gate_load_vis_llama_moe_2_8_orca_4clusters",
+ # )
+
+- gate_load_stats(
+- "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new",
+- "data/four_types_mix/dev",
+- "results/debug",
+- update_strategy="l2",
+- )
++ # gate_load_stats(
++ # "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new",
++ # "data/four_types_mix/dev",
++ # "results/debug",
++ # update_strategy="l2",
++ # )
+
+ # gate_load_stats(
+ # "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new",
+@@ -227,3 +307,29 @@ if __name__ == "__main__":
+ # "results/gate_load_vis_llama_moe_2_8_four_types_mix_l2",
+ # update_strategy="l2"
+ # )
++
++ # sampling_info_stats(
++ # "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_freeze_gate/moe_sft-2491632/sampling_info/data.jsonl",
++ # "code",
++ # "results/sampling_info/llama_moe_four_mix_wo_pad_freeze_gate/code",
++ # )
++
++ # sampling_info_stats(
++ # "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad/moe_sft-2491633/sampling_info/data.jsonl",
++ # "code",
++ # "results/sampling_info/llama_moe_four_mix_wo_pad/code",
++ # )
++
++ # sampling_info_stats(
++ # "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_freeze_gate_wo_gate_noise/moe_sft-2493315/sampling_info/data.jsonl",
++ # "code",
++ # "results/sampling_info/llama_moe_four_mix_wo_pad_freeze_gate_wo_gate_noise/code",
++ # )
++
++ # sampling_info_stats(
++ # "outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_wo_gate_noise/moe_sft-2492650/sampling_info/data.jsonl",
++ # "code",
++ # "results/sampling_info/llama_moe_four_mix_wo_pad_wo_gate_noise/code",
++ # )
++
++ test_sampling_convergence()
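One of the substantive changes inside `diff.patch` is that `MoEGate.forward` now also returns a per-expert `load` vector, built by scattering the top-k routing weights into a dense `(tokens, experts)` matrix and counting the nonzero entries per expert. A self-contained sketch of that counting step with toy shapes (the names here are illustrative, not from the repo):

```python
import torch

# Toy routing: 6 tokens, 4 experts, top-2 selection.
num_tokens, num_experts, top_k = 6, 4, 2
logits = torch.randn(num_tokens, num_experts)
topk_weight, topk_idx = torch.softmax(logits, dim=-1).topk(top_k, dim=-1)

# Same pattern as the MoEGate change: scatter the selected weights back
# into a dense (tokens x experts) matrix, then count tokens per expert.
zeros = torch.zeros_like(logits)
scores_filtered = zeros.scatter(dim=1, index=topk_idx, src=topk_weight)
load = (scores_filtered > 0).sum(0)  # shape: (num_experts,)

# Every token contributes exactly top_k positive entries.
assert load.sum().item() == num_tokens * top_k
```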
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf84ec1a28ba89feb07162d95b06633a40b4975f
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.36.2"
+}
diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..29beccaf47b70d020c6d6d9b7799376ec56bc504
--- /dev/null
+++ b/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8919f505f53749e1c46511cd975e9da2c91fbcd8105ad30bc26ea2bb5fec3f38
+size 4996976432
diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8867e0f43b4565f683c43b336330afdcff22e872
--- /dev/null
+++ b/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61cb496bd075daa995cf0341587193b2e7a4d5805b4aa561bff4013b1861afff
+size 4982823704
diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0241a0fbfe0f8ce4c5b8b6fba7427f4ab0813a8b
--- /dev/null
+++ b/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21fffb1ada83903f9906325e0244222f88a5a97fdc3ab778e424f940e2d07974
+size 3501371152
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..5651c549a94cadeef10e973b9b2e37a5f20575b9
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,1098 @@
+{
+ "metadata": {
+ "total_size": 13481033728
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00003-of-00003.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.norm.weight": "model-00003-of-00003.safetensors"
+ }
+}
diff --git a/modeling_llama_moe_hf.py b/modeling_llama_moe_hf.py
new file mode 100644
index 0000000000000000000000000000000000000000..9769c70f0b6897ecd8ebecd6dd913dd57aa334c6
--- /dev/null
+++ b/modeling_llama_moe_hf.py
@@ -0,0 +1,1690 @@
+import math
+import warnings
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributions.normal import Normal
+from transformers.modeling_outputs import (
+ CausalLMOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.activations import ACT2FN
+from transformers.utils import ModelOutput, logging
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import (
+ AttentionMaskConverter,
+ _prepare_4d_attention_mask,
+ _prepare_4d_causal_attention_mask,
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from transformers.utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10
+
+from .configuration_llama_moe import LlamaMoEConfig
+
+
+if is_flash_attn_2_available():
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
+
+
+def _get_unpad_data(attention_mask):
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
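+
+# Worked example: for a padding mask [[1, 1, 0], [1, 1, 1]], `_get_unpad_data`
+# returns the flattened indices of real tokens [0, 1, 3, 4, 5], the cumulative
+# sequence lengths [0, 2, 5], and the longest unpadded length 3 -- exactly the
+# layout that `flash_attn_varlen_func` expects.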
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "LlamaMoEConfig"
+
+
+@dataclass
+class CalculatorOutput(ModelOutput):
+ hidden_states: Optional[torch.FloatTensor] = None
+ num_dropped_tokens: Optional[int] = None
+
+
+@dataclass
+class BaseMoEModelOutputWithPast(ModelOutput):
+ """
+ Args:
+ num_dropped_tokens: per-layer tuple giving the number of tokens dropped by each MoE layer's calculator
+ """
+
+ last_hidden_state: torch.FloatTensor = None
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
+ balance_loss: Optional[float] = None
+ num_dropped_tokens: Optional[Tuple[torch.Tensor]] = None
+ gate_load: Optional[Tuple[list]] = None
+ gate_importance: Optional[Tuple[list]] = None
+
+
+@dataclass
+class MoECausalLMOutputWithPast(CausalLMOutputWithPast):
+ balance_loss: Optional[float] = None
+ num_dropped_tokens: Optional[Tuple[int]] = None
+ gate_load: Optional[Tuple[List[torch.Tensor]]] = None
+ gate_importance: Optional[Tuple[List[torch.Tensor]]] = None
+
+
+@dataclass
+class MoEMlpOutput(ModelOutput):
+ hidden_states: Optional[torch.FloatTensor] = None
+ balance_loss: Optional[torch.FloatTensor] = None
+ num_dropped_tokens: Optional[int] = None
+ gate_load: Optional[list] = None
+ gate_importance: Optional[list] = None
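+
+# Presumably, balance_loss carries the gate's cv_squared-based auxiliary loss
+# (scaled by balance_loss_weight in TopKBalancedNoisyGate below), while
+# gate_load and gate_importance expose per-expert routing statistics.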
+
+
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make the causal mask used for uni-directional (autoregressive) self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
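+
+# Example: for tgt_len = 3 and no past, each [3, 3] slice of the returned
+# [bsz, 1, 3, 3] mask is
+#   [[0, m, m],
+#    [0, 0, m],
+#    [0, 0, 0]]
+# with m = torch.finfo(dtype).min, so future positions are suppressed after
+# the softmax.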
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
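+
+# Example: a padding mask [[1, 1, 0]] becomes a [1, 1, tgt_len, 3] additive
+# mask holding 0 at attended positions and torch.finfo(dtype).min at padded
+# ones, so it can simply be added to the attention logits.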
+
+
+class LlamaRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ LlamaRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
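+
+# In short: y = weight * x / sqrt(mean(x**2, dim=-1) + eps), with the mean
+# computed in float32 for numerical stability and the result cast back to the
+# input dtype.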
+
+
+class LlamaRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
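+
+# Sketch of the cache layout: inv_freq[i] = base ** (-2i / dim), and the
+# cached tables have shape [1, 1, seq_len, dim]. For instance (hypothetical
+# sizes):
+#   rope = LlamaRotaryEmbedding(dim=128, max_position_embeddings=4096)
+#   cos, sin = rope(x, seq_len=16)  # each [1, 1, 16, 128]; x only supplies
+#                                   # the device/dtype here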
+
+
+class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
+ """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
+ """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
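+
+# Together, rotate_half and apply_rotary_pos_emb implement RoPE: treating
+# (x1, x2) = (x[..., :d/2], x[..., d/2:]) as the two halves, the output is the
+# 2D rotation (x1 * cos - x2 * sin, x2 * cos + x1 * sin), i.e. a per-frequency
+# rotation by theta_j * position.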
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
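+
+# Example: with 32 query heads and 8 key/value heads, n_rep = 4 and a
+# [batch, 8, seqlen, head_dim] tensor becomes [batch, 32, seqlen, head_dim];
+# with n_rep == 1 (no grouped-query attention) this is a no-op.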
+
+
+class LlamaAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: LlamaMoEConfig, layer_idx: Optional[int] = None):
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+ if layer_idx is None:
+ logger.warning_once(
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+ "when creating this class."
+ )
+
+ self.attention_dropout = config.attention_dropout
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+ self.is_causal = True
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = LlamaRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ if self.config.pretraining_tp > 1:
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+ query_slices = self.q_proj.weight.split(
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+ )
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+ query_states = torch.cat(query_states, dim=-1)
+
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+ key_states = torch.cat(key_states, dim=-1)
+
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+ value_states = torch.cat(value_states, dim=-1)
+
+ else:
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ if self.layer_idx is None:
+ raise ValueError(
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+ "with a layer index."
+ )
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ if self.config.pretraining_tp > 1:
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+ else:
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class LlamaFlashAttention2(LlamaAttention):
+ """
+ Llama flash attention module. This module inherits from `LlamaAttention`, as the weights of the module stay
+ untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # LlamaFlashAttention2 attention does not support output_attentions
+ if "padding_mask" in kwargs:
+ warnings.warn(
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+ )
+
+ # overwrite attention_mask with padding_mask
+ attention_mask = kwargs.pop("padding_mask")
+
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ # Flash attention requires the input to have the shape
+ # batch_size x seq_length x num_heads x head_dim,
+ # so the tensors are only moved to the heads-first layout temporarily (for the rotary embedding and KV cache below)
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+ # to be able to avoid many of these transpose/reshape/view.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ dropout_rate = self.attention_dropout if self.training else 0.0
+
+ # In PEFT, we usually cast the layer norms to float32 for training stability,
+ # so the input hidden states get silently cast to float32. Hence, we need to
+ # cast them back to the correct dtype just to be sure everything works as expected.
+ # This might slow down training & inference, so it is recommended not to cast the
+ # LayerNorms to fp32. (LlamaRMSNorm handles it correctly.)
+
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+ f" {target_dtype}."
+ )
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = self._flash_attention_forward(
+ query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
+ )
+
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+ def _flash_attention_forward(
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+ ):
+ """
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
+ it first unpads the input, then computes the attention scores, and finally re-pads the final attention scores.
+
+ Args:
+ query_states (`torch.Tensor`):
+ Input query states to be passed to Flash Attention API
+ key_states (`torch.Tensor`):
+ Input key states to be passed to Flash Attention API
+ value_states (`torch.Tensor`):
+ Input value states to be passed to Flash Attention API
+ attention_mask (`torch.Tensor`):
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+ position of padding tokens and 1 for the position of non-padding tokens.
+ dropout (`float`, *optional*):
+ Attention dropout
+ softmax_scale (`float`, *optional*):
+ The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim)
+ """
+ if not self._flash_attn_uses_top_left_mask:
+ causal = self.is_causal
+ else:
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+ causal = self.is_causal and query_length != 1
+
+ # Contains at least one padding token in the sequence
+ if attention_mask is not None:
+ batch_size = query_states.shape[0]
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+ query_states, key_states, value_states, attention_mask, query_length
+ )
+
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_k=cu_seqlens_k,
+ max_seqlen_q=max_seqlen_in_batch_q,
+ max_seqlen_k=max_seqlen_in_batch_k,
+ dropout_p=dropout,
+ softmax_scale=softmax_scale,
+ causal=causal,
+ )
+
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+ else:
+ attn_output = flash_attn_func(
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+ )
+
+ return attn_output
+
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+ key_layer = index_first_axis(
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+ value_layer = index_first_axis(
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+ )
+ if query_length == kv_seq_len:
+ query_layer = index_first_axis(
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+ )
+ cu_seqlens_q = cu_seqlens_k
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
+ indices_q = indices_k
+ elif query_length == 1:
+ max_seqlen_in_batch_q = 1
+ cu_seqlens_q = torch.arange(
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
+ ) # There is a memcpy here, which is very bad.
+ indices_q = cu_seqlens_q[:-1]
+ query_layer = query_layer.squeeze(1)
+ else:
+ # The -q_len: slice assumes left padding.
+ attention_mask = attention_mask[:, -query_length:]
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+ return (
+ query_layer,
+ key_layer,
+ value_layer,
+ indices_q,
+ (cu_seqlens_q, cu_seqlens_k),
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+ )
+
+
+class LlamaSdpaAttention(LlamaAttention):
+ """
+ Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+ `LlamaAttention`, as the weights of the module stay untouched. The only changes are in the forward pass, to adapt to
+ the SDPA API.
+ """
+
+ # Adapted from LlamaAttention.forward
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ if output_attentions:
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
+ if query_states.device.type == "cuda" and attention_mask is not None:
+ query_states = query_states.contiguous()
+ key_states = key_states.contiguous()
+ value_states = value_states.contiguous()
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query_states,
+ key_states,
+ value_states,
+ attn_mask=attention_mask,
+ dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 check is necessary to match AttentionMaskConverter.to_causal_4d, which does not create a causal mask when q_len == 1.
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ attn_output = self.o_proj(attn_output)
+
+ return attn_output, None, past_key_value
+
+
+LLAMA_ATTENTION_CLASSES = {
+ "eager": LlamaAttention,
+ "flash_attention_2": LlamaFlashAttention2,
+ "sdpa": LlamaSdpaAttention,
+}
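+
+
+def _example_attention_dispatch():
+    """Illustrative sketch (not part of the upstream file): how the dispatch
+    table above is consumed. `config._attn_implementation` is set by
+    `PreTrainedModel.from_pretrained` and selects the attention backend, as
+    `LlamaMoEDecoderLayer.__init__` does below.
+    """
+    impl = "sdpa"  # or "eager" / "flash_attention_2"
+    attn_cls = LLAMA_ATTENTION_CLASSES[impl]
+    assert attn_cls is LlamaSdpaAttention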
+
+
+class TopKBalancedNoisyGate(nn.Module):
+ def __init__(
+ self,
+ input_size,
+ num_experts,
+ num_selects,
+ gate_network="mlp",
+ use_softmax=True,
+ use_balance=True,
+ balance_loss_weight=1e-2,
+ add_noise=True,
+ noise_epsilon=1e-2,
+ ):
+ super(TopKBalancedNoisyGate, self).__init__()
+ assert num_selects <= num_experts
+ self.input_size = input_size
+ self.num_experts = num_experts
+ self.num_selects = num_selects
+
+ self.gate_network_type = gate_network
+ self.gate_network = self.get_gate_network(gate_network, input_size, num_experts)
+
+ self.use_softmax = use_softmax
+ self.softmax = nn.Softmax(1)
+
+ self.use_balance = use_balance
+ self.balance_loss_weight = balance_loss_weight
+
+ # add_noise
+ self.add_noise = add_noise
+ self.noise_epsilon = noise_epsilon
+ self.warned = False
+ if self.add_noise:
+ self.weight_noise = nn.Linear(input_size, num_experts, bias=False)
+ self.weight_noise.weight.data = torch.zeros(
+ (num_experts, input_size),
+ requires_grad=True,
+ device=self.weight_noise.weight.data.device,
+ dtype=self.weight_noise.weight.data.dtype,
+ )
+ self.mean = 0.0
+ self.std = 1.0
+ self.normal = Normal(self.mean, self.std)
+ self.softplus = nn.Softplus()
+
+ self.reset_parameters()
+
+ def get_gate_network(self, gate_type, input_size, num_experts):
+ gate_type = gate_type.lower()
+
+ if gate_type == "linear":
+ gate_network = nn.Linear(input_size, num_experts, bias=False)
+ nn.init.zeros_(gate_network.weight)
+ elif gate_type == "mlp":
+ gate_network = torch.nn.Sequential(
+ torch.nn.Linear(input_size, num_experts, bias=False),
+ torch.nn.Tanh(),
+ torch.nn.Linear(num_experts, num_experts, bias=False),
+ )
+ else:
+            raise ValueError(f"Unexpected gate_type: {gate_type}.")
+
+ return gate_network
+
+ def reset_gate_network(self):
+ if "gate_network_type" not in vars(self):
+ raise KeyError(f"{type(self)} does not have a gate network.")
+ else:
+ self.gate_network = self.get_gate_network(
+ self.gate_network_type, self.input_size, self.num_experts
+ )
+
+ def reset_parameters(self):
+ if self.add_noise:
+ nn.init.zeros_(self.weight_noise.weight)
+ # nn.init.zeros_(self.weight_noise)
+
+ def cv_squared(self, x, eps=1e-10):
+        """The squared coefficient of variation of a sample:
+        cv^2 = Var(x) / (Mean(x)^2 + eps).
+        Useful as a loss to encourage a positive distribution to be more uniform.
+        An epsilon is added to the denominator for numerical stability.
+        Returns 0 for a sample of size 1.
+        Args:
+            x: a `Tensor`.
+        Returns:
+            a `Scalar`.
+        """
+ if x.shape[0] == 1:
+ return torch.tensor(0.0, device=x.device)
+ return x.float().var() / (x.float().mean() ** 2 + eps)
+
+ def forward(self, x):
+ logits_gate = self.gate_network(x)
+ if self.training and self.add_noise:
+ noise_mm = self.weight_noise(x)
+ noise_control = self.softplus(noise_mm) + self.noise_epsilon
+ logits_noise = torch.randn_like(logits_gate) * noise_control
+ logits = logits_gate + logits_noise
+ else:
+ logits = logits_gate
+
+        top_logits, top_indices = logits.topk(min(self.num_selects + 1, self.num_experts), dim=1)  # select and sort the top (k + 1) logits
+ top_k_logits = top_logits[:, :self.num_selects]
+ top_k_indices = top_indices[:, :self.num_selects]
+ top_k_scores = self.softmax(top_k_logits.to(torch.float32)) if self.use_softmax else top_k_logits
+ top_k_scores = top_k_scores.to(logits.dtype)
+
+ zeros = torch.zeros_like(logits, requires_grad=True, device=logits.device)
+ scores_filtered = zeros.scatter(dim=1, index=top_k_indices, src=top_k_scores) # shape(batch_size, num_experts)
+ importance = scores_filtered.sum(0) # shape(num_experts)
+
+ if self.training:
+ if self.add_noise and self.num_selects != self.num_experts:
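+                # Differentiable load estimate (cf. Shazeer et al., 2017): per token,
+                # the probability under the Gaussian noise that each expert's noisy
+                # logit lands in the top k.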
+ batch_size = top_logits.size(0)
+ m = top_logits.size(1)
+ top_values_flat = top_logits.flatten()
+ threshold_positions_if_in = torch.arange(batch_size, device=x.device) * m + self.num_selects
+ threshold_if_in = torch.unsqueeze(torch.gather(top_values_flat, 0, threshold_positions_if_in), 1)
+ is_in = torch.gt(logits_noise, threshold_if_in)
+ threshold_positions_if_out = threshold_positions_if_in - 1
+ threshold_if_out = torch.unsqueeze(torch.gather(top_values_flat, 0, threshold_positions_if_out), 1)
+                # smoothed probability that each logit lands in the top k
+ prob_if_in = self.normal.cdf((logits_gate - threshold_if_in) / noise_control)
+ prob_if_out = self.normal.cdf((logits_gate - threshold_if_out) / noise_control)
+ prob = torch.where(is_in, prob_if_in, prob_if_out)
+ load = prob.sum(0)
+ else:
+ load = (scores_filtered > 0).sum(0)
+ if not self.add_noise and not self.warned:
+                    warnings.warn(
+                        'A gradient-trackable implementation of the load calculation is only available when "add_noise=True". '
+                        'Training without noise blocks the gradient through the "load" path and makes the optimization objective inconsistent.'
+                    )
+ self.warned = True
+ else:
+ load = (scores_filtered > 0).sum(0)
+
+ if self.use_balance:
+ balance_loss = self.cv_squared(importance) + self.cv_squared(load)
+ balance_loss *= self.balance_loss_weight
+ else:
+ balance_loss = torch.tensor(-100.0, device=x.device)
+
+ return {
+ "topK_indices": top_k_indices,
+ "topK_scores": top_k_scores,
+ "balance_loss": balance_loss,
+ "load": load,
+ "importance": importance,
+ }
+
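+
+def _example_noisy_gate():
+    """Illustrative sketch (not part of the upstream file) of the gate's
+    input/output contract: (num_tokens, input_size) activations in, top-k
+    expert indices/scores and a scalar balance loss out. Sizes are toy values.
+    """
+    gate = TopKBalancedNoisyGate(input_size=16, num_experts=8, num_selects=2)
+    x = torch.randn(4, 16)  # 4 tokens
+    out = gate(x)
+    assert out["topK_indices"].shape == (4, 2)
+    assert out["topK_scores"].shape == (4, 2)
+    assert out["balance_loss"].ndim == 0  # scalar auxiliary loss
+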
+
+class LinearGLUExperts(nn.Module):
+ """
+ Modified from transformers.models.llama.modeling_llama.LlamaMLP
+ """
+
+ __constants__ = [
+ "bias",
+ "in_features",
+ "hidden_features",
+ "out_features",
+ "hidden_act",
+ "num_experts",
+ "size_experts",
+ ]
+
+ def __init__(
+ self,
+ in_features,
+ hidden_features,
+ out_features,
+ hidden_act,
+ num_experts,
+ size_experts=None,
+ bias=True,
+ device=None,
+ dtype=None,
+ ):
+ factory_kwargs = {"device": device, "dtype": dtype}
+ super(LinearGLUExperts, self).__init__()
+ self.in_features = in_features
+ self.hidden_features = hidden_features
+ self.out_features = out_features
+ self.hidden_act = hidden_act
+ self.num_experts = num_experts
+
+ if size_experts is None:
+ # all experts share the same number of hidden neurons
+ assert hidden_features % num_experts == 0
+ size_per_expert = hidden_features // num_experts
+ size_experts = [size_per_expert for _ in range(num_experts)]
+ else:
+ # use specified expert sizes
+ assert (
+ len(size_experts) == num_experts
+ and sum(size_experts) == hidden_features
+ )
+ self.size_experts = size_experts
+
+ self.act_fn = ACT2FN[hidden_act]
+
+ self.weight_gate = nn.ParameterList()
+ self.weight_up = nn.ParameterList()
+ self.weight_down = nn.ParameterList()
+
+ for i in range(num_experts):
+ # this matrix will be transposed when performing linear forwarding
+ this_expert_weight_gate = nn.Parameter(
+ torch.empty((size_experts[i], in_features), **factory_kwargs)
+ )
+ # this matrix will be transposed when performing linear forwarding
+ this_expert_weight_up = nn.Parameter(
+ torch.empty((size_experts[i], in_features), **factory_kwargs)
+ )
+ # this matrix will be transposed when performing linear forwarding
+ this_expert_weight_down = nn.Parameter(
+ torch.empty((out_features, size_experts[i]), **factory_kwargs)
+ )
+ self.weight_gate.append(this_expert_weight_gate)
+ self.weight_up.append(this_expert_weight_up)
+ self.weight_down.append(this_expert_weight_down)
+
+ if bias:
+ self.bias_gate = nn.ParameterList()
+ self.bias_up = nn.ParameterList()
+ self.bias_down = nn.ParameterList()
+
+ for i in range(num_experts):
+ this_expert_bias_gate = nn.Parameter(
+ torch.empty((size_experts[i],), **factory_kwargs)
+ )
+ this_expert_bias_up = nn.Parameter(
+ torch.empty((size_experts[i],), **factory_kwargs)
+ )
+ this_expert_bias_down = nn.Parameter(
+ torch.empty((out_features,), **factory_kwargs)
+ )
+ self.bias_gate.append(this_expert_bias_gate)
+ self.bias_up.append(this_expert_bias_up)
+ self.bias_down.append(this_expert_bias_down)
+ else:
+ self.register_parameter("bias_gate", None)
+ self.register_parameter("bias_up", None)
+ self.register_parameter("bias_down", None)
+
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ for i in range(self.num_experts):
+ nn.init.kaiming_uniform_(self.weight_gate[i], a=math.sqrt(5))
+ nn.init.kaiming_uniform_(self.weight_up[i], a=math.sqrt(5))
+ nn.init.kaiming_uniform_(self.weight_down[i], a=math.sqrt(5))
+ if self.bias_gate is not None:
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight_gate[i])
+ bound = 1 / math.sqrt(fan_in)
+ nn.init.uniform_(self.bias_gate[i], -bound, bound)
+ if self.bias_up is not None:
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight_up[i])
+ bound = 1 / math.sqrt(fan_in)
+ nn.init.uniform_(self.bias_up[i], -bound, bound)
+ if self.bias_down is not None:
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight_down[i])
+ bound = 1 / math.sqrt(fan_in)
+ nn.init.uniform_(self.bias_down[i], -bound, bound)
+
+ def forward(self, input, i):
+ gate = self.act_fn(
+ F.linear(
+ input,
+ self.weight_gate[i],
+ self.bias_gate[i] if self.bias_gate is not None else None,
+ )
+ )
+ up = F.linear(
+ input,
+ self.weight_up[i],
+ self.bias_up[i] if self.bias_up is not None else None,
+ )
+ down = F.linear(
+ gate * up,
+ self.weight_down[i],
+ self.bias_down[i] if self.bias_down is not None else None,
+ )
+ return down
+
+ def extra_repr(self):
+ return (
+ "in_features={}, hidden_features={}, out_features={}, hidden_act={},"
+ " num_experts={}, size_experts={}, bias={}".format(
+ self.in_features,
+ self.hidden_features,
+ self.out_features,
+ self.hidden_act,
+ self.num_experts,
+ self.size_experts,
+ self.bias_gate is not None,
+ )
+ )
+
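+
+def _example_glu_expert():
+    """Illustrative sketch (not part of the upstream file): each expert i
+    computes a GLU MLP, y = W_down_i(act(W_gate_i @ x) * (W_up_i @ x)),
+    exactly as `LinearGLUExperts.forward` does. Sizes are toy values.
+    """
+    experts = LinearGLUExperts(
+        in_features=16,
+        hidden_features=32,
+        out_features=16,
+        hidden_act="silu",
+        num_experts=4,
+        bias=False,
+    )
+    x = torch.randn(5, 16)
+    y = experts(x, 0)  # route all 5 tokens through expert 0
+    assert y.shape == (5, 16)
+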
+
+class UniversalCalculator(nn.Module):
+ def __init__(
+ self,
+ experts: LinearGLUExperts,
+ multiply_gate_scores=True,
+ score_scale_factor=1.0,
+ add_weight_norm: bool = False,
+ ):
+ super(UniversalCalculator, self).__init__()
+ self.experts = experts
+ # TODO (zhutong): use vmap to boost the training efficiency
+ # self.experts_vmap = torch.vmap(self.experts)
+ self.multiply_gate_scores = multiply_gate_scores
+ self.score_scale_factor = score_scale_factor
+ self.num_experts = experts.num_experts
+ self.mlp_norm = None
+ if multiply_gate_scores and add_weight_norm:
+ raise NotImplementedError
+
+ def reset_experts(self):
+ self.experts.reset_parameters()
+
+ def forward(
+ self, x, topK_indices, topK_scores, expert_batch_size=None, **kwargs
+ ) -> CalculatorOutput:
+ batch_size = topK_indices.size(0) # topK_indices: (bsz*seq_len, num_selects)
+ num_selects = topK_indices.size(1)
+ topK_indices = topK_indices.flatten() # shape(batch_size*num_selects)
+ topK_scores = topK_scores.flatten() # shape(batch_size*num_selects)
+ batch_indices = torch.arange(
+ batch_size, device=topK_scores.device
+ ).repeat_interleave(num_selects)
+
+ _, index_sorted_topK_indices = topK_indices.sort(0)
+
+ sorted_topK_scores = topK_scores.index_select(0, index_sorted_topK_indices)
+ sorted_batch_indices = batch_indices.index_select(0, index_sorted_topK_indices)
+
+ if expert_batch_size is None:
+ expert_batch_size = topK_indices.bincount(
+ minlength=self.num_experts
+ ).tolist()
+
+ sorted_x = x.index_select(0, sorted_batch_indices)
+ split_x = torch.split(sorted_x, expert_batch_size, dim=0)
+
+ expert_outputs = [
+ self.experts(split_x[i], i)
+ for i in range(self.num_experts)
+ if split_x[i].shape[0] > 0
+ ]
+
+ # (bsz*seq_len*num_selects, hidden_size)
+ cat_expert_outputs = torch.cat(expert_outputs, 0)
+ output_dim = cat_expert_outputs.size(1)
+ if self.multiply_gate_scores:
+ if self.mlp_norm is None:
+ cat_expert_outputs = torch.mul(
+ cat_expert_outputs,
+ sorted_topK_scores.reshape(-1, 1) * self.score_scale_factor,
+ )
+ # cat_expert_outputs = torch.mul(cat_expert_outputs, sorted_topK_scores.reshape(-1, 1) * 1.0)
+ else:
+ cat_expert_outputs = torch.mul(
+ cat_expert_outputs, sorted_topK_scores.reshape(-1, 1)
+ )
+ cat_expert_outputs = self.mlp_norm(cat_expert_outputs)
+
+ zeros = torch.zeros(
+ (batch_size, output_dim),
+ device=cat_expert_outputs.device,
+ dtype=cat_expert_outputs.dtype,
+ )
+ y = zeros.index_add(0, sorted_batch_indices, cat_expert_outputs)
+
+ return CalculatorOutput(hidden_states=y, num_dropped_tokens=torch.tensor(-1.0))
+
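+def _example_dispatch():
+    """Illustrative sketch (not part of the upstream file) of the sort/split/
+    scatter dispatch above: each token row is duplicated once per selected
+    expert, rows are sorted by expert id so every expert sees one contiguous
+    chunk, and the weighted expert outputs are summed back per token with
+    index_add. Sizes are toy values.
+    """
+    experts = LinearGLUExperts(16, 32, 16, "silu", 4, bias=False)
+    calc = UniversalCalculator(experts)
+    x = torch.randn(6, 16)  # 6 tokens
+    topk_idx = torch.randint(0, 4, (6, 2))  # 2 experts per token
+    topk_scores = torch.full((6, 2), 0.5)
+    out = calc(x, topk_idx, topk_scores)
+    assert out.hidden_states.shape == (6, 16)
+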
+
+class BaseMoELayer(nn.Module):
+ def __init__(self):
+ super(BaseMoELayer, self).__init__()
+
+ self.gate: TopKBalancedNoisyGate
+ self.calculator: UniversalCalculator
+
+ def _create_gate(self, **kwargs):
+ self.gate_type = kwargs.get("gate_type", "TopKBalancedNoisyGate")
+
+ if self.gate_type == "TopKBalancedNoisyGate": # noisy gate
+ self.gate = TopKBalancedNoisyGate(
+ self.input_size,
+ self.num_experts,
+ self.num_selects,
+ gate_network=kwargs.get("gate_network", "mlp"),
+ use_softmax=kwargs.get("gate_use_softmax", True),
+ use_balance=kwargs.get("gate_use_balance", True),
+ balance_loss_weight=kwargs.get("gate_balance_loss_weight", 1e-2),
+ add_noise=kwargs.get("gate_add_noise", True),
+ noise_epsilon=kwargs.get("gate_noise_epsilon", 1e-2),
+ )
+ else:
+ raise NotImplementedError
+
+ def _create_calculator(self, experts, **kwargs):
+ self.calculator_type = kwargs.get("calculator_type", "UniversalCalculator")
+
+ if self.calculator_type == "UniversalCalculator": # top K calculator
+ self.calculator = UniversalCalculator(
+ experts,
+ multiply_gate_scores=kwargs.get("multiply_gate_scores", True),
+ score_scale_factor=kwargs.get("score_scale_factor", 1.0),
+ add_weight_norm=kwargs.get("add_weight_norm", False),
+ )
+ else:
+ raise NotImplementedError
+
+ def forward(self, x, attention_mask=None) -> MoEMlpOutput:
+ original_shape = x.shape[:-1]
+ x = x.reshape(-1, self.input_size)
+ flattened_mask = None
+ if attention_mask is not None and len(attention_mask.shape) == 2:
+ flattened_mask = attention_mask.flatten()
+ flattened_shape = flattened_mask.shape
+ x = x[flattened_mask.bool()]
+
+ gate_outputs: dict = self.gate(x)
+ calc_outs: CalculatorOutput = self.calculator(x, **gate_outputs)
+
+ y = calc_outs.hidden_states
+ if flattened_mask is not None:
+ y = torch.zeros(flattened_shape + (self.output_size,), dtype=x.dtype, device=x.device) # (batch_size*seq_len, output_size)
+ y[flattened_mask.bool()] = calc_outs.hidden_states # (non_padding_num, output_size)
+ y = y.reshape(original_shape + (self.output_size,))
+
+ return MoEMlpOutput(
+ hidden_states=y,
+ balance_loss=gate_outputs.get("balance_loss"),
+ num_dropped_tokens=calc_outs.num_dropped_tokens,
+ gate_load=gate_outputs.get("load", torch.tensor(-1)),
+ gate_importance=gate_outputs.get("importance", torch.tensor(-1)),
+ )
+
+ def reset_gate_network(self):
+ self.gate.reset_gate_network()
+
+ def reset_experts(self):
+ self.calculator.reset_experts()
+
+
+class LinearGLUMoELayer(BaseMoELayer):
+ def __init__(
+ self,
+ input_size,
+ hidden_size,
+ output_size,
+ hidden_act,
+ num_experts,
+ num_selects,
+ size_experts=None,
+ bias=True,
+ **kwargs,
+ ):
+ super(LinearGLUMoELayer, self).__init__()
+ assert num_selects <= num_experts
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.output_size = output_size
+ self.hidden_act = hidden_act
+ self.num_experts = num_experts
+ self.num_selects = num_selects
+ self.size_experts = size_experts
+ self.bias = bias
+
+ experts = LinearGLUExperts(
+ input_size,
+ hidden_size,
+ output_size,
+ hidden_act,
+ num_experts,
+ size_experts=size_experts,
+ bias=bias,
+ )
+
+ self._create_gate(**kwargs)
+ self._create_calculator(experts, **kwargs)
+
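+def _example_moe_layer():
+    """Illustrative sketch (not part of the upstream file): the MoE MLP wires a
+    TopKBalancedNoisyGate to a UniversalCalculator over LinearGLUExperts.
+    Hyperparameters are toy values, not the released config.
+    """
+    layer = LinearGLUMoELayer(
+        input_size=16,
+        hidden_size=32,
+        output_size=16,
+        hidden_act="silu",
+        num_experts=4,
+        num_selects=2,
+        bias=False,
+    )
+    x = torch.randn(2, 3, 16)  # (batch, seq_len, hidden)
+    out = layer(x)
+    assert out.hidden_states.shape == (2, 3, 16)
+    assert out.balance_loss is not None  # added to the LM loss during training
+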
+
+class LlamaMoEDecoderLayer(nn.Module):
+ def __init__(self, config: LlamaMoEConfig, layer_index):
+ super().__init__()
+
+ self.hidden_size = config.hidden_size
+ # self.self_attn = LlamaAttention(config=config)
+ self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_index)
+
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ gating_config = {
+ # all gates
+ "gate_type": config.gate_type,
+ "gate_network": config.gate_network,
+ "gate_use_softmax": config.gate_use_softmax,
+ "gate_use_balance": config.gate_use_balance,
+ "gate_balance_loss_weight": config.gate_balance_loss_weight,
+ "gate_add_noise": config.gate_add_noise,
+ # TopKBalancedNoisyGate
+ "gate_noise_epsilon": config.gate_noise_epsilon,
+ }
+ calculator_config = {
+ # all calculators
+ "calculator_type": config.calculator_type,
+ "multiply_gate_scores": config.multiply_gate_scores,
+ "score_scale_factor": (
+ config.score_scale_factor[layer_index]
+ if isinstance(config.score_scale_factor, list)
+ else config.score_scale_factor
+ ),
+ "add_weight_norm": config.add_weight_norm,
+ # SwitchDropTokenCalculator
+ "drop_tokens": config.drop_tokens,
+ "dropped_padding": config.dropped_padding,
+ "capacity_factor": config.capacity_factor,
+ }
+
+ self.mlp = LinearGLUMoELayer(
+ input_size=self.hidden_size,
+ hidden_size=config.intermediate_size,
+ output_size=self.hidden_size,
+ hidden_act=config.hidden_act,
+ num_experts=config.num_experts,
+ num_selects=config.num_selects,
+ size_experts=(
+ config.size_experts[layer_index]
+ if config.size_experts is not None
+ else None
+ ),
+ bias=False,
+ **gating_config,
+ **calculator_config,
+ )
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ position_ids=None,
+ past_key_value=None,
+ output_attentions=False,
+ use_cache=False,
+ ) -> tuple:
+ residual = hidden_states
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ mlp_outs: MoEMlpOutput = self.mlp(hidden_states, attention_mask=attention_mask)
+ hidden_states = residual + mlp_outs.hidden_states
+
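+        # Fixed output ordering, relied on by LlamaMoEModel.forward below:
+        # 0 hidden_states, 1 balance_loss, 2 num_dropped_tokens, 3 gate_load,
+        # 4 gate_importance, then attn weights (if output_attentions) and the
+        # present key/value cache (if use_cache).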
+ outputs = (
+ hidden_states,
+ mlp_outs.balance_loss,
+ mlp_outs.num_dropped_tokens,
+ mlp_outs.gate_load,
+ mlp_outs.gate_importance,
+ )
+ if output_attentions:
+ outputs += (self_attn_weights,)
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+class LlamaMoEPreTrainedModel(PreTrainedModel):
+ config_class = LlamaMoEConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["LlamaMoEDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+ _supports_flash_attn_2 = True
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+class LlamaMoEModel(LlamaMoEPreTrainedModel):
+ def __init__(self, config: LlamaMoEConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList(
+ [LlamaMoEDecoderLayer(config, i) for i in range(config.num_hidden_layers)]
+ )
+ self._use_sdpa = config._attn_implementation == "sdpa"
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.gradient_checkpointing = False
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ past_key_values=None,
+ inputs_embeds=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at"
+                " the same time"
+ )
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError(
+                "You have to specify either input_ids or inputs_embeds"
+ )
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ past_key_values_length = 0
+ if use_cache:
+ use_legacy_cache = not isinstance(past_key_values, Cache)
+ if use_legacy_cache:
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ if self._use_flash_attention_2:
+ # 2d mask is passed through the layers
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ elif self._use_sdpa and not output_attentions:
+            # output_attentions=True cannot be supported when using SDPA, and we fall back to
+            # the manual implementation, which requires a 4D causal mask in all cases.
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (batch_size, seq_length),
+ inputs_embeds,
+ past_key_values_length,
+ )
+ else:
+ # 4d mask is passed through the layers
+ attention_mask = _prepare_4d_causal_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+ balance_loss = 0.0
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = None
+
+ num_dropped_tokens = ()
+ gate_load = ()
+ gate_importance = ()
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ use_cache,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+ if layer_outputs[1] is not None:
+ balance_loss += layer_outputs[1]
+
+ if use_cache:
+ next_decoder_cache = layer_outputs[6 if output_attentions else 5]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[5],)
+
+ num_dropped_tokens += (layer_outputs[2],)
+ gate_load += (layer_outputs[3],)
+ gate_importance += (layer_outputs[4],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = None
+ if use_cache:
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+ if not return_dict:
+ return tuple(
+ v
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+ if v is not None
+ )
+ return BaseMoEModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ balance_loss=balance_loss,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ num_dropped_tokens=num_dropped_tokens,
+ gate_load=gate_load,
+ gate_importance=gate_importance,
+ )
+
+ def reset_gate_network(self):
+ for idx, decoder_layer in enumerate(self.layers):
+ decoder_layer.reset_gate_network()
+
+ def reset_experts(self):
+ for idx, decoder_layer in enumerate(self.layers):
+ decoder_layer.reset_experts()
+
+
+class LlamaMoEForCausalLM(LlamaMoEPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = LlamaMoEModel(config)
+ self.pretraining_tp = config.pretraining_tp
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ past_key_values=None,
+ inputs_embeds=None,
+ labels=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ **kwargs,
+ ):
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs: BaseMoEModelOutputWithPast = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs.last_hidden_state
+ logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = nn.CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+ if outputs.balance_loss is not None and outputs.balance_loss > 0:
+ loss += outputs.balance_loss
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return MoECausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ num_dropped_tokens=outputs.num_dropped_tokens,
+ balance_loss=outputs.balance_loss,
+ gate_load=outputs.gate_load,
+ gate_importance=outputs.gate_importance,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values is not None:
+ if isinstance(past_key_values, Cache):
+ cache_length = past_key_values.get_seq_length()
+ past_length = past_key_values.seen_tokens
+ max_cache_length = past_key_values.get_max_length()
+ else:
+ cache_length = past_length = past_key_values[0][0].shape[2]
+ max_cache_length = None
+
+ # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            #     some of the inputs are passed exclusively as part of the cache (e.g. when passing inputs_embeds as
+            #     input)
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than the length of input_ids, then input_ids holds all input tokens.
+            #     We can discard input_ids based on the past_length.
+ elif past_length < input_ids.shape[1]:
+ input_ids = input_ids[:, past_length:]
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+ if (
+ max_cache_length is not None
+ and attention_mask is not None
+ and cache_length + input_ids.shape[1] > max_cache_length
+ ):
+ attention_mask = attention_mask[:, -max_cache_length:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1] :]
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+ def reset_gate_network(self):
+ self.model.reset_gate_network()
+
+ def reset_experts(self):
+ self.model.reset_experts()
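+
+
+def _example_usage():
+    """Illustrative sketch (not part of the upstream file): loading this repo
+    through the `auto_map` entries in config.json. "path/to/this/repo" is a
+    placeholder, not a real checkpoint location.
+    """
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo")
+    model = AutoModelForCausalLM.from_pretrained(
+        "path/to/this/repo", trust_remote_code=True
+    )
+    inputs = tokenizer("Hello", return_tensors="pt")
+    generated = model.generate(**inputs, max_new_tokens=8)
+    print(tokenizer.decode(generated[0], skip_special_tokens=True))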
diff --git a/sampling_info/100/load.pdf b/sampling_info/100/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..6cc6eb7d5ae651404ed6b300dabd3d8c01dd3cb1
Binary files /dev/null and b/sampling_info/100/load.pdf differ
diff --git a/sampling_info/100/prob_map.pdf b/sampling_info/100/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..9e3272339d043dbca65b92362b44c3cb228ca0c6
Binary files /dev/null and b/sampling_info/100/prob_map.pdf differ
diff --git a/sampling_info/100/sim.pdf b/sampling_info/100/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..cc92d57d5032a2470e386d8b3cfac587cca60503
Binary files /dev/null and b/sampling_info/100/sim.pdf differ
diff --git a/sampling_info/1000/load.pdf b/sampling_info/1000/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..0b378bd8fb22b57cfd1391db4529ac95ec1e2292
Binary files /dev/null and b/sampling_info/1000/load.pdf differ
diff --git a/sampling_info/1000/prob_map.pdf b/sampling_info/1000/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e4d31f8a6f3f6592d1ff85a88f9629c3123c7c3f
Binary files /dev/null and b/sampling_info/1000/prob_map.pdf differ
diff --git a/sampling_info/1000/sim.pdf b/sampling_info/1000/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2ece3648860b77efcba8cdb60eee529f6c188091
Binary files /dev/null and b/sampling_info/1000/sim.pdf differ
diff --git a/sampling_info/1100/load.pdf b/sampling_info/1100/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..0e3bc65644ba960eb63153005ab196d63bc55079
Binary files /dev/null and b/sampling_info/1100/load.pdf differ
diff --git a/sampling_info/1100/prob_map.pdf b/sampling_info/1100/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..c082785ec24e7c8dcdeaefc911cdc4df39aaed9a
Binary files /dev/null and b/sampling_info/1100/prob_map.pdf differ
diff --git a/sampling_info/1100/sim.pdf b/sampling_info/1100/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..7d957e7d97b1c90f544e6098c9a2d6d12501e502
Binary files /dev/null and b/sampling_info/1100/sim.pdf differ
diff --git a/sampling_info/1200/load.pdf b/sampling_info/1200/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..7092fef87780daa99c4c27b9b51ee5472efd183d
Binary files /dev/null and b/sampling_info/1200/load.pdf differ
diff --git a/sampling_info/1200/prob_map.pdf b/sampling_info/1200/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..bfe97c1c4f66ea433004745b6fa44deddf219045
Binary files /dev/null and b/sampling_info/1200/prob_map.pdf differ
diff --git a/sampling_info/1200/sim.pdf b/sampling_info/1200/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..3ce61d170a4f24990feaa9befb5cac71f031c8be
Binary files /dev/null and b/sampling_info/1200/sim.pdf differ
diff --git a/sampling_info/1300/load.pdf b/sampling_info/1300/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5b1c9b63b145a27c90fcf9478c9b9594c67a4065
Binary files /dev/null and b/sampling_info/1300/load.pdf differ
diff --git a/sampling_info/1300/prob_map.pdf b/sampling_info/1300/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..754039ee83685d73a714dd8e6e8cb7e0972025a2
Binary files /dev/null and b/sampling_info/1300/prob_map.pdf differ
diff --git a/sampling_info/1300/sim.pdf b/sampling_info/1300/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..482a9db9ddca31346756930f033971d6342700c4
Binary files /dev/null and b/sampling_info/1300/sim.pdf differ
diff --git a/sampling_info/1400/load.pdf b/sampling_info/1400/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..d42c2a335c2ff9ea94ebc028d1c8a6bfff21a483
Binary files /dev/null and b/sampling_info/1400/load.pdf differ
diff --git a/sampling_info/1400/prob_map.pdf b/sampling_info/1400/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..65a5485fa34a7debdefcd6bf974956ce68b5abae
Binary files /dev/null and b/sampling_info/1400/prob_map.pdf differ
diff --git a/sampling_info/1400/sim.pdf b/sampling_info/1400/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e9d94d9eb05c61342526eb0231d233127b9ec22b
Binary files /dev/null and b/sampling_info/1400/sim.pdf differ
diff --git a/sampling_info/1500/load.pdf b/sampling_info/1500/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..c53bd776397dfea411c5faba63706ef290513d75
Binary files /dev/null and b/sampling_info/1500/load.pdf differ
diff --git a/sampling_info/1500/prob_map.pdf b/sampling_info/1500/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1ff8648bbd97165d588b3c92add6d7c2d067983e
Binary files /dev/null and b/sampling_info/1500/prob_map.pdf differ
diff --git a/sampling_info/1500/sim.pdf b/sampling_info/1500/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..41c96d180634f30300fdc1a88c8df689929a8d9a
Binary files /dev/null and b/sampling_info/1500/sim.pdf differ
diff --git a/sampling_info/1600/load.pdf b/sampling_info/1600/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..8aeb1b86ca976e30de8540b21ae1b5b920e191c9
Binary files /dev/null and b/sampling_info/1600/load.pdf differ
diff --git a/sampling_info/1600/prob_map.pdf b/sampling_info/1600/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..c7362ceb143796dc7fe04e8e25b8dd3b16cb003b
Binary files /dev/null and b/sampling_info/1600/prob_map.pdf differ
diff --git a/sampling_info/1600/sim.pdf b/sampling_info/1600/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..a84bb1ed29423edfbfdea67a2b930dd3859ed96d
Binary files /dev/null and b/sampling_info/1600/sim.pdf differ
diff --git a/sampling_info/1700/load.pdf b/sampling_info/1700/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..a9e2742970fb57a5eebfc752bcee141313be4e1c
Binary files /dev/null and b/sampling_info/1700/load.pdf differ
diff --git a/sampling_info/1700/prob_map.pdf b/sampling_info/1700/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..292030ce663427650c24ba48aff5cdda4dfc8a38
Binary files /dev/null and b/sampling_info/1700/prob_map.pdf differ
diff --git a/sampling_info/1700/sim.pdf b/sampling_info/1700/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..eee716d1ca601d89a65268ae3aef81d02ec13351
Binary files /dev/null and b/sampling_info/1700/sim.pdf differ
diff --git a/sampling_info/1800/load.pdf b/sampling_info/1800/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..eb809e83d770b2845aad8c5d0c0e0c9873caff04
Binary files /dev/null and b/sampling_info/1800/load.pdf differ
diff --git a/sampling_info/1800/prob_map.pdf b/sampling_info/1800/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..515a9ecebc6f8ece1e84b7ca3e100c6055f415b7
Binary files /dev/null and b/sampling_info/1800/prob_map.pdf differ
diff --git a/sampling_info/1800/sim.pdf b/sampling_info/1800/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..69435e32fb379d1983528c8b2d7dc0845f294f4f
Binary files /dev/null and b/sampling_info/1800/sim.pdf differ
diff --git a/sampling_info/1900/load.pdf b/sampling_info/1900/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..40debcd3cc5be2756ac4d19c1d34ada8879f7347
Binary files /dev/null and b/sampling_info/1900/load.pdf differ
diff --git a/sampling_info/1900/prob_map.pdf b/sampling_info/1900/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5d4abf233eb3bcf034ecb43931ef9b0155a0297b
Binary files /dev/null and b/sampling_info/1900/prob_map.pdf differ
diff --git a/sampling_info/1900/sim.pdf b/sampling_info/1900/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..ffda20f156b1997a7f0a8e3029147fcf61ac4ea4
Binary files /dev/null and b/sampling_info/1900/sim.pdf differ
diff --git a/sampling_info/200/load.pdf b/sampling_info/200/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4f8ff3b870efb490db41cb6c4cd682018bd92d80
Binary files /dev/null and b/sampling_info/200/load.pdf differ
diff --git a/sampling_info/200/prob_map.pdf b/sampling_info/200/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..07f2d9f09ee8e34417a38de6ac47e374cd494e9b
Binary files /dev/null and b/sampling_info/200/prob_map.pdf differ
diff --git a/sampling_info/200/sim.pdf b/sampling_info/200/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..dbaf5567518363f2e8af88a1dd03588863c032b3
Binary files /dev/null and b/sampling_info/200/sim.pdf differ
diff --git a/sampling_info/2000/load.pdf b/sampling_info/2000/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..437a79244da25b133ac02ab19174b6f22d98c40e
Binary files /dev/null and b/sampling_info/2000/load.pdf differ
diff --git a/sampling_info/2000/prob_map.pdf b/sampling_info/2000/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..395510eb7a42b1f0ccab0b74b00d80de6a071b28
Binary files /dev/null and b/sampling_info/2000/prob_map.pdf differ
diff --git a/sampling_info/2000/sim.pdf b/sampling_info/2000/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..b8abf711b2dda30df0766071b0f9928bddf729dc
Binary files /dev/null and b/sampling_info/2000/sim.pdf differ
diff --git a/sampling_info/300/load.pdf b/sampling_info/300/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..38101113c8187950fc7d88f3c6e96606595c32f0
Binary files /dev/null and b/sampling_info/300/load.pdf differ
diff --git a/sampling_info/300/prob_map.pdf b/sampling_info/300/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..7ec6cbb6302b5b4e3230654f70ba5ffaa7551094
Binary files /dev/null and b/sampling_info/300/prob_map.pdf differ
diff --git a/sampling_info/300/sim.pdf b/sampling_info/300/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5f5ea72c687f21e583af927e304f375a25c8c322
Binary files /dev/null and b/sampling_info/300/sim.pdf differ
diff --git a/sampling_info/400/load.pdf b/sampling_info/400/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4886e7616e7ab348e38b2cba5ff09ee1e79c0191
Binary files /dev/null and b/sampling_info/400/load.pdf differ
diff --git a/sampling_info/400/prob_map.pdf b/sampling_info/400/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..200f7b2d3698aa8af6f0b044c1dd87475c49d9b5
Binary files /dev/null and b/sampling_info/400/prob_map.pdf differ
diff --git a/sampling_info/400/sim.pdf b/sampling_info/400/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..bffb512b45ab99f08a748ad280e37e843c7b31ca
Binary files /dev/null and b/sampling_info/400/sim.pdf differ
diff --git a/sampling_info/500/load.pdf b/sampling_info/500/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..39bf8545fd5bf7d139031fdab934223e685e2540
Binary files /dev/null and b/sampling_info/500/load.pdf differ
diff --git a/sampling_info/500/prob_map.pdf b/sampling_info/500/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..447fcc083909f4050349150e1223ea4211b11c79
Binary files /dev/null and b/sampling_info/500/prob_map.pdf differ
diff --git a/sampling_info/500/sim.pdf b/sampling_info/500/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..d78538ca00f8267fcf714608508f3bd4b8877cc8
Binary files /dev/null and b/sampling_info/500/sim.pdf differ
diff --git a/sampling_info/600/load.pdf b/sampling_info/600/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..ad6442d9fa3cccc8559a9637e8157cdc22d47e45
Binary files /dev/null and b/sampling_info/600/load.pdf differ
diff --git a/sampling_info/600/prob_map.pdf b/sampling_info/600/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..c3241a0c02e0cbb40cf2cb0faa7c0486ad14c91d
Binary files /dev/null and b/sampling_info/600/prob_map.pdf differ
diff --git a/sampling_info/600/sim.pdf b/sampling_info/600/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4c2608e3e9bbd3ed91de2b68a9b42c2f417a47a2
Binary files /dev/null and b/sampling_info/600/sim.pdf differ
diff --git a/sampling_info/700/load.pdf b/sampling_info/700/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2297ac8a8f832f14c9f4704e20b8490fe1687cae
Binary files /dev/null and b/sampling_info/700/load.pdf differ
diff --git a/sampling_info/700/prob_map.pdf b/sampling_info/700/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..07163f14c62b0cbae11ce761bb52943a7821a3ff
Binary files /dev/null and b/sampling_info/700/prob_map.pdf differ
diff --git a/sampling_info/700/sim.pdf b/sampling_info/700/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..16ac462ad590f4cdc7a44a266e95326416b2a794
Binary files /dev/null and b/sampling_info/700/sim.pdf differ
diff --git a/sampling_info/800/load.pdf b/sampling_info/800/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..9b4aec4c00e04af291272b24dbb4303fb18e9fe5
Binary files /dev/null and b/sampling_info/800/load.pdf differ
diff --git a/sampling_info/800/prob_map.pdf b/sampling_info/800/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..48204b60e01768797624caf568d962f024b3d9e7
Binary files /dev/null and b/sampling_info/800/prob_map.pdf differ
diff --git a/sampling_info/800/sim.pdf b/sampling_info/800/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f4c2c79366ee3fecdcba8df83df63e3b3b71015f
Binary files /dev/null and b/sampling_info/800/sim.pdf differ
diff --git a/sampling_info/900/load.pdf b/sampling_info/900/load.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5b4723a369aff02fe3136d5ae61e03f31327261b
Binary files /dev/null and b/sampling_info/900/load.pdf differ
diff --git a/sampling_info/900/prob_map.pdf b/sampling_info/900/prob_map.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2a0c23874e999b2e77277f67984bf795ba934180
Binary files /dev/null and b/sampling_info/900/prob_map.pdf differ
diff --git a/sampling_info/900/sim.pdf b/sampling_info/900/sim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..98107fdeb60e40150db3472ca04004f8b2debc22
Binary files /dev/null and b/sampling_info/900/sim.pdf differ
diff --git a/sampling_info/data.jsonl b/sampling_info/data.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5bb1bd27578bb123f8b010a7f61becc470ce116a
--- /dev/null
+++ b/sampling_info/data.jsonl
@@ -0,0 +1,60 @@
+{"step": 100, "old_prob_map": {"code": 0.25, "sharegpt": 0.25, "orca": 0.25, "math": 0.25}, "new_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "sim": [[0.12586449411651235, 0.1258017498961425, 0.12542206677168527, 0.12547745437725044], [0.1258017498961425, 0.1271137439587301, 0.12608451435551657, 0.12595773669791455], [0.12542206677168527, 0.12608451435551657, 0.12707085949740077, 0.12682451018998525], [0.12547745437725044, 0.12595773669791455, 0.12682451018998525, 0.12667757289382245]], "name2load": {"code": [0.13847222222222222, 0.1329861111111111, 0.12631944444444446, 0.10090277777777779, 0.1232638888888889, 0.12413194444444445, 0.13052083333333334, 0.12340277777777776], "orca": [0.15915094039397892, 0.11213124144734748, 0.1264519619387073, 0.12225121726124175, 0.11131973395283709, 0.14101136110492316, 0.11813003214206154, 0.10955351175890271], "math": [0.15772125744084223, 0.10886997589413094, 0.11236286712254635, 0.10700054115216213, 0.12215280169233039, 0.11964382348600382, 0.13885472524228856, 0.13339400796969544], "sharegpt": [0.1538298559263893, 0.11250454013479154, 0.1309021752290246, 0.11805359376891722, 0.11528411154606724, 0.1396192340288147, 0.12180172726905847, 0.10800476209693692]}}
+{"step": 100, "old_prob_map": {"code": 0.25, "sharegpt": 0.25, "orca": 0.25, "math": 0.25}, "new_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "sim": [[0.12586449411651235, 0.1258017498961425, 0.12542206677168527, 0.12547745437725044], [0.1258017498961425, 0.1271137439587301, 0.12608451435551657, 0.12595773669791455], [0.12542206677168527, 0.12608451435551657, 0.12707085949740077, 0.12682451018998525], [0.12547745437725044, 0.12595773669791455, 0.12682451018998525, 0.12667757289382245]], "name2load": {"code": [0.13847222222222222, 0.1329861111111111, 0.12631944444444446, 0.10090277777777779, 0.1232638888888889, 0.12413194444444445, 0.13052083333333334, 0.12340277777777776], "orca": [0.15915094039397892, 0.11213124144734748, 0.1264519619387073, 0.12225121726124175, 0.11131973395283709, 0.14101136110492316, 0.11813003214206154, 0.10955351175890271], "math": [0.15772125744084223, 0.10886997589413094, 0.11236286712254635, 0.10700054115216213, 0.12215280169233039, 0.11964382348600382, 0.13885472524228856, 0.13339400796969544], "sharegpt": [0.1538298559263893, 0.11250454013479154, 0.1309021752290246, 0.11805359376891722, 0.11528411154606724, 0.1396192340288147, 0.12180172726905847, 0.10800476209693692]}}
+{"step": 100, "old_prob_map": {"code": 0.25, "sharegpt": 0.25, "orca": 0.25, "math": 0.25}, "new_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "sim": [[0.12586449411651235, 0.1258017498961425, 0.12542206677168527, 0.12547745437725044], [0.1258017498961425, 0.1271137439587301, 0.12608451435551657, 0.12595773669791455], [0.12542206677168527, 0.12608451435551657, 0.12707085949740077, 0.12682451018998525], [0.12547745437725044, 0.12595773669791455, 0.12682451018998525, 0.12667757289382245]], "name2load": {"code": [0.13847222222222222, 0.1329861111111111, 0.12631944444444446, 0.10090277777777779, 0.1232638888888889, 0.12413194444444445, 0.13052083333333334, 0.12340277777777776], "orca": [0.15915094039397892, 0.11213124144734748, 0.1264519619387073, 0.12225121726124175, 0.11131973395283709, 0.14101136110492316, 0.11813003214206154, 0.10955351175890271], "math": [0.15772125744084223, 0.10886997589413094, 0.11236286712254635, 0.10700054115216213, 0.12215280169233039, 0.11964382348600382, 0.13885472524228856, 0.13339400796969544], "sharegpt": [0.1538298559263893, 0.11250454013479154, 0.1309021752290246, 0.11805359376891722, 0.11528411154606724, 0.1396192340288147, 0.12180172726905847, 0.10800476209693692]}}
+{"step": 200, "old_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "new_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "sim": [[0.12557887008101853, 0.12565037163214773, 0.1255207902385726, 0.12553045151780629], [0.12565037163214773, 0.12691183050468666, 0.12602518187654757, 0.12586824087634546], [0.1255207902385726, 0.12602518187654757, 0.12701431126844212, 0.1267592077148191], [0.12553045151780629, 0.12586824087634546, 0.1267592077148191, 0.12658454559077903]], "name2load": {"code": [0.14038194444444446, 0.12777777777777777, 0.12628472222222223, 0.10847222222222222, 0.1252777777777778, 0.12333333333333332, 0.1295138888888889, 0.11895833333333335], "orca": [0.15915094039397892, 0.11076281704483977, 0.12624510708716544, 0.12474938739140119, 0.11291092511854374, 0.13927696273430287, 0.11712758170766635, 0.10977627852210163], "math": [0.1569587248487234, 0.10840261720863877, 0.10938653023072761, 0.11396172578344076, 0.12547350814188027, 0.11779898656958725, 0.1381905839523786, 0.1298273232646234], "sharegpt": [0.15361798296944992, 0.11186892126397352, 0.1299941482707131, 0.1208079422091287, 0.1149158561685298, 0.13860527059203356, 0.12090883409338553, 0.10928104443278583]}}
+{"step": 200, "old_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "new_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "sim": [[0.12557887008101853, 0.12565037163214773, 0.1255207902385726, 0.12553045151780629], [0.12565037163214773, 0.12691183050468666, 0.12602518187654757, 0.12586824087634546], [0.1255207902385726, 0.12602518187654757, 0.12701431126844212, 0.1267592077148191], [0.12553045151780629, 0.12586824087634546, 0.1267592077148191, 0.12658454559077903]], "name2load": {"code": [0.14038194444444446, 0.12777777777777777, 0.12628472222222223, 0.10847222222222222, 0.1252777777777778, 0.12333333333333332, 0.1295138888888889, 0.11895833333333335], "orca": [0.15915094039397892, 0.11076281704483977, 0.12624510708716544, 0.12474938739140119, 0.11291092511854374, 0.13927696273430287, 0.11712758170766635, 0.10977627852210163], "math": [0.1569587248487234, 0.10840261720863877, 0.10938653023072761, 0.11396172578344076, 0.12547350814188027, 0.11779898656958725, 0.1381905839523786, 0.1298273232646234], "sharegpt": [0.15361798296944992, 0.11186892126397352, 0.1299941482707131, 0.1208079422091287, 0.1149158561685298, 0.13860527059203356, 0.12090883409338553, 0.10928104443278583]}}
+{"step": 200, "old_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "new_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "sim": [[0.12557887008101853, 0.12565037163214773, 0.1255207902385726, 0.12553045151780629], [0.12565037163214773, 0.12691183050468666, 0.12602518187654757, 0.12586824087634546], [0.1255207902385726, 0.12602518187654757, 0.12701431126844212, 0.1267592077148191], [0.12553045151780629, 0.12586824087634546, 0.1267592077148191, 0.12658454559077903]], "name2load": {"code": [0.14038194444444446, 0.12777777777777777, 0.12628472222222223, 0.10847222222222222, 0.1252777777777778, 0.12333333333333332, 0.1295138888888889, 0.11895833333333335], "orca": [0.15915094039397892, 0.11076281704483977, 0.12624510708716544, 0.12474938739140119, 0.11291092511854374, 0.13927696273430287, 0.11712758170766635, 0.10977627852210163], "math": [0.1569587248487234, 0.10840261720863877, 0.10938653023072761, 0.11396172578344076, 0.12547350814188027, 0.11779898656958725, 0.1381905839523786, 0.1298273232646234], "sharegpt": [0.15361798296944992, 0.11186892126397352, 0.1299941482707131, 0.1208079422091287, 0.1149158561685298, 0.13860527059203356, 0.12090883409338553, 0.10928104443278583]}}
+{"step": 300, "old_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "new_prob_map": {"code": 0.262732059091419, "math": 0.2804335637149032, "orca": 0.23390804147258062, "sharegpt": 0.22292633572109724}, "sim": [[0.12574693769290124, 0.1257249944313256, 0.12580449354595183, 0.1257251789850052], [0.1257249944313256, 0.12691919157975104, 0.12625310413982882, 0.12598091200100756], [0.12580449354595183, 0.12625310413982882, 0.12722391119439105, 0.1268789970410317], [0.1257251789850052, 0.12598091200100756, 0.1268789970410317, 0.12665033434484935]], "name2load": {"code": [0.14305555555555555, 0.13052083333333334, 0.13197916666666668, 0.11215277777777778, 0.11579861111111112, 0.12083333333333333, 0.12854166666666667, 0.11711805555555556], "orca": [0.16174458199408076, 0.11194029850746268, 0.128297743690927, 0.12694523120007636, 0.10855106132450752, 0.1353467205550075, 0.11860738949177353, 0.10856697323616459], "math": [0.1594431052294977, 0.11096079106606975, 0.11413391056230628, 0.1136911497023663, 0.12011118217149602, 0.11592955182761842, 0.13833817090569192, 0.1273921385349535], "sharegpt": [0.15417793292707535, 0.11246418338108882, 0.13297550345050244, 0.1228005569232011, 0.11246418338108882, 0.13609810726825133, 0.12055066790427377, 0.10846886476451834]}}
+{"step": 300, "old_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "new_prob_map": {"code": 0.262732059091419, "math": 0.2804335637149032, "orca": 0.23390804147258062, "sharegpt": 0.22292633572109724}, "sim": [[0.12574693769290124, 0.1257249944313256, 0.12580449354595183, 0.1257251789850052], [0.1257249944313256, 0.12691919157975104, 0.12625310413982882, 0.12598091200100756], [0.12580449354595183, 0.12625310413982882, 0.12722391119439105, 0.1268789970410317], [0.1257251789850052, 0.12598091200100756, 0.1268789970410317, 0.12665033434484935]], "name2load": {"code": [0.14305555555555555, 0.13052083333333334, 0.13197916666666668, 0.11215277777777778, 0.11579861111111112, 0.12083333333333333, 0.12854166666666667, 0.11711805555555556], "orca": [0.16174458199408076, 0.11194029850746268, 0.128297743690927, 0.12694523120007636, 0.10855106132450752, 0.1353467205550075, 0.11860738949177353, 0.10856697323616459], "math": [0.1594431052294977, 0.11096079106606975, 0.11413391056230628, 0.1136911497023663, 0.12011118217149602, 0.11592955182761842, 0.13833817090569192, 0.1273921385349535], "sharegpt": [0.15417793292707535, 0.11246418338108882, 0.13297550345050244, 0.1228005569232011, 0.11246418338108882, 0.13609810726825133, 0.12055066790427377, 0.10846886476451834]}}
+{"step": 300, "old_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "new_prob_map": {"code": 0.262732059091419, "math": 0.2804335637149032, "orca": 0.23390804147258062, "sharegpt": 0.22292633572109724}, "sim": [[0.12574693769290124, 0.1257249944313256, 0.12580449354595183, 0.1257251789850052], [0.1257249944313256, 0.12691919157975104, 0.12625310413982882, 0.12598091200100756], [0.12580449354595183, 0.12625310413982882, 0.12722391119439105, 0.1268789970410317], [0.1257251789850052, 0.12598091200100756, 0.1268789970410317, 0.12665033434484935]], "name2load": {"code": [0.14305555555555555, 0.13052083333333334, 0.13197916666666668, 0.11215277777777778, 0.11579861111111112, 0.12083333333333333, 0.12854166666666667, 0.11711805555555556], "orca": [0.16174458199408076, 0.11194029850746268, 0.128297743690927, 0.12694523120007636, 0.10855106132450752, 0.1353467205550075, 0.11860738949177353, 0.10856697323616459], "math": [0.1594431052294977, 0.11096079106606975, 0.11413391056230628, 0.1136911497023663, 0.12011118217149602, 0.11592955182761842, 0.13833817090569192, 0.1273921385349535], "sharegpt": [0.15417793292707535, 0.11246418338108882, 0.13297550345050244, 0.1228005569232011, 0.11246418338108882, 0.13609810726825133, 0.12055066790427377, 0.10846886476451834]}}
+{"step": 400, "old_prob_map": {"code": 0.262732059091419, "math": 0.2804335637149032, "orca": 0.23390804147258062, "sharegpt": 0.22292633572109724}, "new_prob_map": {"code": 0.26592793408730336, "math": 0.28876391543048435, "orca": 0.22985923187970148, "sharegpt": 0.2154489186025108}, "sim": [[0.1256565586419753, 0.12572712965923813, 0.12552322619754816, 0.12548363978542537], [0.12572712965923813, 0.126813335133358, 0.12601790734538068, 0.1258321498287296], [0.12552322619754816, 0.12601790734538068, 0.1269305361144293, 0.12662832848484412], [0.12548363978542537, 0.1258321498287296, 0.12662832848484412, 0.12640048340498414]], "name2load": {"code": [0.14086805555555554, 0.1329861111111111, 0.12416666666666668, 0.11038194444444444, 0.12090277777777778, 0.1207986111111111, 0.1317361111111111, 0.11815972222222222], "orca": [0.15962829774369092, 0.1129268370302008, 0.12680202399516277, 0.12576774973745344, 0.11252903923877415, 0.13525124908506508, 0.11897336345988607, 0.10812143970976673], "math": [0.15784424656860335, 0.1165936931175284, 0.10643479116446106, 0.1138141388301274, 0.12399763860874699, 0.11735622570964727, 0.13629655138485758, 0.12766271461602793], "sharegpt": [0.15344142217200046, 0.11403305218128253, 0.12936357399410792, 0.12228600831349125, 0.11519835344444893, 0.13466039791759152, 0.12164534484846039, 0.10937184712861696]}}
+{"step": 500, "old_prob_map": {"code": 0.26592793408730336, "math": 0.28876391543048435, "orca": 0.22985923187970148, "sharegpt": 0.2154489186025108}, "new_prob_map": {"code": 0.2669836751180987, "math": 0.2966379337081988, "orca": 0.22764222531614908, "sharegpt": 0.20873616585755356}, "sim": [[0.1259574701003086, 0.12599376240140372, 0.1260121163682299, 0.1259192261326235], [0.12599376240140372, 0.12696372953269297, 0.12641762455469407, 0.12616699032245762], [0.1260121163682299, 0.12641762455469407, 0.12725356468217505, 0.12685254651404831], [0.1259192261326235, 0.12616699032245762, 0.12685254651404831, 0.12656536029397814]], "name2load": {"code": [0.1470138888888889, 0.12184027777777777, 0.12666666666666668, 0.10680555555555557, 0.12201388888888888, 0.12670138888888888, 0.13211805555555556, 0.11684027777777778], "orca": [0.16257200140024822, 0.11082646469146802, 0.12724755752156064, 0.12562454253253985, 0.11042866690004137, 0.13595137319797598, 0.11851191802183113, 0.10883747573433472], "math": [0.15934471392728883, 0.10412259556255228, 0.11241206277365082, 0.11809416047621392, 0.1262114429084469, 0.1197176169626605, 0.1356078122693954, 0.12448959511979142], "sharegpt": [0.15488417611687316, 0.11187396585818636, 0.12985289963275354, 0.1216705678195246, 0.11496630211065821, 0.13559869244118003, 0.12169074619637597, 0.10946264982444812]}}
+{"step": 600, "old_prob_map": {"code": 0.2669836751180987, "math": 0.2966379337081988, "orca": 0.22764222531614908, "sharegpt": 0.20873616585755356}, "new_prob_map": {"code": 0.27003083823938623, "math": 0.3010794462965474, "orca": 0.2251186490107333, "sharegpt": 0.20377106645333304}, "sim": [[0.12566743103780864, 0.12567633173310813, 0.12562479115615952, 0.12557017629595627], [0.12567633173310813, 0.1264801167028205, 0.1260440652850635, 0.12585004924121554], [0.12562479115615952, 0.1260440652850635, 0.12700713538771485, 0.12670045038512148], [0.12557017629595627, 0.12585004924121554, 0.12670045038512148, 0.12648942901261928]], "name2load": {"code": [0.14253472222222222, 0.1292013888888889, 0.12309027777777778, 0.1090625, 0.12319444444444445, 0.12371527777777779, 0.13090277777777778, 0.1182986111111111], "orca": [0.16037615759157303, 0.11216306527066161, 0.12672246443687743, 0.1264996976736785, 0.11119243865958055, 0.13502848232186615, 0.11914839448811379, 0.10886929955764886], "math": [0.1545235401190535, 0.10980469326511537, 0.11351896492350078, 0.11703645397746841, 0.12744133418605796, 0.11740542136075172, 0.13587838835046984, 0.12439120381758254], "sharegpt": [0.15372391944791963, 0.11282739416441342, 0.1299437023285847, 0.1220438677912749, 0.11535473586504702, 0.1362645788772751, 0.1203942854836757, 0.1094475160418096]}}
+{"step": 700, "old_prob_map": {"code": 0.27003083823938623, "math": 0.3010794462965474, "orca": 0.2251186490107333, "sharegpt": 0.20377106645333304}, "new_prob_map": {"code": 0.2721660599163854, "math": 0.3060484802624147, "orca": 0.22232232353028722, "sharegpt": 0.19946313629091267}, "sim": [[0.12591640866126544, 0.12617961093072705, 0.1258935858863112, 0.12586781576217093], [0.12617961093072705, 0.12723727763421092, 0.1265306236290008, 0.12636545218336875], [0.1258935858863112, 0.1265306236290008, 0.12717894129531937, 0.12692870153713742], [0.12586781576217093, 0.12636545218336875, 0.12692870153713742, 0.12677430653529523]], "name2load": {"code": [0.14517361111111113, 0.12329861111111112, 0.12114583333333333, 0.1057638888888889, 0.12111111111111111, 0.13010416666666666, 0.1328125, 0.12059027777777778], "orca": [0.16211055596219331, 0.11036501925341312, 0.12634057855710787, 0.12439932533494574, 0.11015816440187125, 0.1370492951023136, 0.11717531744263757, 0.11240174394551762], "math": [0.1616323117036454, 0.10572145422344664, 0.1116003345304275, 0.11113297584493531, 0.12362867122546367, 0.12348108427215033, 0.1371328774536331, 0.125670290746298], "sharegpt": [0.156226038177489, 0.10993684168045523, 0.12972174018321966, 0.12017232333831065, 0.11424492513822186, 0.13864058275152347, 0.12030348278784456, 0.11075406594293555]}}
+{"step": 800, "old_prob_map": {"code": 0.2721660599163854, "math": 0.3060484802624147, "orca": 0.22232232353028722, "sharegpt": 0.19946313629091267}, "new_prob_map": {"code": 0.2714569825529402, "math": 0.31164467459228024, "orca": 0.22043752359675148, "sharegpt": 0.19646081925802802}, "sim": [[0.1257888744212963, 0.12608671741334734, 0.12586396486826704, 0.12578446662944875], [0.12608671741334734, 0.12703465505802583, 0.1263656558457333, 0.12616774116713667], [0.12586396486826704, 0.1263656558457333, 0.12707414791525715, 0.12679080694508663], [0.12578446662944875, 0.12616774116713667, 0.12679080694508663, 0.1266166860219692]], "name2load": {"code": [0.14774305555555556, 0.12232638888888889, 0.12312500000000001, 0.11069444444444444, 0.12357638888888887, 0.12385416666666667, 0.12944444444444445, 0.1192361111111111], "orca": [0.16113992935111224, 0.11014225249021417, 0.12837730324921234, 0.1275021481080737, 0.11046049072335551, 0.13389873659421442, 0.11617286700824236, 0.1123062724755752], "math": [0.1604516160771388, 0.10520489988685001, 0.11302700841245635, 0.11408471491120187, 0.12601466030402914, 0.12092291041471935, 0.13528804053721652, 0.12500614945638808], "sharegpt": [0.1546823923483595, 0.10977541466564429, 0.13218854675329916, 0.12226582993663988, 0.1145879575446951, 0.1362393559062109, 0.11788207756568062, 0.1123784252794705]}}
+{"step": 900, "old_prob_map": {"code": 0.2714569825529402, "math": 0.31164467459228024, "orca": 0.22043752359675148, "sharegpt": 0.19646081925802802}, "new_prob_map": {"code": 0.2723148904404529, "math": 0.31564146378610697, "orca": 0.21837309802996122, "sharegpt": 0.1936705477434788}, "sim": [[0.12570747251157408, 0.12585408006182255, 0.12568240001131514, 0.12568915602817798], [0.12585408006182255, 0.12659409417087344, 0.1260897289567614, 0.12595491163084996], [0.12568240001131514, 0.1260897289567614, 0.12685691586212447, 0.1266720302140431], [0.12568915602817798, 0.12595491163084996, 0.1266720302140431, 0.12656430150737044]], "name2load": {"code": [0.14270833333333333, 0.12319444444444445, 0.12736111111111112, 0.10944444444444444, 0.12482638888888889, 0.12458333333333334, 0.1323263888888889, 0.11555555555555556], "orca": [0.1588327021608376, 0.11092193616141043, 0.128297743690927, 0.12584730929573879, 0.11176526747923494, 0.1350125704102091, 0.1172389650892658, 0.11208350571237627], "math": [0.1558272248733212, 0.11076400846165199, 0.11388793230678408, 0.11187091061150195, 0.126113051606238, 0.12212820386677817, 0.13646873616372313, 0.12293993211000148], "sharegpt": [0.15380463295532507, 0.11132410508898664, 0.13251140078292103, 0.1214637394567981, 0.1148200088784858, 0.13663787884902537, 0.11847229508858308, 0.11096593889987488]}}
+{"step": 1000, "old_prob_map": {"code": 0.2723148904404529, "math": 0.31564146378610697, "orca": 0.21837309802996122, "sharegpt": 0.1936705477434788}, "new_prob_map": {"code": 0.2722841717572351, "math": 0.3194814726996295, "orca": 0.21690109310035738, "sharegpt": 0.191333262442778}, "sim": [[0.1257504653742284, 0.12595864815133678, 0.12580714111125255, 0.12576848017026065], [0.12595864815133678, 0.1268039313992917, 0.12628966584110354, 0.12609522044107874], [0.12580714111125255, 0.12628966584110354, 0.12696985787479292, 0.12670862298735203], [0.12576848017026065, 0.12609522044107874, 0.12670862298735203, 0.12656973927211368]], "name2load": {"code": [0.14548611111111112, 0.12215277777777778, 0.12652777777777777, 0.11104166666666668, 0.11947916666666668, 0.12413194444444445, 0.13243055555555555, 0.11875], "orca": [0.15951691436209145, 0.10928300926073257, 0.12837730324921234, 0.12732711707984598, 0.11020590013684244, 0.13491709894026668, 0.1167297839162397, 0.1136428730547688], "math": [0.1587297682884833, 0.10852560633639986, 0.11265804102917303, 0.11573276922320068, 0.1213410734491071, 0.12175923648349488, 0.13651793181482758, 0.12473557337531364], "sharegpt": [0.15346160054885183, 0.1094122038823197, 0.1319514508252956, 0.12137293676096694, 0.11503188183542516, 0.13752068283627264, 0.11903728964042132, 0.11221195367044674]}}
+{"step": 1100, "old_prob_map": {"code": 0.2722841717572351, "math": 0.3194814726996295, "orca": 0.21690109310035738, "sharegpt": 0.191333262442778}, "new_prob_map": {"code": 0.2736675593567714, "math": 0.3204557297882405, "orca": 0.21683951539994323, "sharegpt": 0.18903719545504494}, "sim": [[0.12571141493055557, 0.12577790367081548, 0.12564886676249173, 0.1256420827826318], [0.12577790367081548, 0.12634154987995758, 0.12599165116469266, 0.12587112686965984], [0.12564886676249173, 0.12599165116469266, 0.1268617978511225, 0.1266110307265288], [0.1256420827826318, 0.12587112686965984, 0.1266110307265288, 0.12645934080435833]], "name2load": {"code": [0.1420486111111111, 0.12375, 0.12586805555555555, 0.11027777777777778, 0.1209375, 0.1255902777777778, 0.13534722222222223, 0.11618055555555556], "orca": [0.158450816281068, 0.11044457881169843, 0.1288864844222385, 0.12737485281481717, 0.11128791012952295, 0.13451930114884, 0.11695255067943862, 0.11208350571237627], "math": [0.15245732277266685, 0.10923894327741426, 0.11674128007084172, 0.11378954100457518, 0.12471097554976139, 0.12257096472671815, 0.13673931224479755, 0.12375166035322477], "sharegpt": [0.15234674522781388, 0.11115258888575005, 0.13157310625933252, 0.12223556237136286, 0.11503692642963802, 0.13709693692239397, 0.11942067880059731, 0.11113745510311152]}}
+{"step": 1200, "old_prob_map": {"code": 0.2736675593567714, "math": 0.3204557297882405, "orca": 0.21683951539994323, "sharegpt": 0.18903719545504494}, "new_prob_map": {"code": 0.27580974388598356, "math": 0.3222952911181522, "orca": 0.21566611167128363, "sharegpt": 0.1862288533245807}, "sim": [[0.12588290171682098, 0.1261328801867248, 0.1257449084092685, 0.1257601303158813], [0.1261328801867248, 0.12701436399988345, 0.12633952452114355, 0.12623899298926336], [0.1257449084092685, 0.12633952452114355, 0.126932493771256, 0.12671202848577792], [0.1257601303158813, 0.12623899298926336, 0.12671202848577792, 0.12658558590218905]], "name2load": {"code": [0.14434027777777778, 0.12267361111111111, 0.12274305555555555, 0.10788194444444445, 0.12180555555555556, 0.1282986111111111, 0.13565972222222222, 0.11659722222222223], "orca": [0.15932597142220664, 0.10999904528530056, 0.129013779715495, 0.1276453553129873, 0.10996722146198644, 0.13369188174267257, 0.11758902714572128, 0.11276771791363015], "math": [0.15981207261278105, 0.10670536724553549, 0.11440448664338072, 0.11076400846165199, 0.12382545382988143, 0.12453879077089586, 0.13710827962808086, 0.12284154080779258], "sharegpt": [0.15416784373864967, 0.10968461196981318, 0.13148230356350138, 0.12223051777715002, 0.11419447919609348, 0.13658238831268416, 0.11979397877234756, 0.1118638766697607]}}
+{"step": 1300, "old_prob_map": {"code": 0.27580974388598356, "math": 0.3222952911181522, "orca": 0.21566611167128363, "sharegpt": 0.1862288533245807}, "new_prob_map": {"code": 0.27666332592621296, "math": 0.32285034883570185, "orca": 0.2155287691198032, "sharegpt": 0.184957556118282}, "sim": [[0.12582070071373458, 0.12591443526808896, 0.1257385115997836, 0.12572414519351066], [0.12591443526808896, 0.1264367622335894, 0.12607809581270985, 0.12593663363183027], [0.1257385115997836, 0.12607809581270985, 0.1268740739697077, 0.12664481223383642], [0.12572414519351066, 0.12593663363183027, 0.12664481223383642, 0.12650471527925894]], "name2load": {"code": [0.14350694444444445, 0.12170138888888889, 0.12128472222222222, 0.10913194444444445, 0.12020833333333333, 0.13041666666666668, 0.1353125, 0.1184375], "orca": [0.1590713808356936, 0.11033319543009897, 0.1284886866308118, 0.128059065016071, 0.11139929351112242, 0.13307131718804696, 0.11777997008560609, 0.11179709130254908], "math": [0.15393319230580013, 0.11059182368278643, 0.11189550843705415, 0.1153146061888129, 0.12343188862104588, 0.12493235597973137, 0.13607517095488758, 0.12382545382988142], "sharegpt": [0.15327999515718954, 0.11059263892812463, 0.13096271035957865, 0.12303765285120465, 0.1143205940514145, 0.13666814641430242, 0.1194358125832358, 0.11170244965494976]}}
+{"step": 1400, "old_prob_map": {"code": 0.27666332592621296, "math": 0.32285034883570185, "orca": 0.2155287691198032, "sharegpt": 0.184957556118282}, "new_prob_map": {"code": 0.2779217751998221, "math": 0.3233091416247052, "orca": 0.21481783112919145, "sharegpt": 0.18395125204628138}, "sim": [[0.1258207561728395, 0.12601849431926887, 0.1256774280914192, 0.12570311536622633], [0.12601849431926887, 0.12665956090784278, 0.1261289811308243, 0.12604632036294167], [0.1256774280914192, 0.1261289811308243, 0.12685240757999092, 0.126695717235162], [0.12570311536622633, 0.12604632036294167, 0.126695717235162, 0.12662251517588774]], "name2load": {"code": [0.14270833333333333, 0.11940972222222221, 0.1225, 0.1076388888888889, 0.12232638888888889, 0.1287847222222222, 0.13621527777777778, 0.12041666666666667], "orca": [0.1590236451007224, 0.10963307131718804, 0.12936384177195048, 0.12710435031664702, 0.11149476498106482, 0.13260987174999203, 0.11762085096903542, 0.11314960379339974], "math": [0.15526147488562014, 0.10781226939538546, 0.11310080188911302, 0.11105918236827865, 0.12473557337531364, 0.12606385595513359, 0.13543562749052984, 0.12653121464062578], "sharegpt": [0.15414766536179828, 0.10903385931635659, 0.1319363170426571, 0.12240203398038661, 0.11416421163081641, 0.13701622341498848, 0.11928951935106338, 0.11201016990193309]}}
+{"step": 1500, "old_prob_map": {"code": 0.2779217751998221, "math": 0.3233091416247052, "orca": 0.21481783112919145, "sharegpt": 0.18395125204628138}, "new_prob_map": {"code": 0.2794000506707779, "math": 0.32403823816235056, "orca": 0.2141558894834887, "sharegpt": 0.18240582168338285}, "sim": [[0.12583274016203705, 0.12605009740055645, 0.12582082701276845, 0.12585137441370606], [0.12605009740055645, 0.1267408267893199, 0.12630800638000103, 0.12623558657906186], [0.12582082701276845, 0.12630800638000103, 0.12691902867744342, 0.1267594364818609], [0.12585137441370606, 0.12623558657906186, 0.1267594364818609, 0.1266761751407769]], "name2load": {"code": [0.14378472222222222, 0.1182986111111111, 0.12659722222222222, 0.10895833333333334, 0.12434027777777779, 0.12802083333333333, 0.13409722222222223, 0.11590277777777779], "orca": [0.1595010024504344, 0.10934665690736085, 0.1295547847118353, 0.12653152149699265, 0.1111765267479235, 0.1331986124813035, 0.11816185596537568, 0.11252903923877415], "math": [0.15666355094209672, 0.10655778029222217, 0.11602794312982732, 0.11118217149603976, 0.12367786687656812, 0.127219953756088, 0.1357553992227087, 0.12291533428444926], "sharegpt": [0.15482868558053192, 0.10909943904112354, 0.13166390895516367, 0.12214980426974456, 0.11431050486298884, 0.13694559909600873, 0.12017232333831067, 0.1108297348561282]}}
+{"step": 1600, "old_prob_map": {"code": 0.2794000506707779, "math": 0.32403823816235056, "orca": 0.2141558894834887, "sharegpt": 0.18240582168338285}, "new_prob_map": {"code": 0.2804732627789791, "math": 0.3241530780111496, "orca": 0.21407924627956532, "sharegpt": 0.18129441293030601}, "sim": [[0.12593104021990742, 0.12613996914339437, 0.12578566997546028, 0.12582446745900427], [0.12613996914339437, 0.1267514866134596, 0.1262219844422851, 0.12617636192126522], [0.12578566997546028, 0.1262219844422851, 0.12686584735491024, 0.12671370049191125], [0.12582446745900427, 0.12617636192126522, 0.12671370049191125, 0.1266354352939243]], "name2load": {"code": [0.14371527777777776, 0.11833333333333333, 0.12493055555555556, 0.10666666666666666, 0.12430555555555556, 0.12975694444444444, 0.1354861111111111, 0.11680555555555555], "orca": [0.15848264010438212, 0.10853514941285046, 0.1307640899977723, 0.12614963561722303, 0.11184482703752027, 0.13342137924450242, 0.11773223435063487, 0.11307004423511441], "math": [0.15486790967678457, 0.10518030206129778, 0.11784818222069168, 0.10872238894081762, 0.1250799429330447, 0.12795788852265458, 0.13681310572145422, 0.12353027992325477], "sharegpt": [0.15386012349166633, 0.10845373098187981, 0.13279894265305298, 0.12157472052948061, 0.11488558860325275, 0.13725331934299206, 0.1192289842205093, 0.11194459017716614]}}
+{"step": 1700, "old_prob_map": {"code": 0.2804732627789791, "math": 0.3241530780111496, "orca": 0.21407924627956532, "sharegpt": 0.18129441293030601}, "new_prob_map": {"code": 0.2813282261479211, "math": 0.32312641954300747, "orca": 0.21468698073291553, "sharegpt": 0.180858373576156}, "sim": [[0.1259065369405864, 0.126133320897766, 0.12580441674887824, 0.12582797783097024], [0.126133320897766, 0.12669842951396948, 0.126244217829939, 0.12618712353198738], [0.12580441674887824, 0.126244217829939, 0.1268750365940294, 0.12670858269224505], [0.12582797783097024, 0.12618712353198738, 0.12670858269224505, 0.1266261001785833]], "name2load": {"code": [0.14402777777777778, 0.11847222222222221, 0.12458333333333334, 0.10750000000000001, 0.12350694444444445, 0.1295138888888889, 0.13541666666666666, 0.11697916666666666], "orca": [0.15884861407249468, 0.10874200426439233, 0.1294593132418929, 0.12602234032396653, 0.11192438659580563, 0.1341055914457563, 0.11741399611749356, 0.11348375393819815], "math": [0.1546465292468146, 0.10621341073449109, 0.11816795395287057, 0.1085502041619521, 0.12471097554976142, 0.12891720371919124, 0.1362965513848576, 0.1224971712500615], "sharegpt": [0.15373400863634532, 0.10872613906937328, 0.13202711973848824, 0.121403204326244, 0.11506214940070222, 0.13804027604019536, 0.11897675450986725, 0.11203034827878448]}}
+{"step": 1800, "old_prob_map": {"code": 0.2813282261479211, "math": 0.32312641954300747, "orca": 0.21468698073291553, "sharegpt": 0.180858373576156}, "new_prob_map": {"code": 0.282973176307897, "math": 0.3216829267748556, "orca": 0.21494254071910104, "sharegpt": 0.18040135619814648}, "sim": [[0.1259092978395062, 0.1261164543054394, 0.12576030318379672, 0.1258058630656111], [0.1261164543054394, 0.1266795881628676, 0.12622637671543455, 0.12617597886763374], [0.12576030318379672, 0.12622637671543455, 0.12683592295496837, 0.126692137071093], [0.1258058630656111, 0.12617597886763374, 0.126692137071093, 0.12662839604000165]], "name2load": {"code": [0.14326388888888889, 0.11875, 0.1238888888888889, 0.10666666666666666, 0.12399305555555556, 0.1305902777777778, 0.13545138888888889, 0.11739583333333334], "orca": [0.15853037583935334, 0.108662444706107, 0.1288864844222385, 0.12602234032396653, 0.11216306527066162, 0.13428062247398403, 0.11741399611749356, 0.11404067084619549], "math": [0.1546465292468146, 0.10660697594332662, 0.11784818222069171, 0.10827962808087765, 0.12463718207310474, 0.1288680080680868, 0.13568160574605206, 0.12343188862104591], "sharegpt": [0.1538954356511562, 0.10891278905524839, 0.13152770491141694, 0.12142338270309536, 0.11505710480648937, 0.13823197062028328, 0.11893135316195165, 0.11202025909035876]}}
+{"step": 1800, "old_prob_map": {"code": 0.2813282261479211, "math": 0.32312641954300747, "orca": 0.21468698073291553, "sharegpt": 0.180858373576156}, "new_prob_map": {"code": 0.282973176307897, "math": 0.3216829267748556, "orca": 0.21494254071910104, "sharegpt": 0.18040135619814648}, "sim": [[0.1259092978395062, 0.1261164543054394, 0.12576030318379672, 0.1258058630656111], [0.1261164543054394, 0.1266795881628676, 0.12622637671543455, 0.12617597886763374], [0.12576030318379672, 0.12622637671543455, 0.12683592295496837, 0.126692137071093], [0.1258058630656111, 0.12617597886763374, 0.126692137071093, 0.12662839604000165]], "name2load": {"code": [0.14326388888888889, 0.11875, 0.1238888888888889, 0.10666666666666666, 0.12399305555555556, 0.1305902777777778, 0.13545138888888889, 0.11739583333333334], "orca": [0.15853037583935334, 0.108662444706107, 0.1288864844222385, 0.12602234032396653, 0.11216306527066162, 0.13428062247398403, 0.11741399611749356, 0.11404067084619549], "math": [0.1546465292468146, 0.10660697594332662, 0.11784818222069171, 0.10827962808087765, 0.12463718207310474, 0.1288680080680868, 0.13568160574605206, 0.12343188862104591], "sharegpt": [0.1538954356511562, 0.10891278905524839, 0.13152770491141694, 0.12142338270309536, 0.11505710480648937, 0.13823197062028328, 0.11893135316195165, 0.11202025909035876]}}
+{"step": 1800, "old_prob_map": {"code": 0.2813282261479211, "math": 0.32312641954300747, "orca": 0.21468698073291553, "sharegpt": 0.180858373576156}, "new_prob_map": {"code": 0.282973176307897, "math": 0.3216829267748556, "orca": 0.21494254071910104, "sharegpt": 0.18040135619814648}, "sim": [[0.1259092978395062, 0.1261164543054394, 0.12576030318379672, 0.1258058630656111], [0.1261164543054394, 0.1266795881628676, 0.12622637671543455, 0.12617597886763374], [0.12576030318379672, 0.12622637671543455, 0.12683592295496837, 0.126692137071093], [0.1258058630656111, 0.12617597886763374, 0.126692137071093, 0.12662839604000165]], "name2load": {"code": [0.14326388888888889, 0.11875, 0.1238888888888889, 0.10666666666666666, 0.12399305555555556, 0.1305902777777778, 0.13545138888888889, 0.11739583333333334], "orca": [0.15853037583935334, 0.108662444706107, 0.1288864844222385, 0.12602234032396653, 0.11216306527066162, 0.13428062247398403, 0.11741399611749356, 0.11404067084619549], "math": [0.1546465292468146, 0.10660697594332662, 0.11784818222069171, 0.10827962808087765, 0.12463718207310474, 0.1288680080680868, 0.13568160574605206, 0.12343188862104591], "sharegpt": [0.1538954356511562, 0.10891278905524839, 0.13152770491141694, 0.12142338270309536, 0.11505710480648937, 0.13823197062028328, 0.11893135316195165, 0.11202025909035876]}}
+{"step": 1900, "old_prob_map": {"code": 0.282973176307897, "math": 0.3216829267748556, "orca": 0.21494254071910104, "sharegpt": 0.18040135619814648}, "new_prob_map": {"code": 0.28506630267298727, "math": 0.32022976576466383, "orca": 0.21494599194174854, "sharegpt": 0.17975793962060052}, "sim": [[0.1259095100308642, 0.12610713531673798, 0.12573655355065466, 0.12578400070512216], [0.12610713531673798, 0.12669060012786607, 0.12622065525322654, 0.12617461950497313], [0.12573655355065466, 0.12622065525322654, 0.12681874560502623, 0.12667662506027874], [0.12578400070512216, 0.12617461950497313, 0.12667662506027874, 0.12661365756551973]], "name2load": {"code": [0.14274305555555555, 0.11902777777777779, 0.1240625, 0.10652777777777778, 0.12375, 0.130625, 0.13607638888888887, 0.1171875], "orca": [0.15827578525284028, 0.10906024249753364, 0.1293956655952646, 0.12605416414728066, 0.11170161983260668, 0.13401011997581389, 0.11773223435063487, 0.11377016834802532], "math": [0.1548925075023368, 0.10665617159443104, 0.11755300831406501, 0.10862399763860874, 0.12451419294534362, 0.12881881241698234, 0.13587838835046978, 0.12306292123776257], "sharegpt": [0.15360789378102424, 0.1088421647362686, 0.13162859679567376, 0.12163021106582185, 0.11487045482061423, 0.13820170305500626, 0.11918862746680656, 0.11203034827878446]}}
+{"step": 1900, "old_prob_map": {"code": 0.282973176307897, "math": 0.3216829267748556, "orca": 0.21494254071910104, "sharegpt": 0.18040135619814648}, "new_prob_map": {"code": 0.28506630267298727, "math": 0.32022976576466383, "orca": 0.21494599194174854, "sharegpt": 0.17975793962060052}, "sim": [[0.1259095100308642, 0.12610713531673798, 0.12573655355065466, 0.12578400070512216], [0.12610713531673798, 0.12669060012786607, 0.12622065525322654, 0.12617461950497313], [0.12573655355065466, 0.12622065525322654, 0.12681874560502623, 0.12667662506027874], [0.12578400070512216, 0.12617461950497313, 0.12667662506027874, 0.12661365756551973]], "name2load": {"code": [0.14274305555555555, 0.11902777777777779, 0.1240625, 0.10652777777777778, 0.12375, 0.130625, 0.13607638888888887, 0.1171875], "orca": [0.15827578525284028, 0.10906024249753364, 0.1293956655952646, 0.12605416414728066, 0.11170161983260668, 0.13401011997581389, 0.11773223435063487, 0.11377016834802532], "math": [0.1548925075023368, 0.10665617159443104, 0.11755300831406501, 0.10862399763860874, 0.12451419294534362, 0.12881881241698234, 0.13587838835046978, 0.12306292123776257], "sharegpt": [0.15360789378102424, 0.1088421647362686, 0.13162859679567376, 0.12163021106582185, 0.11487045482061423, 0.13820170305500626, 0.11918862746680656, 0.11203034827878446]}}
+{"step": 1900, "old_prob_map": {"code": 0.282973176307897, "math": 0.3216829267748556, "orca": 0.21494254071910104, "sharegpt": 0.18040135619814648}, "new_prob_map": {"code": 0.28506630267298727, "math": 0.32022976576466383, "orca": 0.21494599194174854, "sharegpt": 0.17975793962060052}, "sim": [[0.1259095100308642, 0.12610713531673798, 0.12573655355065466, 0.12578400070512216], [0.12610713531673798, 0.12669060012786607, 0.12622065525322654, 0.12617461950497313], [0.12573655355065466, 0.12622065525322654, 0.12681874560502623, 0.12667662506027874], [0.12578400070512216, 0.12617461950497313, 0.12667662506027874, 0.12661365756551973]], "name2load": {"code": [0.14274305555555555, 0.11902777777777779, 0.1240625, 0.10652777777777778, 0.12375, 0.130625, 0.13607638888888887, 0.1171875], "orca": [0.15827578525284028, 0.10906024249753364, 0.1293956655952646, 0.12605416414728066, 0.11170161983260668, 0.13401011997581389, 0.11773223435063487, 0.11377016834802532], "math": [0.1548925075023368, 0.10665617159443104, 0.11755300831406501, 0.10862399763860874, 0.12451419294534362, 0.12881881241698234, 0.13587838835046978, 0.12306292123776257], "sharegpt": [0.15360789378102424, 0.1088421647362686, 0.13162859679567376, 0.12163021106582185, 0.11487045482061423, 0.13820170305500626, 0.11918862746680656, 0.11203034827878446]}}
+{"step": 2000, "old_prob_map": {"code": 0.28506630267298727, "math": 0.32022976576466383, "orca": 0.21494599194174854, "sharegpt": 0.17975793962060052}, "new_prob_map": {"code": 0.2867481424046919, "math": 0.31854989012606805, "orca": 0.2152416645459714, "sharegpt": 0.1794603029232688}, "sim": [[0.12588450038580248, 0.12610842670257952, 0.12573015618867284, 0.12577819013832278], [0.12610842670257952, 0.12667020379049795, 0.12621567588243646, 0.1261686910458374], [0.12573015618867284, 0.12621567588243646, 0.12683392023051163, 0.12669522454319626], [0.12577819013832278, 0.1261686910458374, 0.12669522454319626, 0.12663247442717893]], "name2load": {"code": [0.1425, 0.11784722222222221, 0.12364583333333334, 0.10694444444444444, 0.12427083333333333, 0.1305902777777778, 0.13600694444444444, 0.11819444444444444], "orca": [0.15818031378289787, 0.10869426852942113, 0.1293956655952646, 0.1262769309104796, 0.11186073894917735, 0.13436018203226935, 0.11782770582057728, 0.11340419437991281], "math": [0.15474492054902347, 0.10672996507108772, 0.11779898656958725, 0.10882078024302652, 0.12458798642200031, 0.12842524720814683, 0.13602597530378316, 0.12286613863334482], "sharegpt": [0.15376427620162234, 0.10894810121473827, 0.13194640623108275, 0.12141833810888251, 0.11465858186367489, 0.13821683683764477, 0.11906755720569838, 0.11197990233665604]}}
+{"step": 2000, "old_prob_map": {"code": 0.28506630267298727, "math": 0.32022976576466383, "orca": 0.21494599194174854, "sharegpt": 0.17975793962060052}, "new_prob_map": {"code": 0.2867481424046919, "math": 0.31854989012606805, "orca": 0.2152416645459714, "sharegpt": 0.1794603029232688}, "sim": [[0.12588450038580248, 0.12610842670257952, 0.12573015618867284, 0.12577819013832278], [0.12610842670257952, 0.12667020379049795, 0.12621567588243646, 0.1261686910458374], [0.12573015618867284, 0.12621567588243646, 0.12683392023051163, 0.12669522454319626], [0.12577819013832278, 0.1261686910458374, 0.12669522454319626, 0.12663247442717893]], "name2load": {"code": [0.1425, 0.11784722222222221, 0.12364583333333334, 0.10694444444444444, 0.12427083333333333, 0.1305902777777778, 0.13600694444444444, 0.11819444444444444], "orca": [0.15818031378289787, 0.10869426852942113, 0.1293956655952646, 0.1262769309104796, 0.11186073894917735, 0.13436018203226935, 0.11782770582057728, 0.11340419437991281], "math": [0.15474492054902347, 0.10672996507108772, 0.11779898656958725, 0.10882078024302652, 0.12458798642200031, 0.12842524720814683, 0.13602597530378316, 0.12286613863334482], "sharegpt": [0.15376427620162234, 0.10894810121473827, 0.13194640623108275, 0.12141833810888251, 0.11465858186367489, 0.13821683683764477, 0.11906755720569838, 0.11197990233665604]}}
+{"step": 2000, "old_prob_map": {"code": 0.28506630267298727, "math": 0.32022976576466383, "orca": 0.21494599194174854, "sharegpt": 0.17975793962060052}, "new_prob_map": {"code": 0.2867481424046919, "math": 0.31854989012606805, "orca": 0.2152416645459714, "sharegpt": 0.1794603029232688}, "sim": [[0.12588450038580248, 0.12610842670257952, 0.12573015618867284, 0.12577819013832278], [0.12610842670257952, 0.12667020379049795, 0.12621567588243646, 0.1261686910458374], [0.12573015618867284, 0.12621567588243646, 0.12683392023051163, 0.12669522454319626], [0.12577819013832278, 0.1261686910458374, 0.12669522454319626, 0.12663247442717893]], "name2load": {"code": [0.1425, 0.11784722222222221, 0.12364583333333334, 0.10694444444444444, 0.12427083333333333, 0.1305902777777778, 0.13600694444444444, 0.11819444444444444], "orca": [0.15818031378289787, 0.10869426852942113, 0.1293956655952646, 0.1262769309104796, 0.11186073894917735, 0.13436018203226935, 0.11782770582057728, 0.11340419437991281], "math": [0.15474492054902347, 0.10672996507108772, 0.11779898656958725, 0.10882078024302652, 0.12458798642200031, 0.12842524720814683, 0.13602597530378316, 0.12286613863334482], "sharegpt": [0.15376427620162234, 0.10894810121473827, 0.13194640623108275, 0.12141833810888251, 0.11465858186367489, 0.13821683683764477, 0.11906755720569838, 0.11197990233665604]}}
diff --git a/sbatch.sh b/sbatch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b6d768922ddc37ba312c356013fbbb2af334bfda
--- /dev/null
+++ b/sbatch.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/bash
+
+#SBATCH --job-name=moe_sft
+#SBATCH --output=logs/%x-%j.log
+#SBATCH --error=logs/%x-%j.log
+
+#SBATCH --partition=MoE
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=64G
+
+#SBATCH --nodes=1
+#SBATCH --gres=gpu:4
+#SBATCH --quotatype=auto
+
+export WANDB_PROJECT="adaptive-moe-sft"
+num_gpus=4
+
+{
+ task_name="llama_moe_four_mix_freeze_gate_100"
+ model_type="auto"
+ model_name_or_path="/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new"
+ dataset_dir_or_path="data/four_types_mix/train"
+ eval_data_dir="data/four_types_mix/dev"
+
+ comment=$task_name
+ base_dir="outputs/dynamic_eval_interval_20"
+ output_dir="${base_dir}/${task_name}/$SLURM_JOB_NAME-$SLURM_JOB_ID"
+ mkdir -p $output_dir
+ scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
+ git diff > $output_dir/diff.patch
+ env > $output_dir/env
+ echo -e "Job ID: ${SLURM_JOB_ID}\n\nGit commit: $(git log -1 --oneline)\n\nGit branch: $(git branch | grep "*")\n\nComment: ${comment}" > $output_dir/comment.txt
+ echo "$SLURM_JOB_ID" > $base_dir/latest.jobid
+ ln -snf $output_dir $base_dir/latest.dir
+ ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log
+
+    nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST))
+    nodes_array=("${nodes[@]}")
+ head_node=${nodes_array[0]}
+ echo "Node: $head_node"
+
+ torchrun \
+ --nnodes 1 \
+ --nproc_per_node $num_gpus \
+ --node_rank $SLURM_NODEID \
+ --rdzv_id $RANDOM \
+ --rdzv_backend c10d \
+ --rdzv_endpoint $head_node:29522 \
+ -m src.core.train \
+ --do_train \
+ --do_eval \
+ --freeze_gate True \
+ --eval_data_dir $eval_data_dir \
+ --evaluation_strategy steps \
+ --eval_steps 100 \
+ --max_eval_steps 5 \
+ --dynamic_sampling_criterion mean \
+ --run_name $task_name \
+ --model_type $model_type \
+ --model_name_or_path $model_name_or_path \
+ --dataset_dir_or_path $dataset_dir_or_path \
+ --output_dir $output_dir \
+ --deepspeed conf/ds_bf16_zero1.json \
+ --bf16 True \
+ --tf32 True \
+ --torch_dtype bfloat16 \
+ --per_device_train_batch_size 4 \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps 8 \
+ --max_steps 2000 \
+ --save_strategy steps \
+ --save_steps 9999999999999 \
+ --save_total_limit 1 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type cosine \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --report_to wandb
+
+ python -m src.eval.gen_mt_ans \
+ --model-path $output_dir \
+ --model-id $task_name
+
+ python -m src.eval.gen_alpaca_eval_ans \
+ --model-path $output_dir \
+ --model-id $task_name
+}
+
+# nohup srun -p MoE --ntasks-per-node=1 --cpus-per-task=16 --mem=128G --nodes=1 --gres=gpu:4 bash "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/scripts/one_data_steps_dynamic.sh" "llama_moe_orca_epochs_cluster_4" "auto" "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new" "data/open_orca_clustered/4" "data/open_orca_clustered_eval/4" 1>logs/llama_moe_orca_cluster_4_dynamic.log 2>&1 &
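
Besides the per-run output directory, the script keeps latest.jobid, latest.dir, and latest.log up to date under base_dir, so follow-up tooling can locate the newest run without knowing the job id. A minimal sketch, assuming the default base_dir used above:

from pathlib import Path

base = Path("outputs/dynamic_eval_interval_20")
job_id = (base / "latest.jobid").read_text().strip()
run_dir = (base / "latest.dir").resolve()  # symlink to the newest output_dir
print(f"job {job_id} -> {run_dir}")
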
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+    "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+    "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+  "pad_token": "<unk>",
+ "unk_token": {
+    "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.model b/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
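
The Git LFS pointer records the blob's SHA-256 digest and byte size, so a fetched tokenizer.model can be verified locally. A minimal sketch, assuming the real file has been pulled via git lfs into the working directory:

import hashlib

expected = "9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347"
with open("tokenizer.model", "rb") as f:
    data = f.read()
assert len(data) == 499723, "size mismatch"
assert hashlib.sha256(data).hexdigest() == expected, "digest mismatch"
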
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a927b63b5663a28db1959828168610a93b3b9f67
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,43 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+      "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+      "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+      "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+  "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 2048,
+  "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
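
The ids behind these tokens line up with config.json (bos_token_id 1, eos_token_id 2, pad_token_id 0). A minimal loading check, assuming transformers is installed and the tokenizer files (including the fetched tokenizer.model) sit in the current directory:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
assert (tok.bos_token, tok.bos_token_id) == ("<s>", 1)
assert (tok.eos_token, tok.eos_token_id) == ("</s>", 2)
assert (tok.pad_token, tok.pad_token_id) == ("<unk>", 0)
assert tok.model_max_length == 2048
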
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ebbeebefa9083fb9e9971d69516859d7072dc8a
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,13470 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 100,
+ "global_step": 2000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "learning_rate": 3.3333333333333335e-07,
+ "loss": 0.9672,
+ "step": 1
+ },
+ {
+ "epoch": 0.0,
+ "learning_rate": 6.666666666666667e-07,
+ "loss": 1.101,
+ "step": 2
+ },
+ {
+ "epoch": 0.0,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.0874,
+ "step": 3
+ },
+ {
+ "epoch": 0.0,
+ "learning_rate": 1.3333333333333334e-06,
+ "loss": 1.0302,
+ "step": 4
+ },
+ {
+ "epoch": 0.0,
+ "learning_rate": 1.6666666666666667e-06,
+ "loss": 1.0754,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 1.0552,
+ "step": 6
+ },
+ {
+ "epoch": 0.0,
+ "learning_rate": 2.3333333333333336e-06,
+ "loss": 1.056,
+ "step": 7
+ },
+ {
+ "epoch": 0.0,
+ "learning_rate": 2.666666666666667e-06,
+ "loss": 1.0807,
+ "step": 8
+ },
+ {
+ "epoch": 0.0,
+ "learning_rate": 3e-06,
+ "loss": 0.9727,
+ "step": 9
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 3.3333333333333333e-06,
+ "loss": 1.0609,
+ "step": 10
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 3.6666666666666666e-06,
+ "loss": 1.0426,
+ "step": 11
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.9514,
+ "step": 12
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 4.333333333333334e-06,
+ "loss": 0.946,
+ "step": 13
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 4.666666666666667e-06,
+ "loss": 0.9148,
+ "step": 14
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 5e-06,
+ "loss": 0.9166,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 5.333333333333334e-06,
+ "loss": 0.8649,
+ "step": 16
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 5.666666666666667e-06,
+ "loss": 0.9305,
+ "step": 17
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 6e-06,
+ "loss": 0.8224,
+ "step": 18
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 6.333333333333333e-06,
+ "loss": 0.8195,
+ "step": 19
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 6.666666666666667e-06,
+ "loss": 0.8794,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 7e-06,
+ "loss": 0.7984,
+ "step": 21
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 7.333333333333333e-06,
+ "loss": 0.8589,
+ "step": 22
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 7.666666666666667e-06,
+ "loss": 0.7756,
+ "step": 23
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 8.000000000000001e-06,
+ "loss": 0.8005,
+ "step": 24
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 8.333333333333334e-06,
+ "loss": 0.8899,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 8.666666666666668e-06,
+ "loss": 0.8451,
+ "step": 26
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 9e-06,
+ "loss": 0.8091,
+ "step": 27
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 9.333333333333334e-06,
+ "loss": 0.8062,
+ "step": 28
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 9.666666666666667e-06,
+ "loss": 0.8387,
+ "step": 29
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 1e-05,
+ "loss": 0.774,
+ "step": 30
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.0333333333333335e-05,
+ "loss": 0.7554,
+ "step": 31
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.0666666666666667e-05,
+ "loss": 0.8233,
+ "step": 32
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.1000000000000001e-05,
+ "loss": 0.7762,
+ "step": 33
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.1333333333333334e-05,
+ "loss": 0.8007,
+ "step": 34
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.1666666666666668e-05,
+ "loss": 0.7668,
+ "step": 35
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.2e-05,
+ "loss": 0.7904,
+ "step": 36
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.2333333333333334e-05,
+ "loss": 0.824,
+ "step": 37
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.2666666666666667e-05,
+ "loss": 0.7715,
+ "step": 38
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.3000000000000001e-05,
+ "loss": 0.7424,
+ "step": 39
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.3333333333333333e-05,
+ "loss": 0.7362,
+ "step": 40
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.3666666666666667e-05,
+ "loss": 0.7583,
+ "step": 41
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.4e-05,
+ "loss": 0.8013,
+ "step": 42
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.4333333333333334e-05,
+ "loss": 0.7942,
+ "step": 43
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.4666666666666666e-05,
+ "loss": 0.7419,
+ "step": 44
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.5000000000000002e-05,
+ "loss": 0.7636,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.5333333333333334e-05,
+ "loss": 0.8152,
+ "step": 46
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.5666666666666667e-05,
+ "loss": 0.7442,
+ "step": 47
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.6000000000000003e-05,
+ "loss": 0.7432,
+ "step": 48
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.6333333333333335e-05,
+ "loss": 0.7055,
+ "step": 49
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.6666666666666667e-05,
+ "loss": 0.7479,
+ "step": 50
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.7e-05,
+ "loss": 0.7984,
+ "step": 51
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.7333333333333336e-05,
+ "loss": 0.7365,
+ "step": 52
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.7666666666666668e-05,
+ "loss": 0.7525,
+ "step": 53
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.8e-05,
+ "loss": 0.7407,
+ "step": 54
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.8333333333333333e-05,
+ "loss": 0.798,
+ "step": 55
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.866666666666667e-05,
+ "loss": 0.7416,
+ "step": 56
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.9e-05,
+ "loss": 0.8083,
+ "step": 57
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.9333333333333333e-05,
+ "loss": 0.7662,
+ "step": 58
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.9666666666666666e-05,
+ "loss": 0.7259,
+ "step": 59
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 2e-05,
+ "loss": 0.7616,
+ "step": 60
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.9999986888082895e-05,
+ "loss": 0.6824,
+ "step": 61
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.999994755236596e-05,
+ "loss": 0.7889,
+ "step": 62
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.9999881992952353e-05,
+ "loss": 0.7583,
+ "step": 63
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.999979021001399e-05,
+ "loss": 0.7744,
+ "step": 64
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.9999672203791564e-05,
+ "loss": 0.756,
+ "step": 65
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.999952797459453e-05,
+ "loss": 0.7075,
+ "step": 66
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.9999357522801125e-05,
+ "loss": 0.7573,
+ "step": 67
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.999916084885832e-05,
+ "loss": 0.6942,
+ "step": 68
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 1.999893795328188e-05,
+ "loss": 0.7428,
+ "step": 69
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9998688836656322e-05,
+ "loss": 0.7451,
+ "step": 70
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9998413499634927e-05,
+ "loss": 0.7405,
+ "step": 71
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9998111942939727e-05,
+ "loss": 0.7407,
+ "step": 72
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9997784167361526e-05,
+ "loss": 0.7318,
+ "step": 73
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9997430173759876e-05,
+ "loss": 0.7698,
+ "step": 74
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.999704996306308e-05,
+ "loss": 0.7822,
+ "step": 75
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9996643536268202e-05,
+ "loss": 0.8002,
+ "step": 76
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9996210894441047e-05,
+ "loss": 0.7528,
+ "step": 77
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9995752038716166e-05,
+ "loss": 0.7819,
+ "step": 78
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9995266970296856e-05,
+ "loss": 0.7695,
+ "step": 79
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9994755690455154e-05,
+ "loss": 0.7493,
+ "step": 80
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9994218200531823e-05,
+ "loss": 0.7173,
+ "step": 81
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.999365450193638e-05,
+ "loss": 0.7805,
+ "step": 82
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.999306459614705e-05,
+ "loss": 0.7364,
+ "step": 83
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.99924484847108e-05,
+ "loss": 0.7128,
+ "step": 84
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9991806169243302e-05,
+ "loss": 0.7227,
+ "step": 85
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9991137651428957e-05,
+ "loss": 0.8038,
+ "step": 86
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.999044293302088e-05,
+ "loss": 0.8071,
+ "step": 87
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.998972201584088e-05,
+ "loss": 0.7258,
+ "step": 88
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.9988974901779482e-05,
+ "loss": 0.7094,
+ "step": 89
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 1.998820159279591e-05,
+ "loss": 0.7975,
+ "step": 90
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.998740209091807e-05,
+ "loss": 0.7445,
+ "step": 91
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9986576398242566e-05,
+ "loss": 0.7527,
+ "step": 92
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.998572451693468e-05,
+ "loss": 0.7194,
+ "step": 93
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.998484644922837e-05,
+ "loss": 0.8186,
+ "step": 94
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9983942197426272e-05,
+ "loss": 0.7771,
+ "step": 95
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9983011763899674e-05,
+ "loss": 0.7875,
+ "step": 96
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.998205515108853e-05,
+ "loss": 0.767,
+ "step": 97
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.998107236150145e-05,
+ "loss": 0.7356,
+ "step": 98
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9980063397715685e-05,
+ "loss": 0.7307,
+ "step": 99
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.997902826237712e-05,
+ "loss": 0.712,
+ "step": 100
+ },
+ {
+ "epoch": 0.05,
+ "eval_code_gate_load": [
+ 199.4,
+ 191.5,
+ 181.9,
+ 145.3,
+ 177.5,
+ 178.75,
+ 187.95,
+ 177.7
+ ],
+ "eval_code_loss": 0.5201171636581421,
+ "eval_code_runtime": 1.7839,
+ "eval_code_samples_per_second": 560.56,
+ "eval_code_steps_per_second": 35.315,
+ "step": 100
+ },
+ {
+ "epoch": 0.05,
+ "eval_orca_gate_load": [
+ 500.1,
+ 352.35,
+ 397.35,
+ 384.15,
+ 349.8,
+ 443.1,
+ 371.2,
+ 344.25
+ ],
+ "eval_orca_loss": 0.7696288824081421,
+ "eval_orca_runtime": 2.0215,
+ "eval_orca_samples_per_second": 494.677,
+ "eval_orca_steps_per_second": 31.165,
+ "step": 100
+ },
+ {
+ "epoch": 0.05,
+ "eval_math_gate_load": [
+ 320.6,
+ 221.3,
+ 228.4,
+ 217.5,
+ 248.3,
+ 243.2,
+ 282.25,
+ 271.15
+ ],
+ "eval_math_loss": 0.716601550579071,
+ "eval_math_runtime": 1.8689,
+ "eval_math_samples_per_second": 535.077,
+ "eval_math_steps_per_second": 33.71,
+ "step": 100
+ },
+ {
+ "epoch": 0.05,
+ "eval_sharegpt_gate_load": [
+ 1524.7,
+ 1115.1,
+ 1297.45,
+ 1170.1,
+ 1142.65,
+ 1383.85,
+ 1207.25,
+ 1070.5
+ ],
+ "eval_sharegpt_loss": 0.7251952886581421,
+ "eval_sharegpt_runtime": 2.996,
+ "eval_sharegpt_samples_per_second": 333.775,
+ "eval_sharegpt_steps_per_second": 21.028,
+ "step": 100
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9977966958200276e-05,
+ "loss": 0.7711,
+ "step": 101
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.997687948796831e-05,
+ "loss": 0.7064,
+ "step": 102
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9975765854532974e-05,
+ "loss": 0.7409,
+ "step": 103
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.997462606081465e-05,
+ "loss": 0.7148,
+ "step": 104
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9973460109802306e-05,
+ "loss": 0.7689,
+ "step": 105
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.997226800455352e-05,
+ "loss": 0.7637,
+ "step": 106
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9971049748194448e-05,
+ "loss": 0.7224,
+ "step": 107
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9969805343919822e-05,
+ "loss": 0.7147,
+ "step": 108
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 1.9968534794992947e-05,
+ "loss": 0.6927,
+ "step": 109
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9967238104745695e-05,
+ "loss": 0.7395,
+ "step": 110
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.996591527657848e-05,
+ "loss": 0.6984,
+ "step": 111
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9964566313960265e-05,
+ "loss": 0.7243,
+ "step": 112
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9963191220428552e-05,
+ "loss": 0.8089,
+ "step": 113
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9961789999589357e-05,
+ "loss": 0.7965,
+ "step": 114
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.996036265511722e-05,
+ "loss": 0.7219,
+ "step": 115
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.995890919075519e-05,
+ "loss": 0.7092,
+ "step": 116
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9957429610314797e-05,
+ "loss": 0.7492,
+ "step": 117
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.995592391767608e-05,
+ "loss": 0.7885,
+ "step": 118
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.995439211678754e-05,
+ "loss": 0.7533,
+ "step": 119
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.995283421166614e-05,
+ "loss": 0.6916,
+ "step": 120
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.995125020639731e-05,
+ "loss": 0.6982,
+ "step": 121
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.994964010513492e-05,
+ "loss": 0.7155,
+ "step": 122
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9948003912101274e-05,
+ "loss": 0.7385,
+ "step": 123
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9946341631587086e-05,
+ "loss": 0.7199,
+ "step": 124
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9944653267951507e-05,
+ "loss": 0.68,
+ "step": 125
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9942938825622064e-05,
+ "loss": 0.8073,
+ "step": 126
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.994119830909469e-05,
+ "loss": 0.705,
+ "step": 127
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9939431722933678e-05,
+ "loss": 0.7802,
+ "step": 128
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 1.9937639071771704e-05,
+ "loss": 0.7184,
+ "step": 129
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.993582036030978e-05,
+ "loss": 0.7413,
+ "step": 130
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9933975593317263e-05,
+ "loss": 0.7346,
+ "step": 131
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9932104775631847e-05,
+ "loss": 0.7115,
+ "step": 132
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.993020791215953e-05,
+ "loss": 0.713,
+ "step": 133
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.992828500787461e-05,
+ "loss": 0.7238,
+ "step": 134
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9926336067819686e-05,
+ "loss": 0.7186,
+ "step": 135
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9924361097105624e-05,
+ "loss": 0.7158,
+ "step": 136
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9922360100911553e-05,
+ "loss": 0.7106,
+ "step": 137
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.992033308448486e-05,
+ "loss": 0.7274,
+ "step": 138
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9918280053141144e-05,
+ "loss": 0.738,
+ "step": 139
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9916201012264255e-05,
+ "loss": 0.7267,
+ "step": 140
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9914095967306224e-05,
+ "loss": 0.734,
+ "step": 141
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9911964923787295e-05,
+ "loss": 0.7932,
+ "step": 142
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.990980788729588e-05,
+ "loss": 0.7568,
+ "step": 143
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.990762486348855e-05,
+ "loss": 0.7244,
+ "step": 144
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9905415858090036e-05,
+ "loss": 0.7349,
+ "step": 145
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9903180876893195e-05,
+ "loss": 0.674,
+ "step": 146
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9900919925759e-05,
+ "loss": 0.7401,
+ "step": 147
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.989863301061654e-05,
+ "loss": 0.7278,
+ "step": 148
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9896320137462984e-05,
+ "loss": 0.7734,
+ "step": 149
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 1.9893981312363563e-05,
+ "loss": 0.6926,
+ "step": 150
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.989161654145158e-05,
+ "loss": 0.7339,
+ "step": 151
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9889225830928365e-05,
+ "loss": 0.7335,
+ "step": 152
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9886809187063285e-05,
+ "loss": 0.6829,
+ "step": 153
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9884366616193707e-05,
+ "loss": 0.6921,
+ "step": 154
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.988189812472498e-05,
+ "loss": 0.7695,
+ "step": 155
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.987940371913044e-05,
+ "loss": 0.7079,
+ "step": 156
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9876883405951378e-05,
+ "loss": 0.7568,
+ "step": 157
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.987433719179702e-05,
+ "loss": 0.7084,
+ "step": 158
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.987176508334451e-05,
+ "loss": 0.7419,
+ "step": 159
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9869167087338908e-05,
+ "loss": 0.7072,
+ "step": 160
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9866543210593154e-05,
+ "loss": 0.769,
+ "step": 161
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.986389345998806e-05,
+ "loss": 0.7518,
+ "step": 162
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.986121784247229e-05,
+ "loss": 0.7457,
+ "step": 163
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9858516365062334e-05,
+ "loss": 0.7448,
+ "step": 164
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9855789034842504e-05,
+ "loss": 0.7529,
+ "step": 165
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9853035858964907e-05,
+ "loss": 0.6531,
+ "step": 166
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9850256844649422e-05,
+ "loss": 0.7162,
+ "step": 167
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.9847451999183692e-05,
+ "loss": 0.6711,
+ "step": 168
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.98446213299231e-05,
+ "loss": 0.687,
+ "step": 169
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9841764844290744e-05,
+ "loss": 0.7153,
+ "step": 170
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9838882549777426e-05,
+ "loss": 0.7152,
+ "step": 171
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9835974453941623e-05,
+ "loss": 0.7557,
+ "step": 172
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.983304056440948e-05,
+ "loss": 0.7054,
+ "step": 173
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.983008088887478e-05,
+ "loss": 0.6988,
+ "step": 174
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9827095435098926e-05,
+ "loss": 0.6721,
+ "step": 175
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9824084210910924e-05,
+ "loss": 0.7319,
+ "step": 176
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9821047224207362e-05,
+ "loss": 0.6729,
+ "step": 177
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9817984482952378e-05,
+ "loss": 0.7491,
+ "step": 178
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9814895995177653e-05,
+ "loss": 0.7494,
+ "step": 179
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9811781768982392e-05,
+ "loss": 0.7351,
+ "step": 180
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9808641812533286e-05,
+ "loss": 0.7003,
+ "step": 181
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.980547613406451e-05,
+ "loss": 0.7291,
+ "step": 182
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9802284741877674e-05,
+ "loss": 0.7574,
+ "step": 183
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9799067644341844e-05,
+ "loss": 0.6798,
+ "step": 184
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9795824849893483e-05,
+ "loss": 0.7454,
+ "step": 185
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9792556367036432e-05,
+ "loss": 0.6889,
+ "step": 186
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9789262204341918e-05,
+ "loss": 0.7128,
+ "step": 187
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.978594237044849e-05,
+ "loss": 0.6948,
+ "step": 188
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.9782596874062028e-05,
+ "loss": 0.733,
+ "step": 189
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.977922572395571e-05,
+ "loss": 0.7165,
+ "step": 190
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9775828928969976e-05,
+ "loss": 0.7471,
+ "step": 191
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.977240649801253e-05,
+ "loss": 0.7039,
+ "step": 192
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.97689584400583e-05,
+ "loss": 0.6682,
+ "step": 193
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9765484764149413e-05,
+ "loss": 0.7194,
+ "step": 194
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.976198547939518e-05,
+ "loss": 0.6918,
+ "step": 195
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9758460594972068e-05,
+ "loss": 0.7341,
+ "step": 196
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9754910120123675e-05,
+ "loss": 0.7172,
+ "step": 197
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9751334064160708e-05,
+ "loss": 0.6916,
+ "step": 198
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9747732436460955e-05,
+ "loss": 0.7046,
+ "step": 199
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9744105246469264e-05,
+ "loss": 0.7297,
+ "step": 200
+ },
+ {
+ "epoch": 0.1,
+ "eval_code_gate_load": [
+ 202.15,
+ 184.0,
+ 181.85,
+ 156.2,
+ 180.4,
+ 177.6,
+ 186.5,
+ 171.3
+ ],
+ "eval_code_loss": 0.5015624761581421,
+ "eval_code_runtime": 1.8076,
+ "eval_code_samples_per_second": 553.219,
+ "eval_code_steps_per_second": 34.853,
+ "step": 200
+ },
+ {
+ "epoch": 0.1,
+ "eval_orca_gate_load": [
+ 500.1,
+ 348.05,
+ 396.7,
+ 392.0,
+ 354.8,
+ 437.65,
+ 368.05,
+ 344.95
+ ],
+ "eval_orca_loss": 0.7518554925918579,
+ "eval_orca_runtime": 2.0082,
+ "eval_orca_samples_per_second": 497.958,
+ "eval_orca_steps_per_second": 31.371,
+ "step": 200
+ },
+ {
+ "epoch": 0.1,
+ "eval_math_gate_load": [
+ 319.05,
+ 220.35,
+ 222.35,
+ 231.65,
+ 255.05,
+ 239.45,
+ 280.9,
+ 263.9
+ ],
+ "eval_math_loss": 0.6756836175918579,
+ "eval_math_runtime": 1.8563,
+ "eval_math_samples_per_second": 538.698,
+ "eval_math_steps_per_second": 33.938,
+ "step": 200
+ },
+ {
+ "epoch": 0.1,
+ "eval_sharegpt_gate_load": [
+ 1522.6,
+ 1108.8,
+ 1288.45,
+ 1197.4,
+ 1139.0,
+ 1373.8,
+ 1198.4,
+ 1083.15
+ ],
+ "eval_sharegpt_loss": 0.7212890386581421,
+ "eval_sharegpt_runtime": 3.0067,
+ "eval_sharegpt_samples_per_second": 332.594,
+ "eval_sharegpt_steps_per_second": 20.953,
+ "step": 200
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9740452503697518e-05,
+ "loss": 0.7521,
+ "step": 201
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9736774217724614e-05,
+ "loss": 0.716,
+ "step": 202
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9733070398196423e-05,
+ "loss": 0.7111,
+ "step": 203
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9729341054825783e-05,
+ "loss": 0.7261,
+ "step": 204
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.972558619739246e-05,
+ "loss": 0.7397,
+ "step": 205
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.972180583574313e-05,
+ "loss": 0.7313,
+ "step": 206
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9717999979791356e-05,
+ "loss": 0.6935,
+ "step": 207
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9714168639517543e-05,
+ "loss": 0.7706,
+ "step": 208
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9710311824968942e-05,
+ "loss": 0.7282,
+ "step": 209
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9706429546259592e-05,
+ "loss": 0.6551,
+ "step": 210
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9702521813570322e-05,
+ "loss": 0.6824,
+ "step": 211
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9698588637148705e-05,
+ "loss": 0.7006,
+ "step": 212
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9694630027309035e-05,
+ "loss": 0.734,
+ "step": 213
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9690645994432307e-05,
+ "loss": 0.6711,
+ "step": 214
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9686636548966177e-05,
+ "loss": 0.7231,
+ "step": 215
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9682601701424958e-05,
+ "loss": 0.7045,
+ "step": 216
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9678541462389564e-05,
+ "loss": 0.7036,
+ "step": 217
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9674455842507494e-05,
+ "loss": 0.6919,
+ "step": 218
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9670344852492814e-05,
+ "loss": 0.7436,
+ "step": 219
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9666208503126115e-05,
+ "loss": 0.7014,
+ "step": 220
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.966204680525449e-05,
+ "loss": 0.7137,
+ "step": 221
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9657859769791506e-05,
+ "loss": 0.6578,
+ "step": 222
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.965364740771718e-05,
+ "loss": 0.6613,
+ "step": 223
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9649409730077934e-05,
+ "loss": 0.748,
+ "step": 224
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.964514674798659e-05,
+ "loss": 0.7396,
+ "step": 225
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9640858472622316e-05,
+ "loss": 0.6985,
+ "step": 226
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.963654491523062e-05,
+ "loss": 0.7084,
+ "step": 227
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9632206087123296e-05,
+ "loss": 0.6876,
+ "step": 228
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.9627841999678422e-05,
+ "loss": 0.719,
+ "step": 229
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9623452664340305e-05,
+ "loss": 0.6693,
+ "step": 230
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9619038092619465e-05,
+ "loss": 0.722,
+ "step": 231
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9614598296092603e-05,
+ "loss": 0.7495,
+ "step": 232
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9610133286402565e-05,
+ "loss": 0.653,
+ "step": 233
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9605643075258323e-05,
+ "loss": 0.7022,
+ "step": 234
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.960112767443493e-05,
+ "loss": 0.7374,
+ "step": 235
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9596587095773496e-05,
+ "loss": 0.6933,
+ "step": 236
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9592021351181163e-05,
+ "loss": 0.6685,
+ "step": 237
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.958743045263106e-05,
+ "loss": 0.7344,
+ "step": 238
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9582814412162288e-05,
+ "loss": 0.6979,
+ "step": 239
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.957817324187987e-05,
+ "loss": 0.6684,
+ "step": 240
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.957350695395474e-05,
+ "loss": 0.7081,
+ "step": 241
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.956881556062369e-05,
+ "loss": 0.7117,
+ "step": 242
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.956409907418935e-05,
+ "loss": 0.6398,
+ "step": 243
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9559357507020163e-05,
+ "loss": 0.7447,
+ "step": 244
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.955459087155033e-05,
+ "loss": 0.7159,
+ "step": 245
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9549799180279793e-05,
+ "loss": 0.7141,
+ "step": 246
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9544982445774217e-05,
+ "loss": 0.7247,
+ "step": 247
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9540140680664915e-05,
+ "loss": 0.6986,
+ "step": 248
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.9535273897648857e-05,
+ "loss": 0.7617,
+ "step": 249
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.953038210948861e-05,
+ "loss": 0.7067,
+ "step": 250
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9525465329012322e-05,
+ "loss": 0.7115,
+ "step": 251
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.952052356911368e-05,
+ "loss": 0.7017,
+ "step": 252
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9515556842751863e-05,
+ "loss": 0.6714,
+ "step": 253
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9510565162951538e-05,
+ "loss": 0.6627,
+ "step": 254
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9505548542802805e-05,
+ "loss": 0.6834,
+ "step": 255
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.950050699546116e-05,
+ "loss": 0.7105,
+ "step": 256
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.949544053414748e-05,
+ "loss": 0.7134,
+ "step": 257
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9490349172147964e-05,
+ "loss": 0.7188,
+ "step": 258
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9485232922814117e-05,
+ "loss": 0.6334,
+ "step": 259
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9480091799562706e-05,
+ "loss": 0.7144,
+ "step": 260
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.947492581587573e-05,
+ "loss": 0.6746,
+ "step": 261
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9469734985300373e-05,
+ "loss": 0.778,
+ "step": 262
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9464519321448988e-05,
+ "loss": 0.7379,
+ "step": 263
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9459278837999048e-05,
+ "loss": 0.699,
+ "step": 264
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9454013548693103e-05,
+ "loss": 0.7559,
+ "step": 265
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9448723467338765e-05,
+ "loss": 0.6769,
+ "step": 266
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.944340860780865e-05,
+ "loss": 0.709,
+ "step": 267
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9438068984040366e-05,
+ "loss": 0.7177,
+ "step": 268
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 1.9432704610036448e-05,
+ "loss": 0.7849,
+ "step": 269
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9427315499864345e-05,
+ "loss": 0.7355,
+ "step": 270
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9421901667656364e-05,
+ "loss": 0.686,
+ "step": 271
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9416463127609655e-05,
+ "loss": 0.6962,
+ "step": 272
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9410999893986157e-05,
+ "loss": 0.7009,
+ "step": 273
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9405511981112553e-05,
+ "loss": 0.6892,
+ "step": 274
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9399999403380266e-05,
+ "loss": 0.6725,
+ "step": 275
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9394462175245382e-05,
+ "loss": 0.75,
+ "step": 276
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9388900311228636e-05,
+ "loss": 0.7397,
+ "step": 277
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9383313825915372e-05,
+ "loss": 0.6471,
+ "step": 278
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9377702733955493e-05,
+ "loss": 0.7021,
+ "step": 279
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.937206705006344e-05,
+ "loss": 0.6346,
+ "step": 280
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9366406789018127e-05,
+ "loss": 0.7311,
+ "step": 281
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9360721965662934e-05,
+ "loss": 0.6929,
+ "step": 282
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9355012594905645e-05,
+ "loss": 0.705,
+ "step": 283
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9349278691718426e-05,
+ "loss": 0.7271,
+ "step": 284
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9343520271137764e-05,
+ "loss": 0.7556,
+ "step": 285
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9337737348264448e-05,
+ "loss": 0.6825,
+ "step": 286
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9331929938263515e-05,
+ "loss": 0.6625,
+ "step": 287
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9326098056364224e-05,
+ "loss": 0.6577,
+ "step": 288
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9320241717860007e-05,
+ "loss": 0.6815,
+ "step": 289
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 1.9314360938108427e-05,
+ "loss": 0.7224,
+ "step": 290
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.930845573253114e-05,
+ "loss": 0.6794,
+ "step": 291
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9302526116613863e-05,
+ "loss": 0.7068,
+ "step": 292
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9296572105906323e-05,
+ "loss": 0.759,
+ "step": 293
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9290593716022218e-05,
+ "loss": 0.7061,
+ "step": 294
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.928459096263918e-05,
+ "loss": 0.7049,
+ "step": 295
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9278563861498726e-05,
+ "loss": 0.7267,
+ "step": 296
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.927251242840623e-05,
+ "loss": 0.7339,
+ "step": 297
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9266436679230866e-05,
+ "loss": 0.7487,
+ "step": 298
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.926033662990558e-05,
+ "loss": 0.7184,
+ "step": 299
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9254212296427043e-05,
+ "loss": 0.7435,
+ "step": 300
+ },
+ {
+ "epoch": 0.15,
+ "eval_code_gate_load": [
+ 206.0,
+ 187.95,
+ 190.05,
+ 161.5,
+ 166.75,
+ 174.0,
+ 185.1,
+ 168.65
+ ],
+ "eval_code_loss": 0.5054687261581421,
+ "eval_code_runtime": 1.7765,
+ "eval_code_samples_per_second": 562.908,
+ "eval_code_steps_per_second": 35.463,
+ "step": 300
+ },
+ {
+ "epoch": 0.15,
+ "eval_orca_gate_load": [
+ 508.25,
+ 351.75,
+ 403.15,
+ 398.9,
+ 341.1,
+ 425.3,
+ 372.7,
+ 341.15
+ ],
+ "eval_orca_loss": 0.74560546875,
+ "eval_orca_runtime": 1.9987,
+ "eval_orca_samples_per_second": 500.33,
+ "eval_orca_steps_per_second": 31.521,
+ "step": 300
+ },
+ {
+ "epoch": 0.15,
+ "eval_math_gate_load": [
+ 324.1,
+ 225.55,
+ 232.0,
+ 231.1,
+ 244.15,
+ 235.65,
+ 281.2,
+ 258.95
+ ],
+ "eval_math_loss": 0.660449206829071,
+ "eval_math_runtime": 1.8524,
+ "eval_math_samples_per_second": 539.839,
+ "eval_math_steps_per_second": 34.01,
+ "step": 300
+ },
+ {
+ "epoch": 0.15,
+ "eval_sharegpt_gate_load": [
+ 1528.15,
+ 1114.7,
+ 1318.0,
+ 1217.15,
+ 1114.7,
+ 1348.95,
+ 1194.85,
+ 1075.1
+ ],
+ "eval_sharegpt_loss": 0.7177734375,
+ "eval_sharegpt_runtime": 3.0081,
+ "eval_sharegpt_samples_per_second": 332.432,
+ "eval_sharegpt_steps_per_second": 20.943,
+ "step": 300
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9248063694855603e-05,
+ "loss": 0.7142,
+ "step": 301
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.924189084131525e-05,
+ "loss": 0.6985,
+ "step": 302
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.923569375199357e-05,
+ "loss": 0.7139,
+ "step": 303
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.922947244314172e-05,
+ "loss": 0.7499,
+ "step": 304
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.922322693107434e-05,
+ "loss": 0.6487,
+ "step": 305
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9216957232169567e-05,
+ "loss": 0.6963,
+ "step": 306
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9210663362868956e-05,
+ "loss": 0.6527,
+ "step": 307
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9204345339677442e-05,
+ "loss": 0.6664,
+ "step": 308
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9198003179163308e-05,
+ "loss": 0.7128,
+ "step": 309
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 1.9191636897958123e-05,
+ "loss": 0.7186,
+ "step": 310
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9185246512756727e-05,
+ "loss": 0.6744,
+ "step": 311
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9178832040317153e-05,
+ "loss": 0.671,
+ "step": 312
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.917239349746061e-05,
+ "loss": 0.7735,
+ "step": 313
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.916593090107143e-05,
+ "loss": 0.6821,
+ "step": 314
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9159444268097012e-05,
+ "loss": 0.6755,
+ "step": 315
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.91529336155478e-05,
+ "loss": 0.6606,
+ "step": 316
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9146398960497213e-05,
+ "loss": 0.7263,
+ "step": 317
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.913984032008163e-05,
+ "loss": 0.6973,
+ "step": 318
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9133257711500318e-05,
+ "loss": 0.6823,
+ "step": 319
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9126651152015404e-05,
+ "loss": 0.6701,
+ "step": 320
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9120020658951814e-05,
+ "loss": 0.6863,
+ "step": 321
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.911336624969725e-05,
+ "loss": 0.7225,
+ "step": 322
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.910668794170212e-05,
+ "loss": 0.7344,
+ "step": 323
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9099985752479505e-05,
+ "loss": 0.743,
+ "step": 324
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9093259699605125e-05,
+ "loss": 0.7427,
+ "step": 325
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.908650980071726e-05,
+ "loss": 0.6911,
+ "step": 326
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9079736073516735e-05,
+ "loss": 0.6815,
+ "step": 327
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9072938535766864e-05,
+ "loss": 0.681,
+ "step": 328
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 1.9066117205293393e-05,
+ "loss": 0.6892,
+ "step": 329
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.905927209998447e-05,
+ "loss": 0.7275,
+ "step": 330
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.905240323779058e-05,
+ "loss": 0.7578,
+ "step": 331
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.904551063672452e-05,
+ "loss": 0.6842,
+ "step": 332
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.9038594314861328e-05,
+ "loss": 0.7046,
+ "step": 333
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.9031654290338256e-05,
+ "loss": 0.686,
+ "step": 334
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.90246905813547e-05,
+ "loss": 0.6564,
+ "step": 335
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.9017703206172187e-05,
+ "loss": 0.6683,
+ "step": 336
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.9010692183114285e-05,
+ "loss": 0.7419,
+ "step": 337
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.900365753056659e-05,
+ "loss": 0.6519,
+ "step": 338
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.8996599266976658e-05,
+ "loss": 0.6392,
+ "step": 339
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.8989517410853956e-05,
+ "loss": 0.6843,
+ "step": 340
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.898241198076983e-05,
+ "loss": 0.6899,
+ "step": 341
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.8975282995357448e-05,
+ "loss": 0.7023,
+ "step": 342
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.8968130473311732e-05,
+ "loss": 0.6331,
+ "step": 343
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.896095443338935e-05,
+ "loss": 0.659,
+ "step": 344
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.8953754894408617e-05,
+ "loss": 0.6294,
+ "step": 345
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.8946531875249496e-05,
+ "loss": 0.6617,
+ "step": 346
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.89392853948535e-05,
+ "loss": 0.6872,
+ "step": 347
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.8932015472223692e-05,
+ "loss": 0.7242,
+ "step": 348
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.892472212642459e-05,
+ "loss": 0.6774,
+ "step": 349
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 1.8917405376582144e-05,
+ "loss": 0.6537,
+ "step": 350
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.891006524188368e-05,
+ "loss": 0.7253,
+ "step": 351
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8902701741577844e-05,
+ "loss": 0.6531,
+ "step": 352
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.889531489497455e-05,
+ "loss": 0.6655,
+ "step": 353
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8887904721444955e-05,
+ "loss": 0.7832,
+ "step": 354
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8880471240421365e-05,
+ "loss": 0.6488,
+ "step": 355
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8873014471397225e-05,
+ "loss": 0.7323,
+ "step": 356
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8865534433927034e-05,
+ "loss": 0.6627,
+ "step": 357
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8858031147626326e-05,
+ "loss": 0.6513,
+ "step": 358
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.885050463217159e-05,
+ "loss": 0.6807,
+ "step": 359
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8842954907300236e-05,
+ "loss": 0.6281,
+ "step": 360
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.883538199281054e-05,
+ "loss": 0.6593,
+ "step": 361
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8827785908561585e-05,
+ "loss": 0.6379,
+ "step": 362
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8820166674473217e-05,
+ "loss": 0.6934,
+ "step": 363
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.881252431052599e-05,
+ "loss": 0.6419,
+ "step": 364
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.880485883676111e-05,
+ "loss": 0.7072,
+ "step": 365
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.879717027328039e-05,
+ "loss": 0.7061,
+ "step": 366
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8789458640246193e-05,
+ "loss": 0.668,
+ "step": 367
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8781723957881374e-05,
+ "loss": 0.6409,
+ "step": 368
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.8773966246469238e-05,
+ "loss": 0.6655,
+ "step": 369
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.876618552635348e-05,
+ "loss": 0.7031,
+ "step": 370
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8758381817938126e-05,
+ "loss": 0.6492,
+ "step": 371
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.87505551416875e-05,
+ "loss": 0.7515,
+ "step": 372
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.874270551812614e-05,
+ "loss": 0.6949,
+ "step": 373
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8734832967838775e-05,
+ "loss": 0.7355,
+ "step": 374
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8726937511470247e-05,
+ "loss": 0.6748,
+ "step": 375
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.871901916972547e-05,
+ "loss": 0.6932,
+ "step": 376
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8711077963369377e-05,
+ "loss": 0.7317,
+ "step": 377
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8703113913226847e-05,
+ "loss": 0.7287,
+ "step": 378
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8695127040182678e-05,
+ "loss": 0.7347,
+ "step": 379
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8687117365181514e-05,
+ "loss": 0.665,
+ "step": 380
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.867908490922779e-05,
+ "loss": 0.6309,
+ "step": 381
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.867102969338569e-05,
+ "loss": 0.6849,
+ "step": 382
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8662951738779077e-05,
+ "loss": 0.7277,
+ "step": 383
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.865485106659145e-05,
+ "loss": 0.7113,
+ "step": 384
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8646727698065865e-05,
+ "loss": 0.6555,
+ "step": 385
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.863858165450492e-05,
+ "loss": 0.6328,
+ "step": 386
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.863041295727066e-05,
+ "loss": 0.755,
+ "step": 387
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.862222162778454e-05,
+ "loss": 0.72,
+ "step": 388
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 1.8614007687527374e-05,
+ "loss": 0.7177,
+ "step": 389
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8605771158039253e-05,
+ "loss": 0.7039,
+ "step": 390
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8597512060919523e-05,
+ "loss": 0.6861,
+ "step": 391
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.85892304178267e-05,
+ "loss": 0.672,
+ "step": 392
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8580926250478425e-05,
+ "loss": 0.6745,
+ "step": 393
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8572599580651415e-05,
+ "loss": 0.6572,
+ "step": 394
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8564250430181387e-05,
+ "loss": 0.6489,
+ "step": 395
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8555878820963014e-05,
+ "loss": 0.7114,
+ "step": 396
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8547484774949865e-05,
+ "loss": 0.7313,
+ "step": 397
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8539068314154355e-05,
+ "loss": 0.7268,
+ "step": 398
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8530629460647658e-05,
+ "loss": 0.698,
+ "step": 399
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8522168236559693e-05,
+ "loss": 0.7727,
+ "step": 400
+ },
+ {
+ "epoch": 0.2,
+ "eval_code_gate_load": [
+ 202.85,
+ 191.5,
+ 178.8,
+ 158.95,
+ 174.1,
+ 173.95,
+ 189.7,
+ 170.15
+ ],
+ "eval_code_loss": 0.4867187440395355,
+ "eval_code_runtime": 1.7854,
+ "eval_code_samples_per_second": 560.106,
+ "eval_code_steps_per_second": 35.287,
+ "step": 400
+ },
+ {
+ "epoch": 0.2,
+ "eval_orca_gate_load": [
+ 501.6,
+ 354.85,
+ 398.45,
+ 395.2,
+ 353.6,
+ 425.0,
+ 373.85,
+ 339.75
+ ],
+ "eval_orca_loss": 0.735644519329071,
+ "eval_orca_runtime": 2.0136,
+ "eval_orca_samples_per_second": 496.616,
+ "eval_orca_steps_per_second": 31.287,
+ "step": 400
+ },
+ {
+ "epoch": 0.2,
+ "eval_math_gate_load": [
+ 320.85,
+ 237.0,
+ 216.35,
+ 231.35,
+ 252.05,
+ 238.55,
+ 277.05,
+ 259.5
+ ],
+ "eval_math_loss": 0.6509765386581421,
+ "eval_math_runtime": 1.8609,
+ "eval_math_samples_per_second": 537.377,
+ "eval_math_steps_per_second": 33.855,
+ "step": 400
+ },
+ {
+ "epoch": 0.2,
+ "eval_sharegpt_gate_load": [
+ 1520.85,
+ 1130.25,
+ 1282.2,
+ 1212.05,
+ 1141.8,
+ 1334.7,
+ 1205.7,
+ 1084.05
+ ],
+ "eval_sharegpt_loss": 0.712109386920929,
+ "eval_sharegpt_runtime": 2.9955,
+ "eval_sharegpt_samples_per_second": 333.837,
+ "eval_sharegpt_steps_per_second": 21.032,
+ "step": 400
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8513684664079033e-05,
+ "loss": 0.736,
+ "step": 401
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8505178765452853e-05,
+ "loss": 0.7126,
+ "step": 402
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8496650562986888e-05,
+ "loss": 0.6468,
+ "step": 403
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8488100079045345e-05,
+ "loss": 0.76,
+ "step": 404
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.847952733605088e-05,
+ "loss": 0.641,
+ "step": 405
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.847093235648451e-05,
+ "loss": 0.656,
+ "step": 406
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8462315162885563e-05,
+ "loss": 0.7218,
+ "step": 407
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8453675777851627e-05,
+ "loss": 0.6604,
+ "step": 408
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8445014224038485e-05,
+ "loss": 0.7317,
+ "step": 409
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 1.8436330524160048e-05,
+ "loss": 0.7086,
+ "step": 410
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8427624700988308e-05,
+ "loss": 0.7205,
+ "step": 411
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8418896777353272e-05,
+ "loss": 0.7507,
+ "step": 412
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.84101467761429e-05,
+ "loss": 0.6566,
+ "step": 413
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8401374720303054e-05,
+ "loss": 0.6488,
+ "step": 414
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8392580632837423e-05,
+ "loss": 0.6858,
+ "step": 415
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8383764536807486e-05,
+ "loss": 0.6794,
+ "step": 416
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.837492645533241e-05,
+ "loss": 0.7179,
+ "step": 417
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.836606641158905e-05,
+ "loss": 0.6611,
+ "step": 418
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.835718442881183e-05,
+ "loss": 0.6555,
+ "step": 419
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8348280530292712e-05,
+ "loss": 0.6509,
+ "step": 420
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8339354739381138e-05,
+ "loss": 0.6516,
+ "step": 421
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.833040707948395e-05,
+ "loss": 0.7115,
+ "step": 422
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8321437574065347e-05,
+ "loss": 0.6653,
+ "step": 423
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.831244624664681e-05,
+ "loss": 0.6984,
+ "step": 424
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8303433120807043e-05,
+ "loss": 0.6648,
+ "step": 425
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.829439822018192e-05,
+ "loss": 0.6358,
+ "step": 426
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8285341568464416e-05,
+ "loss": 0.7535,
+ "step": 427
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.827626318940454e-05,
+ "loss": 0.6195,
+ "step": 428
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8267163106809288e-05,
+ "loss": 0.6978,
+ "step": 429
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 1.8258041344542567e-05,
+ "loss": 0.6611,
+ "step": 430
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.824889792652513e-05,
+ "loss": 0.6458,
+ "step": 431
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8239732876734525e-05,
+ "loss": 0.68,
+ "step": 432
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8230546219205032e-05,
+ "loss": 0.6779,
+ "step": 433
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.822133797802758e-05,
+ "loss": 0.719,
+ "step": 434
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8212108177349722e-05,
+ "loss": 0.6448,
+ "step": 435
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8202856841375517e-05,
+ "loss": 0.7534,
+ "step": 436
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.819358399436553e-05,
+ "loss": 0.6638,
+ "step": 437
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8184289660636715e-05,
+ "loss": 0.7415,
+ "step": 438
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.817497386456238e-05,
+ "loss": 0.6746,
+ "step": 439
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.816563663057211e-05,
+ "loss": 0.6855,
+ "step": 440
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.815627798315172e-05,
+ "loss": 0.7072,
+ "step": 441
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8146897946843162e-05,
+ "loss": 0.7095,
+ "step": 442
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.81374965462445e-05,
+ "loss": 0.6492,
+ "step": 443
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.81280738060098e-05,
+ "loss": 0.6364,
+ "step": 444
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8118629750849106e-05,
+ "loss": 0.7134,
+ "step": 445
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.810916440552835e-05,
+ "loss": 0.6878,
+ "step": 446
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8099677794869297e-05,
+ "loss": 0.6927,
+ "step": 447
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.8090169943749477e-05,
+ "loss": 0.6148,
+ "step": 448
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 1.808064087710212e-05,
+ "loss": 0.6622,
+ "step": 449
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.8071090619916095e-05,
+ "loss": 0.7079,
+ "step": 450
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.8061519197235835e-05,
+ "loss": 0.6797,
+ "step": 451
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.8051926634161282e-05,
+ "loss": 0.6875,
+ "step": 452
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.804231295584782e-05,
+ "loss": 0.6777,
+ "step": 453
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.8032678187506187e-05,
+ "loss": 0.6696,
+ "step": 454
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.802302235440245e-05,
+ "loss": 0.7006,
+ "step": 455
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.8013345481857903e-05,
+ "loss": 0.6464,
+ "step": 456
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.8003647595249016e-05,
+ "loss": 0.7272,
+ "step": 457
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.799392872000736e-05,
+ "loss": 0.6675,
+ "step": 458
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.7984188881619563e-05,
+ "loss": 0.6513,
+ "step": 459
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.797442810562721e-05,
+ "loss": 0.707,
+ "step": 460
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.79646464176268e-05,
+ "loss": 0.6863,
+ "step": 461
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.7954843843269665e-05,
+ "loss": 0.6823,
+ "step": 462
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.794502040826192e-05,
+ "loss": 0.6794,
+ "step": 463
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.793517613836437e-05,
+ "loss": 0.6876,
+ "step": 464
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.7925311059392472e-05,
+ "loss": 0.6972,
+ "step": 465
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.7915425197216246e-05,
+ "loss": 0.6594,
+ "step": 466
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.7905518577760207e-05,
+ "loss": 0.6748,
+ "step": 467
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.7895591227003316e-05,
+ "loss": 0.7027,
+ "step": 468
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.788564317097889e-05,
+ "loss": 0.6533,
+ "step": 469
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 1.7875674435774546e-05,
+ "loss": 0.7014,
+ "step": 470
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.786568504753213e-05,
+ "loss": 0.6529,
+ "step": 471
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7855675032447648e-05,
+ "loss": 0.6857,
+ "step": 472
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.78456444167712e-05,
+ "loss": 0.6052,
+ "step": 473
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7835593226806902e-05,
+ "loss": 0.6513,
+ "step": 474
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7825521488912833e-05,
+ "loss": 0.6751,
+ "step": 475
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7815429229500946e-05,
+ "loss": 0.6228,
+ "step": 476
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7805316475037016e-05,
+ "loss": 0.7381,
+ "step": 477
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7795183252040568e-05,
+ "loss": 0.7245,
+ "step": 478
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7785029587084793e-05,
+ "loss": 0.6885,
+ "step": 479
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7774855506796497e-05,
+ "loss": 0.7099,
+ "step": 480
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7764661037856013e-05,
+ "loss": 0.7003,
+ "step": 481
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7754446206997152e-05,
+ "loss": 0.6168,
+ "step": 482
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.774421104100712e-05,
+ "loss": 0.6982,
+ "step": 483
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7733955566726438e-05,
+ "loss": 0.705,
+ "step": 484
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7723679811048904e-05,
+ "loss": 0.6737,
+ "step": 485
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.771338380092148e-05,
+ "loss": 0.649,
+ "step": 486
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7703067563344252e-05,
+ "loss": 0.6895,
+ "step": 487
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.7692731125370355e-05,
+ "loss": 0.724,
+ "step": 488
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.768237451410589e-05,
+ "loss": 0.6591,
+ "step": 489
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 1.767199775670986e-05,
+ "loss": 0.6185,
+ "step": 490
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7661600880394113e-05,
+ "loss": 0.634,
+ "step": 491
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7651183912423228e-05,
+ "loss": 0.7616,
+ "step": 492
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7640746880114505e-05,
+ "loss": 0.6675,
+ "step": 493
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7630289810837836e-05,
+ "loss": 0.6559,
+ "step": 494
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7619812732015664e-05,
+ "loss": 0.6797,
+ "step": 495
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7609315671122912e-05,
+ "loss": 0.6389,
+ "step": 496
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.75987986556869e-05,
+ "loss": 0.6548,
+ "step": 497
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.758826171328727e-05,
+ "loss": 0.641,
+ "step": 498
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7577704871555924e-05,
+ "loss": 0.6575,
+ "step": 499
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7567128158176955e-05,
+ "loss": 0.7441,
+ "step": 500
+ },
+ {
+ "epoch": 0.25,
+ "eval_code_gate_load": [
+ 211.7,
+ 175.45,
+ 182.4,
+ 153.8,
+ 175.7,
+ 182.45,
+ 190.25,
+ 168.25
+ ],
+ "eval_code_loss": 0.48564451932907104,
+ "eval_code_runtime": 1.7819,
+ "eval_code_samples_per_second": 561.202,
+ "eval_code_steps_per_second": 35.356,
+ "step": 500
+ },
+ {
+ "epoch": 0.25,
+ "eval_orca_gate_load": [
+ 510.85,
+ 348.25,
+ 399.85,
+ 394.75,
+ 347.0,
+ 427.2,
+ 372.4,
+ 342.0
+ ],
+ "eval_orca_loss": 0.7276366949081421,
+ "eval_orca_runtime": 2.0064,
+ "eval_orca_samples_per_second": 498.405,
+ "eval_orca_steps_per_second": 31.399,
+ "step": 500
+ },
+ {
+ "epoch": 0.25,
+ "eval_math_gate_load": [
+ 323.9,
+ 211.65,
+ 228.5,
+ 240.05,
+ 256.55,
+ 243.35,
+ 275.65,
+ 253.05
+ ],
+ "eval_math_loss": 0.6341797113418579,
+ "eval_math_runtime": 1.8482,
+ "eval_math_samples_per_second": 541.069,
+ "eval_math_steps_per_second": 34.087,
+ "step": 500
+ },
+ {
+ "epoch": 0.25,
+ "eval_sharegpt_gate_load": [
+ 1535.15,
+ 1108.85,
+ 1287.05,
+ 1205.95,
+ 1139.5,
+ 1344.0,
+ 1206.15,
+ 1084.95
+ ],
+ "eval_sharegpt_loss": 0.702929675579071,
+ "eval_sharegpt_runtime": 3.0148,
+ "eval_sharegpt_samples_per_second": 331.693,
+ "eval_sharegpt_steps_per_second": 20.897,
+ "step": 500
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7556531600886554e-05,
+ "loss": 0.6686,
+ "step": 501
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7545915227472967e-05,
+ "loss": 0.6791,
+ "step": 502
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.753527906577638e-05,
+ "loss": 0.6089,
+ "step": 503
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7524623143688905e-05,
+ "loss": 0.6985,
+ "step": 504
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7513947489154443e-05,
+ "loss": 0.6266,
+ "step": 505
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.7503252130168657e-05,
+ "loss": 0.653,
+ "step": 506
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.749253709477888e-05,
+ "loss": 0.6769,
+ "step": 507
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.748180241108404e-05,
+ "loss": 0.7099,
+ "step": 508
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 1.74710481072346e-05,
+ "loss": 0.6426,
+ "step": 509
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7460274211432463e-05,
+ "loss": 0.6726,
+ "step": 510
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7449480751930915e-05,
+ "loss": 0.6101,
+ "step": 511
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7438667757034547e-05,
+ "loss": 0.7545,
+ "step": 512
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7427835255099173e-05,
+ "loss": 0.6404,
+ "step": 513
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7416983274531777e-05,
+ "loss": 0.6997,
+ "step": 514
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.74061118437904e-05,
+ "loss": 0.636,
+ "step": 515
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.739522099138411e-05,
+ "loss": 0.6581,
+ "step": 516
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7384310745872896e-05,
+ "loss": 0.6526,
+ "step": 517
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7373381135867605e-05,
+ "loss": 0.6856,
+ "step": 518
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7362432190029862e-05,
+ "loss": 0.6896,
+ "step": 519
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7351463937072008e-05,
+ "loss": 0.6557,
+ "step": 520
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7340476405757e-05,
+ "loss": 0.5884,
+ "step": 521
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.732946962489836e-05,
+ "loss": 0.7313,
+ "step": 522
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7318443623360092e-05,
+ "loss": 0.6795,
+ "step": 523
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7307398430056595e-05,
+ "loss": 0.6634,
+ "step": 524
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7296334073952606e-05,
+ "loss": 0.6827,
+ "step": 525
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.72852505840631e-05,
+ "loss": 0.5858,
+ "step": 526
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7274147989453246e-05,
+ "loss": 0.6678,
+ "step": 527
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.72630263192383e-05,
+ "loss": 0.6171,
+ "step": 528
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 1.7251885602583547e-05,
+ "loss": 0.7119,
+ "step": 529
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7240725868704218e-05,
+ "loss": 0.6961,
+ "step": 530
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.722954714686541e-05,
+ "loss": 0.6365,
+ "step": 531
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7218349466382024e-05,
+ "loss": 0.6798,
+ "step": 532
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7207132856618668e-05,
+ "loss": 0.6779,
+ "step": 533
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.719589734698959e-05,
+ "loss": 0.7134,
+ "step": 534
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.718464296695861e-05,
+ "loss": 0.6633,
+ "step": 535
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7173369746039026e-05,
+ "loss": 0.6756,
+ "step": 536
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7162077713793547e-05,
+ "loss": 0.6724,
+ "step": 537
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7150766899834205e-05,
+ "loss": 0.7333,
+ "step": 538
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7139437333822303e-05,
+ "loss": 0.719,
+ "step": 539
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7128089045468294e-05,
+ "loss": 0.6325,
+ "step": 540
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.711672206453175e-05,
+ "loss": 0.6804,
+ "step": 541
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7105336420821247e-05,
+ "loss": 0.6392,
+ "step": 542
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.709393214419431e-05,
+ "loss": 0.6337,
+ "step": 543
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7082509264557333e-05,
+ "loss": 0.6137,
+ "step": 544
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7071067811865477e-05,
+ "loss": 0.6187,
+ "step": 545
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.705960781612262e-05,
+ "loss": 0.6155,
+ "step": 546
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7048129307381266e-05,
+ "loss": 0.7153,
+ "step": 547
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7036632315742464e-05,
+ "loss": 0.6237,
+ "step": 548
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 1.7025116871355737e-05,
+ "loss": 0.6675,
+ "step": 549
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.7013583004418994e-05,
+ "loss": 0.6524,
+ "step": 550
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.7002030745178455e-05,
+ "loss": 0.6712,
+ "step": 551
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6990460123928577e-05,
+ "loss": 0.7206,
+ "step": 552
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6978871171011963e-05,
+ "loss": 0.6446,
+ "step": 553
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.696726391681929e-05,
+ "loss": 0.7012,
+ "step": 554
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.695563839178923e-05,
+ "loss": 0.6295,
+ "step": 555
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6943994626408365e-05,
+ "loss": 0.6708,
+ "step": 556
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6932332651211115e-05,
+ "loss": 0.6714,
+ "step": 557
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.692065249677965e-05,
+ "loss": 0.6664,
+ "step": 558
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6908954193743816e-05,
+ "loss": 0.6241,
+ "step": 559
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6897237772781046e-05,
+ "loss": 0.5995,
+ "step": 560
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6885503264616282e-05,
+ "loss": 0.6553,
+ "step": 561
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6873750700021917e-05,
+ "loss": 0.6247,
+ "step": 562
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.686198010981767e-05,
+ "loss": 0.6658,
+ "step": 563
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6850191524870548e-05,
+ "loss": 0.6658,
+ "step": 564
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6838384976094738e-05,
+ "loss": 0.5976,
+ "step": 565
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.682656049445154e-05,
+ "loss": 0.6962,
+ "step": 566
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6814718110949274e-05,
+ "loss": 0.618,
+ "step": 567
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6802857856643214e-05,
+ "loss": 0.6911,
+ "step": 568
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6790979762635497e-05,
+ "loss": 0.6263,
+ "step": 569
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 1.6779083860075032e-05,
+ "loss": 0.7031,
+ "step": 570
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6767170180157442e-05,
+ "loss": 0.6945,
+ "step": 571
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6755238754124965e-05,
+ "loss": 0.5995,
+ "step": 572
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.674328961326637e-05,
+ "loss": 0.6473,
+ "step": 573
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6731322788916892e-05,
+ "loss": 0.6429,
+ "step": 574
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6719338312458123e-05,
+ "loss": 0.6871,
+ "step": 575
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6707336215317968e-05,
+ "loss": 0.7102,
+ "step": 576
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6695316528970517e-05,
+ "loss": 0.6381,
+ "step": 577
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6683279284936004e-05,
+ "loss": 0.6506,
+ "step": 578
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6671224514780692e-05,
+ "loss": 0.6909,
+ "step": 579
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.665915225011681e-05,
+ "loss": 0.6609,
+ "step": 580
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6647062522602474e-05,
+ "loss": 0.6844,
+ "step": 581
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6634955363941573e-05,
+ "loss": 0.639,
+ "step": 582
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.662283080588373e-05,
+ "loss": 0.6377,
+ "step": 583
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6610688880224178e-05,
+ "loss": 0.6264,
+ "step": 584
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.65985296188037e-05,
+ "loss": 0.5948,
+ "step": 585
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6586353053508548e-05,
+ "loss": 0.6641,
+ "step": 586
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.657415921627034e-05,
+ "loss": 0.6331,
+ "step": 587
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6561948139065997e-05,
+ "loss": 0.6469,
+ "step": 588
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.654971985391764e-05,
+ "loss": 0.6639,
+ "step": 589
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 1.6537474392892527e-05,
+ "loss": 0.6445,
+ "step": 590
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6525211788102946e-05,
+ "loss": 0.6451,
+ "step": 591
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6512932071706153e-05,
+ "loss": 0.6608,
+ "step": 592
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6500635275904274e-05,
+ "loss": 0.6746,
+ "step": 593
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6488321432944218e-05,
+ "loss": 0.6803,
+ "step": 594
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6475990575117603e-05,
+ "loss": 0.6298,
+ "step": 595
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.646364273476067e-05,
+ "loss": 0.6742,
+ "step": 596
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6451277944254186e-05,
+ "loss": 0.6271,
+ "step": 597
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6438896236023374e-05,
+ "loss": 0.6675,
+ "step": 598
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6426497642537826e-05,
+ "loss": 0.5843,
+ "step": 599
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6414082196311402e-05,
+ "loss": 0.6476,
+ "step": 600
+ },
+ {
+ "epoch": 0.3,
+ "eval_code_gate_load": [
+ 205.25,
+ 186.05,
+ 177.25,
+ 157.05,
+ 177.4,
+ 178.15,
+ 188.5,
+ 170.35
+ ],
+ "eval_code_loss": 0.47382813692092896,
+ "eval_code_runtime": 1.7893,
+ "eval_code_samples_per_second": 558.879,
+ "eval_code_steps_per_second": 35.209,
+ "step": 600
+ },
+ {
+ "epoch": 0.3,
+ "eval_orca_gate_load": [
+ 503.95,
+ 352.45,
+ 398.2,
+ 397.5,
+ 349.4,
+ 424.3,
+ 374.4,
+ 342.1
+ ],
+ "eval_orca_loss": 0.7177734375,
+ "eval_orca_runtime": 2.0043,
+ "eval_orca_samples_per_second": 498.93,
+ "eval_orca_steps_per_second": 31.433,
+ "step": 600
+ },
+ {
+ "epoch": 0.3,
+ "eval_math_gate_load": [
+ 314.1,
+ 223.2,
+ 230.75,
+ 237.9,
+ 259.05,
+ 238.65,
+ 276.2,
+ 252.85
+ ],
+ "eval_math_loss": 0.45869141817092896,
+ "eval_math_runtime": 1.8415,
+ "eval_math_samples_per_second": 543.045,
+ "eval_math_steps_per_second": 34.212,
+ "step": 600
+ },
+ {
+ "epoch": 0.3,
+ "eval_sharegpt_gate_load": [
+ 1523.65,
+ 1118.3,
+ 1287.95,
+ 1209.65,
+ 1143.35,
+ 1350.6,
+ 1193.3,
+ 1084.8
+ ],
+ "eval_sharegpt_loss": 0.6988281011581421,
+ "eval_sharegpt_runtime": 2.9965,
+ "eval_sharegpt_samples_per_second": 333.722,
+ "eval_sharegpt_steps_per_second": 21.024,
+ "step": 600
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.640164992990216e-05,
+ "loss": 0.6594,
+ "step": 601
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.638920087591228e-05,
+ "loss": 0.6526,
+ "step": 602
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.637673506698794e-05,
+ "loss": 0.6331,
+ "step": 603
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6364252535819284e-05,
+ "loss": 0.6393,
+ "step": 604
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6351753315140285e-05,
+ "loss": 0.6594,
+ "step": 605
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.63392374377287e-05,
+ "loss": 0.6047,
+ "step": 606
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6326704936405953e-05,
+ "loss": 0.6231,
+ "step": 607
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6314155844037074e-05,
+ "loss": 0.646,
+ "step": 608
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6301590193530585e-05,
+ "loss": 0.6644,
+ "step": 609
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 1.6289008017838447e-05,
+ "loss": 0.6033,
+ "step": 610
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6276409349955945e-05,
+ "loss": 0.6628,
+ "step": 611
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.626379422292162e-05,
+ "loss": 0.6536,
+ "step": 612
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6251162669817172e-05,
+ "loss": 0.6094,
+ "step": 613
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6238514723767372e-05,
+ "loss": 0.6263,
+ "step": 614
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.622585041793999e-05,
+ "loss": 0.5747,
+ "step": 615
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6213169785545688e-05,
+ "loss": 0.5773,
+ "step": 616
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6200472859837946e-05,
+ "loss": 0.6433,
+ "step": 617
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6187759674112972e-05,
+ "loss": 0.63,
+ "step": 618
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6175030261709615e-05,
+ "loss": 0.6301,
+ "step": 619
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6162284656009276e-05,
+ "loss": 0.6516,
+ "step": 620
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6149522890435815e-05,
+ "loss": 0.6746,
+ "step": 621
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6136744998455477e-05,
+ "loss": 0.5876,
+ "step": 622
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6123951013576796e-05,
+ "loss": 0.6151,
+ "step": 623
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6111140969350504e-05,
+ "loss": 0.6078,
+ "step": 624
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6098314899369446e-05,
+ "loss": 0.564,
+ "step": 625
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6085472837268504e-05,
+ "loss": 0.6233,
+ "step": 626
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.607261481672448e-05,
+ "loss": 0.6358,
+ "step": 627
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6059740871456035e-05,
+ "loss": 0.6307,
+ "step": 628
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 1.6046851035223594e-05,
+ "loss": 0.6252,
+ "step": 629
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.603394534182925e-05,
+ "loss": 0.6263,
+ "step": 630
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.6021023825116672e-05,
+ "loss": 0.6303,
+ "step": 631
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.6008086518971037e-05,
+ "loss": 0.6099,
+ "step": 632
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.599513345731892e-05,
+ "loss": 0.6102,
+ "step": 633
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.598216467412822e-05,
+ "loss": 0.6181,
+ "step": 634
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5969180203408052e-05,
+ "loss": 0.6137,
+ "step": 635
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5956180079208684e-05,
+ "loss": 0.6133,
+ "step": 636
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5943164335621418e-05,
+ "loss": 0.6424,
+ "step": 637
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.593013300677853e-05,
+ "loss": 0.6082,
+ "step": 638
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.591708612685316e-05,
+ "loss": 0.575,
+ "step": 639
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5904023730059227e-05,
+ "loss": 0.6683,
+ "step": 640
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5890945850651347e-05,
+ "loss": 0.6528,
+ "step": 641
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5877852522924733e-05,
+ "loss": 0.6372,
+ "step": 642
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.586474378121511e-05,
+ "loss": 0.6389,
+ "step": 643
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5851619659898623e-05,
+ "loss": 0.6056,
+ "step": 644
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5838480193391753e-05,
+ "loss": 0.5766,
+ "step": 645
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.582532541615122e-05,
+ "loss": 0.6306,
+ "step": 646
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5812155362673895e-05,
+ "loss": 0.6092,
+ "step": 647
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.57989700674967e-05,
+ "loss": 0.6193,
+ "step": 648
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 1.5785769565196543e-05,
+ "loss": 0.693,
+ "step": 649
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5772553890390196e-05,
+ "loss": 0.6228,
+ "step": 650
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5759323077734233e-05,
+ "loss": 0.6016,
+ "step": 651
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5746077161924905e-05,
+ "loss": 0.6191,
+ "step": 652
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5732816177698097e-05,
+ "loss": 0.5908,
+ "step": 653
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5719540159829185e-05,
+ "loss": 0.6295,
+ "step": 654
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5706249143132982e-05,
+ "loss": 0.5966,
+ "step": 655
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5692943162463628e-05,
+ "loss": 0.571,
+ "step": 656
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5679622252714507e-05,
+ "loss": 0.6001,
+ "step": 657
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5666286448818152e-05,
+ "loss": 0.6675,
+ "step": 658
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.565293578574615e-05,
+ "loss": 0.5932,
+ "step": 659
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5639570298509067e-05,
+ "loss": 0.6454,
+ "step": 660
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5626190022156328e-05,
+ "loss": 0.6011,
+ "step": 661
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5612794991776147e-05,
+ "loss": 0.5868,
+ "step": 662
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5599385242495437e-05,
+ "loss": 0.5421,
+ "step": 663
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5585960809479698e-05,
+ "loss": 0.6044,
+ "step": 664
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5572521727932937e-05,
+ "loss": 0.5905,
+ "step": 665
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5559068033097583e-05,
+ "loss": 0.6102,
+ "step": 666
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.554559976025438e-05,
+ "loss": 0.5814,
+ "step": 667
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5532116944722308e-05,
+ "loss": 0.6065,
+ "step": 668
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 1.5518619621858474e-05,
+ "loss": 0.5764,
+ "step": 669
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5505107827058038e-05,
+ "loss": 0.5827,
+ "step": 670
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5491581595754102e-05,
+ "loss": 0.5963,
+ "step": 671
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.547804096341763e-05,
+ "loss": 0.5861,
+ "step": 672
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.546448596555736e-05,
+ "loss": 0.6071,
+ "step": 673
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5450916637719683e-05,
+ "loss": 0.6061,
+ "step": 674
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5437333015488586e-05,
+ "loss": 0.6383,
+ "step": 675
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5423735134485537e-05,
+ "loss": 0.5513,
+ "step": 676
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5410123030369387e-05,
+ "loss": 0.6036,
+ "step": 677
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5396496738836292e-05,
+ "loss": 0.6438,
+ "step": 678
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5382856295619622e-05,
+ "loss": 0.6068,
+ "step": 679
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.536920173648984e-05,
+ "loss": 0.6384,
+ "step": 680
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.535553309725444e-05,
+ "loss": 0.5917,
+ "step": 681
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5341850413757834e-05,
+ "loss": 0.5726,
+ "step": 682
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.532815372188126e-05,
+ "loss": 0.5571,
+ "step": 683
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5314443057542703e-05,
+ "loss": 0.6066,
+ "step": 684
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.530071845669678e-05,
+ "loss": 0.5798,
+ "step": 685
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5286979955334655e-05,
+ "loss": 0.6412,
+ "step": 686
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5273227589483945e-05,
+ "loss": 0.5704,
+ "step": 687
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5259461395208628e-05,
+ "loss": 0.6293,
+ "step": 688
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.5245681408608946e-05,
+ "loss": 0.541,
+ "step": 689
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 1.52318876658213e-05,
+ "loss": 0.6042,
+ "step": 690
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5218080203018181e-05,
+ "loss": 0.5809,
+ "step": 691
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5204259056408046e-05,
+ "loss": 0.5862,
+ "step": 692
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5190424262235241e-05,
+ "loss": 0.5623,
+ "step": 693
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5176575856779904e-05,
+ "loss": 0.5797,
+ "step": 694
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.516271387635786e-05,
+ "loss": 0.6085,
+ "step": 695
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5148838357320537e-05,
+ "loss": 0.5822,
+ "step": 696
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5134949336054866e-05,
+ "loss": 0.5458,
+ "step": 697
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.512104684898319e-05,
+ "loss": 0.6001,
+ "step": 698
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5107130932563151e-05,
+ "loss": 0.5995,
+ "step": 699
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5093201623287631e-05,
+ "loss": 0.6145,
+ "step": 700
+ },
+ {
+ "epoch": 0.35,
+ "eval_code_gate_load": [
+ 209.05,
+ 177.55,
+ 174.45,
+ 152.3,
+ 174.4,
+ 187.35,
+ 191.25,
+ 173.65
+ ],
+ "eval_code_loss": 0.34501951932907104,
+ "eval_code_runtime": 1.7766,
+ "eval_code_samples_per_second": 562.88,
+ "eval_code_steps_per_second": 35.461,
+ "step": 700
+ },
+ {
+ "epoch": 0.35,
+ "eval_orca_gate_load": [
+ 509.4,
+ 346.8,
+ 397.0,
+ 390.9,
+ 346.15,
+ 430.65,
+ 368.2,
+ 353.2
+ ],
+ "eval_orca_loss": 0.4732421934604645,
+ "eval_orca_runtime": 1.9952,
+ "eval_orca_samples_per_second": 501.201,
+ "eval_orca_steps_per_second": 31.576,
+ "step": 700
+ },
+ {
+ "epoch": 0.35,
+ "eval_math_gate_load": [
+ 328.55,
+ 214.9,
+ 226.85,
+ 225.9,
+ 251.3,
+ 251.0,
+ 278.75,
+ 255.45
+ ],
+ "eval_math_loss": 0.45625001192092896,
+ "eval_math_runtime": 1.8447,
+ "eval_math_samples_per_second": 542.082,
+ "eval_math_steps_per_second": 34.151,
+ "step": 700
+ },
+ {
+ "epoch": 0.35,
+ "eval_sharegpt_gate_load": [
+ 1548.45,
+ 1089.65,
+ 1285.75,
+ 1191.1,
+ 1132.35,
+ 1374.15,
+ 1192.4,
+ 1097.75
+ ],
+ "eval_sharegpt_loss": 0.689746081829071,
+ "eval_sharegpt_runtime": 2.9874,
+ "eval_sharegpt_samples_per_second": 334.74,
+ "eval_sharegpt_steps_per_second": 21.089,
+ "step": 700
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.507925895768461e-05,
+ "loss": 0.5587,
+ "step": 701
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5065302972317108e-05,
+ "loss": 0.5865,
+ "step": 702
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5051333703783069e-05,
+ "loss": 0.5186,
+ "step": 703
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5037351188715265e-05,
+ "loss": 0.6164,
+ "step": 704
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5023355463781221e-05,
+ "loss": 0.586,
+ "step": 705
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.5009346565683088e-05,
+ "loss": 0.5101,
+ "step": 706
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.4995324531157569e-05,
+ "loss": 0.5553,
+ "step": 707
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.4981289396975818e-05,
+ "loss": 0.5443,
+ "step": 708
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.4967241199943332e-05,
+ "loss": 0.589,
+ "step": 709
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.4953179976899878e-05,
+ "loss": 0.6207,
+ "step": 710
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4939105764719369e-05,
+ "loss": 0.5548,
+ "step": 711
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4925018600309784e-05,
+ "loss": 0.5938,
+ "step": 712
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4910918520613074e-05,
+ "loss": 0.5599,
+ "step": 713
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4896805562605052e-05,
+ "loss": 0.5786,
+ "step": 714
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4882679763295307e-05,
+ "loss": 0.5052,
+ "step": 715
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4868541159727097e-05,
+ "loss": 0.5806,
+ "step": 716
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4854389788977266e-05,
+ "loss": 0.5824,
+ "step": 717
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4840225688156132e-05,
+ "loss": 0.5878,
+ "step": 718
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4826048894407396e-05,
+ "loss": 0.5256,
+ "step": 719
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4811859444908053e-05,
+ "loss": 0.5338,
+ "step": 720
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4797657376868273e-05,
+ "loss": 0.5115,
+ "step": 721
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4783442727531328e-05,
+ "loss": 0.5406,
+ "step": 722
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4769215534173476e-05,
+ "loss": 0.5402,
+ "step": 723
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4754975834103877e-05,
+ "loss": 0.5703,
+ "step": 724
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4740723664664483e-05,
+ "loss": 0.5609,
+ "step": 725
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4726459063229946e-05,
+ "loss": 0.5399,
+ "step": 726
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4712182067207516e-05,
+ "loss": 0.5649,
+ "step": 727
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4697892714036959e-05,
+ "loss": 0.5274,
+ "step": 728
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4683591041190433e-05,
+ "loss": 0.5253,
+ "step": 729
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 1.4669277086172406e-05,
+ "loss": 0.4835,
+ "step": 730
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4654950886519563e-05,
+ "loss": 0.5794,
+ "step": 731
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4640612479800686e-05,
+ "loss": 0.521,
+ "step": 732
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4626261903616579e-05,
+ "loss": 0.5273,
+ "step": 733
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4611899195599952e-05,
+ "loss": 0.5263,
+ "step": 734
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4597524393415336e-05,
+ "loss": 0.5614,
+ "step": 735
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4583137534758968e-05,
+ "loss": 0.5693,
+ "step": 736
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4568738657358715e-05,
+ "loss": 0.5616,
+ "step": 737
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.455432779897395e-05,
+ "loss": 0.5403,
+ "step": 738
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4539904997395468e-05,
+ "loss": 0.5085,
+ "step": 739
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4525470290445392e-05,
+ "loss": 0.5168,
+ "step": 740
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4511023715977048e-05,
+ "loss": 0.587,
+ "step": 741
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4496565311874902e-05,
+ "loss": 0.5324,
+ "step": 742
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4482095116054421e-05,
+ "loss": 0.4944,
+ "step": 743
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4467613166462024e-05,
+ "loss": 0.5339,
+ "step": 744
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4453119501074924e-05,
+ "loss": 0.5517,
+ "step": 745
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4438614157901073e-05,
+ "loss": 0.5189,
+ "step": 746
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4424097174979038e-05,
+ "loss": 0.535,
+ "step": 747
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4409568590377918e-05,
+ "loss": 0.5303,
+ "step": 748
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 1.4395028442197231e-05,
+ "loss": 0.5561,
+ "step": 749
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4380476768566825e-05,
+ "loss": 0.5305,
+ "step": 750
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4365913607646762e-05,
+ "loss": 0.496,
+ "step": 751
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4351338997627233e-05,
+ "loss": 0.5268,
+ "step": 752
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.433675297672846e-05,
+ "loss": 0.5,
+ "step": 753
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4322155583200577e-05,
+ "loss": 0.517,
+ "step": 754
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4307546855323549e-05,
+ "loss": 0.5244,
+ "step": 755
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.429292683140706e-05,
+ "loss": 0.4792,
+ "step": 756
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4278295549790419e-05,
+ "loss": 0.5154,
+ "step": 757
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4263653048842461e-05,
+ "loss": 0.4767,
+ "step": 758
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.424899936696143e-05,
+ "loss": 0.4858,
+ "step": 759
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4234334542574906e-05,
+ "loss": 0.4936,
+ "step": 760
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4219658614139674e-05,
+ "loss": 0.4957,
+ "step": 761
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4204971620141648e-05,
+ "loss": 0.5789,
+ "step": 762
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4190273599095761e-05,
+ "loss": 0.5306,
+ "step": 763
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4175564589545853e-05,
+ "loss": 0.4988,
+ "step": 764
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4160844630064596e-05,
+ "loss": 0.5283,
+ "step": 765
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4146113759253362e-05,
+ "loss": 0.5026,
+ "step": 766
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4131372015742141e-05,
+ "loss": 0.5237,
+ "step": 767
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.411661943818944e-05,
+ "loss": 0.5602,
+ "step": 768
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 1.4101856065282174e-05,
+ "loss": 0.528,
+ "step": 769
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.4087081935735565e-05,
+ "loss": 0.5042,
+ "step": 770
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.4072297088293043e-05,
+ "loss": 0.4794,
+ "step": 771
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.4057501561726157e-05,
+ "loss": 0.5531,
+ "step": 772
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.4042695394834435e-05,
+ "loss": 0.4915,
+ "step": 773
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.4027878626445339e-05,
+ "loss": 0.4861,
+ "step": 774
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.4013051295414108e-05,
+ "loss": 0.4889,
+ "step": 775
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3998213440623691e-05,
+ "loss": 0.5035,
+ "step": 776
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3983365100984633e-05,
+ "loss": 0.5034,
+ "step": 777
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3968506315434973e-05,
+ "loss": 0.4847,
+ "step": 778
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3953637122940147e-05,
+ "loss": 0.504,
+ "step": 779
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3938757562492873e-05,
+ "loss": 0.483,
+ "step": 780
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3923867673113067e-05,
+ "loss": 0.5039,
+ "step": 781
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.390896749384773e-05,
+ "loss": 0.4807,
+ "step": 782
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3894057063770841e-05,
+ "loss": 0.513,
+ "step": 783
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3879136421983265e-05,
+ "loss": 0.4903,
+ "step": 784
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3864205607612648e-05,
+ "loss": 0.5104,
+ "step": 785
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3849264659813314e-05,
+ "loss": 0.4922,
+ "step": 786
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3834313617766146e-05,
+ "loss": 0.5198,
+ "step": 787
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.3819352520678519e-05,
+ "loss": 0.4577,
+ "step": 788
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 1.380438140778416e-05,
+ "loss": 0.4697,
+ "step": 789
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.378940031834307e-05,
+ "loss": 0.4832,
+ "step": 790
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3774409291641407e-05,
+ "loss": 0.4664,
+ "step": 791
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3759408366991391e-05,
+ "loss": 0.492,
+ "step": 792
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3744397583731204e-05,
+ "loss": 0.496,
+ "step": 793
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3729376981224869e-05,
+ "loss": 0.498,
+ "step": 794
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3714346598862168e-05,
+ "loss": 0.5533,
+ "step": 795
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3699306476058523e-05,
+ "loss": 0.4858,
+ "step": 796
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3684256652254906e-05,
+ "loss": 0.511,
+ "step": 797
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3669197166917723e-05,
+ "loss": 0.5326,
+ "step": 798
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.365412805953872e-05,
+ "loss": 0.5154,
+ "step": 799
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3639049369634878e-05,
+ "loss": 0.5222,
+ "step": 800
+ },
+ {
+ "epoch": 0.4,
+ "eval_code_gate_load": [
+ 212.75,
+ 176.15,
+ 177.3,
+ 159.4,
+ 177.95,
+ 178.35,
+ 186.4,
+ 171.7
+ ],
+ "eval_code_loss": 0.35273438692092896,
+ "eval_code_runtime": 1.7754,
+ "eval_code_samples_per_second": 563.24,
+ "eval_code_steps_per_second": 35.484,
+ "step": 800
+ },
+ {
+ "epoch": 0.4,
+ "eval_orca_gate_load": [
+ 506.35,
+ 346.1,
+ 403.4,
+ 400.65,
+ 347.1,
+ 420.75,
+ 365.05,
+ 352.9
+ ],
+ "eval_orca_loss": 0.45512694120407104,
+ "eval_orca_runtime": 2.0061,
+ "eval_orca_samples_per_second": 498.484,
+ "eval_orca_steps_per_second": 31.405,
+ "step": 800
+ },
+ {
+ "epoch": 0.4,
+ "eval_math_gate_load": [
+ 326.15,
+ 213.85,
+ 229.75,
+ 231.9,
+ 256.15,
+ 245.8,
+ 275.0,
+ 254.1
+ ],
+ "eval_math_loss": 0.4581542909145355,
+ "eval_math_runtime": 1.8544,
+ "eval_math_samples_per_second": 539.26,
+ "eval_math_steps_per_second": 33.973,
+ "step": 800
+ },
+ {
+ "epoch": 0.4,
+ "eval_sharegpt_gate_load": [
+ 1533.15,
+ 1088.05,
+ 1310.2,
+ 1211.85,
+ 1135.75,
+ 1350.35,
+ 1168.4,
+ 1113.85
+ ],
+ "eval_sharegpt_loss": 0.5342773199081421,
+ "eval_sharegpt_runtime": 2.9927,
+ "eval_sharegpt_samples_per_second": 334.144,
+ "eval_sharegpt_steps_per_second": 21.051,
+ "step": 800
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3623961136748296e-05,
+ "loss": 0.5299,
+ "step": 801
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3608863400446113e-05,
+ "loss": 0.485,
+ "step": 802
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3593756200320373e-05,
+ "loss": 0.4949,
+ "step": 803
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.357863957598796e-05,
+ "loss": 0.5395,
+ "step": 804
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.356351356709045e-05,
+ "loss": 0.5138,
+ "step": 805
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3548378213294042e-05,
+ "loss": 0.5286,
+ "step": 806
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3533233554289433e-05,
+ "loss": 0.5311,
+ "step": 807
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3518079629791725e-05,
+ "loss": 0.5093,
+ "step": 808
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 1.3502916479540327e-05,
+ "loss": 0.4513,
+ "step": 809
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3487744143298822e-05,
+ "loss": 0.4592,
+ "step": 810
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3472562660854902e-05,
+ "loss": 0.4641,
+ "step": 811
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.345737207202023e-05,
+ "loss": 0.5504,
+ "step": 812
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3442172416630355e-05,
+ "loss": 0.5057,
+ "step": 813
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3426963734544601e-05,
+ "loss": 0.4988,
+ "step": 814
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3411746065645961e-05,
+ "loss": 0.4449,
+ "step": 815
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3396519449841006e-05,
+ "loss": 0.4707,
+ "step": 816
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3381283927059751e-05,
+ "loss": 0.476,
+ "step": 817
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3366039537255589e-05,
+ "loss": 0.5192,
+ "step": 818
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3350786320405145e-05,
+ "loss": 0.494,
+ "step": 819
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3335524316508208e-05,
+ "loss": 0.4846,
+ "step": 820
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3320253565587602e-05,
+ "loss": 0.4849,
+ "step": 821
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3304974107689088e-05,
+ "loss": 0.5147,
+ "step": 822
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.328968598288127e-05,
+ "loss": 0.5149,
+ "step": 823
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3274389231255466e-05,
+ "loss": 0.5002,
+ "step": 824
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3259083892925633e-05,
+ "loss": 0.5237,
+ "step": 825
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3243770008028225e-05,
+ "loss": 0.4745,
+ "step": 826
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3228447616722128e-05,
+ "loss": 0.5389,
+ "step": 827
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3213116759188525e-05,
+ "loss": 0.4719,
+ "step": 828
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.31977774756308e-05,
+ "loss": 0.5096,
+ "step": 829
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 1.3182429806274442e-05,
+ "loss": 0.5167,
+ "step": 830
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3167073791366915e-05,
+ "loss": 0.494,
+ "step": 831
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3151709471177589e-05,
+ "loss": 0.5393,
+ "step": 832
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3136336885997591e-05,
+ "loss": 0.5502,
+ "step": 833
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3120956076139746e-05,
+ "loss": 0.4955,
+ "step": 834
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3105567081938423e-05,
+ "loss": 0.5306,
+ "step": 835
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3090169943749475e-05,
+ "loss": 0.4821,
+ "step": 836
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3074764701950095e-05,
+ "loss": 0.4851,
+ "step": 837
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.305935139693874e-05,
+ "loss": 0.5209,
+ "step": 838
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3043930069134998e-05,
+ "loss": 0.4816,
+ "step": 839
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3028500758979507e-05,
+ "loss": 0.4333,
+ "step": 840
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.3013063506933838e-05,
+ "loss": 0.472,
+ "step": 841
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.299761835348038e-05,
+ "loss": 0.4837,
+ "step": 842
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.2982165339122248e-05,
+ "loss": 0.5189,
+ "step": 843
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.296670450438317e-05,
+ "loss": 0.5207,
+ "step": 844
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.2951235889807386e-05,
+ "loss": 0.4656,
+ "step": 845
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.2935759535959528e-05,
+ "loss": 0.537,
+ "step": 846
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.2920275483424538e-05,
+ "loss": 0.5239,
+ "step": 847
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.2904783772807534e-05,
+ "loss": 0.4885,
+ "step": 848
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.2889284444733722e-05,
+ "loss": 0.5081,
+ "step": 849
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 1.2873777539848284e-05,
+ "loss": 0.4915,
+ "step": 850
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2858263098816265e-05,
+ "loss": 0.4385,
+ "step": 851
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2842741162322487e-05,
+ "loss": 0.5104,
+ "step": 852
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.282721177107141e-05,
+ "loss": 0.5064,
+ "step": 853
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2811674965787058e-05,
+ "loss": 0.4651,
+ "step": 854
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.279613078721289e-05,
+ "loss": 0.5029,
+ "step": 855
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2780579276111702e-05,
+ "loss": 0.5042,
+ "step": 856
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.276502047326552e-05,
+ "loss": 0.5048,
+ "step": 857
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2749454419475486e-05,
+ "loss": 0.4466,
+ "step": 858
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.273388115556177e-05,
+ "loss": 0.5513,
+ "step": 859
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2718300722363431e-05,
+ "loss": 0.5231,
+ "step": 860
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2702713160738344e-05,
+ "loss": 0.4755,
+ "step": 861
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2687118511563075e-05,
+ "loss": 0.5436,
+ "step": 862
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2671516815732767e-05,
+ "loss": 0.4676,
+ "step": 863
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2655908114161053e-05,
+ "loss": 0.5156,
+ "step": 864
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2640292447779932e-05,
+ "loss": 0.4917,
+ "step": 865
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2624669857539669e-05,
+ "loss": 0.5152,
+ "step": 866
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2609040384408685e-05,
+ "loss": 0.4448,
+ "step": 867
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2593404069373452e-05,
+ "loss": 0.4678,
+ "step": 868
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2577760953438382e-05,
+ "loss": 0.4526,
+ "step": 869
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 1.2562111077625723e-05,
+ "loss": 0.4869,
+ "step": 870
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2546454482975454e-05,
+ "loss": 0.4524,
+ "step": 871
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2530791210545163e-05,
+ "loss": 0.4979,
+ "step": 872
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.251512130140996e-05,
+ "loss": 0.4931,
+ "step": 873
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2499444796662354e-05,
+ "loss": 0.4952,
+ "step": 874
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.248376173741215e-05,
+ "loss": 0.476,
+ "step": 875
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2468072164786342e-05,
+ "loss": 0.5432,
+ "step": 876
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2452376119929009e-05,
+ "loss": 0.5156,
+ "step": 877
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2436673644001196e-05,
+ "loss": 0.4924,
+ "step": 878
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2420964778180815e-05,
+ "loss": 0.4994,
+ "step": 879
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2405249563662539e-05,
+ "loss": 0.4219,
+ "step": 880
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2389528041657679e-05,
+ "loss": 0.4999,
+ "step": 881
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.23738002533941e-05,
+ "loss": 0.5012,
+ "step": 882
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2358066240116092e-05,
+ "loss": 0.4539,
+ "step": 883
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2342326043084268e-05,
+ "loss": 0.4784,
+ "step": 884
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2326579703575464e-05,
+ "loss": 0.5036,
+ "step": 885
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2310827262882614e-05,
+ "loss": 0.5389,
+ "step": 886
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2295068762314661e-05,
+ "loss": 0.5221,
+ "step": 887
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2279304243196438e-05,
+ "loss": 0.4863,
+ "step": 888
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 1.2263533746868552e-05,
+ "loss": 0.47,
+ "step": 889
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2247757314687296e-05,
+ "loss": 0.5165,
+ "step": 890
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2231974988024522e-05,
+ "loss": 0.4943,
+ "step": 891
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2216186808267544e-05,
+ "loss": 0.4758,
+ "step": 892
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2200392816819022e-05,
+ "loss": 0.4999,
+ "step": 893
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2184593055096853e-05,
+ "loss": 0.5106,
+ "step": 894
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2168787564534078e-05,
+ "loss": 0.476,
+ "step": 895
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.215297638657875e-05,
+ "loss": 0.5206,
+ "step": 896
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2137159562693839e-05,
+ "loss": 0.4682,
+ "step": 897
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2121337134357121e-05,
+ "loss": 0.5406,
+ "step": 898
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2105509143061072e-05,
+ "loss": 0.4805,
+ "step": 899
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2089675630312755e-05,
+ "loss": 0.5203,
+ "step": 900
+ },
+ {
+ "epoch": 0.45,
+ "eval_code_gate_load": [
+ 205.5,
+ 177.4,
+ 183.4,
+ 157.6,
+ 179.75,
+ 179.4,
+ 190.55,
+ 166.4
+ ],
+ "eval_code_loss": 0.35786134004592896,
+ "eval_code_runtime": 1.7808,
+ "eval_code_samples_per_second": 561.533,
+ "eval_code_steps_per_second": 35.377,
+ "step": 900
+ },
+ {
+ "epoch": 0.45,
+ "eval_orca_gate_load": [
+ 499.1,
+ 348.55,
+ 403.15,
+ 395.45,
+ 351.2,
+ 424.25,
+ 368.4,
+ 352.2
+ ],
+ "eval_orca_loss": 0.46074217557907104,
+ "eval_orca_runtime": 1.999,
+ "eval_orca_samples_per_second": 500.239,
+ "eval_orca_steps_per_second": 31.515,
+ "step": 900
+ },
+ {
+ "epoch": 0.45,
+ "eval_math_gate_load": [
+ 316.75,
+ 225.15,
+ 231.5,
+ 227.4,
+ 256.35,
+ 248.25,
+ 277.4,
+ 249.9
+ ],
+ "eval_math_loss": 0.4561523497104645,
+ "eval_math_runtime": 1.8388,
+ "eval_math_samples_per_second": 543.841,
+ "eval_math_steps_per_second": 34.262,
+ "step": 900
+ },
+ {
+ "epoch": 0.45,
+ "eval_sharegpt_gate_load": [
+ 1524.45,
+ 1103.4,
+ 1313.4,
+ 1203.9,
+ 1138.05,
+ 1354.3,
+ 1174.25,
+ 1099.85
+ ],
+ "eval_sharegpt_loss": 0.5375000238418579,
+ "eval_sharegpt_runtime": 2.9992,
+ "eval_sharegpt_samples_per_second": 333.421,
+ "eval_sharegpt_steps_per_second": 21.006,
+ "step": 900
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2073836637633705e-05,
+ "loss": 0.4337,
+ "step": 901
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2057992206559837e-05,
+ "loss": 0.4969,
+ "step": 902
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.204214237864133e-05,
+ "loss": 0.5045,
+ "step": 903
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2026287195442503e-05,
+ "loss": 0.4796,
+ "step": 904
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.2010426698541728e-05,
+ "loss": 0.5411,
+ "step": 905
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.199456092953131e-05,
+ "loss": 0.5126,
+ "step": 906
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.197868993001738e-05,
+ "loss": 0.5036,
+ "step": 907
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.1962813741619777e-05,
+ "loss": 0.4752,
+ "step": 908
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 1.194693240597196e-05,
+ "loss": 0.5218,
+ "step": 909
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1931045964720882e-05,
+ "loss": 0.4636,
+ "step": 910
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1915154459526876e-05,
+ "loss": 0.5427,
+ "step": 911
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.189925793206357e-05,
+ "loss": 0.5337,
+ "step": 912
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.188335642401775e-05,
+ "loss": 0.5242,
+ "step": 913
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1867449977089264e-05,
+ "loss": 0.4671,
+ "step": 914
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1851538632990922e-05,
+ "loss": 0.5247,
+ "step": 915
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1835622433448361e-05,
+ "loss": 0.4585,
+ "step": 916
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.181970142019997e-05,
+ "loss": 0.5068,
+ "step": 917
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1803775634996735e-05,
+ "loss": 0.5094,
+ "step": 918
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1787845119602184e-05,
+ "loss": 0.5033,
+ "step": 919
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.177190991579223e-05,
+ "loss": 0.5635,
+ "step": 920
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1755970065355087e-05,
+ "loss": 0.4865,
+ "step": 921
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.174002561009116e-05,
+ "loss": 0.5134,
+ "step": 922
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1724076591812919e-05,
+ "loss": 0.5193,
+ "step": 923
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1708123052344803e-05,
+ "loss": 0.4283,
+ "step": 924
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1692165033523117e-05,
+ "loss": 0.5168,
+ "step": 925
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1676202577195901e-05,
+ "loss": 0.5103,
+ "step": 926
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1660235725222835e-05,
+ "loss": 0.4611,
+ "step": 927
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.164426451947513e-05,
+ "loss": 0.5215,
+ "step": 928
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 1.1628289001835405e-05,
+ "loss": 0.5178,
+ "step": 929
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1612309214197599e-05,
+ "loss": 0.4999,
+ "step": 930
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1596325198466841e-05,
+ "loss": 0.5103,
+ "step": 931
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1580336996559343e-05,
+ "loss": 0.4791,
+ "step": 932
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.156434465040231e-05,
+ "loss": 0.4663,
+ "step": 933
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1548348201933799e-05,
+ "loss": 0.5155,
+ "step": 934
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1532347693102632e-05,
+ "loss": 0.4827,
+ "step": 935
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.151634316586828e-05,
+ "loss": 0.4985,
+ "step": 936
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.150033466220075e-05,
+ "loss": 0.4664,
+ "step": 937
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1484322224080474e-05,
+ "loss": 0.4885,
+ "step": 938
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1468305893498204e-05,
+ "loss": 0.4893,
+ "step": 939
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1452285712454905e-05,
+ "loss": 0.5168,
+ "step": 940
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1436261722961627e-05,
+ "loss": 0.5049,
+ "step": 941
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1420233967039423e-05,
+ "loss": 0.464,
+ "step": 942
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1404202486719205e-05,
+ "loss": 0.4962,
+ "step": 943
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.138816732404167e-05,
+ "loss": 0.5134,
+ "step": 944
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1372128521057155e-05,
+ "loss": 0.5082,
+ "step": 945
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1356086119825553e-05,
+ "loss": 0.4526,
+ "step": 946
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1340040162416197e-05,
+ "loss": 0.5121,
+ "step": 947
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1323990690907734e-05,
+ "loss": 0.4691,
+ "step": 948
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1307937747388034e-05,
+ "loss": 0.4597,
+ "step": 949
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 1.1291881373954066e-05,
+ "loss": 0.4535,
+ "step": 950
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1275821612711803e-05,
+ "loss": 0.4899,
+ "step": 951
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1259758505776092e-05,
+ "loss": 0.5067,
+ "step": 952
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1243692095270565e-05,
+ "loss": 0.463,
+ "step": 953
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1227622423327501e-05,
+ "loss": 0.5201,
+ "step": 954
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1211549532087749e-05,
+ "loss": 0.4925,
+ "step": 955
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.119547346370059e-05,
+ "loss": 0.5388,
+ "step": 956
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1179394260323639e-05,
+ "loss": 0.4614,
+ "step": 957
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1163311964122733e-05,
+ "loss": 0.4782,
+ "step": 958
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.114722661727182e-05,
+ "loss": 0.4644,
+ "step": 959
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1131138261952845e-05,
+ "loss": 0.4268,
+ "step": 960
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1115046940355643e-05,
+ "loss": 0.4545,
+ "step": 961
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.109895269467783e-05,
+ "loss": 0.4529,
+ "step": 962
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1082855567124693e-05,
+ "loss": 0.5314,
+ "step": 963
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1066755599909065e-05,
+ "loss": 0.4394,
+ "step": 964
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.105065283525124e-05,
+ "loss": 0.4943,
+ "step": 965
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1034547315378838e-05,
+ "loss": 0.4995,
+ "step": 966
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.101843908252671e-05,
+ "loss": 0.5093,
+ "step": 967
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.1002328178936813e-05,
+ "loss": 0.4655,
+ "step": 968
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.0986214646858115e-05,
+ "loss": 0.47,
+ "step": 969
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 1.0970098528546482e-05,
+ "loss": 0.5542,
+ "step": 970
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0953979866264549e-05,
+ "loss": 0.4962,
+ "step": 971
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0937858702281631e-05,
+ "loss": 0.4945,
+ "step": 972
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0921735078873599e-05,
+ "loss": 0.4754,
+ "step": 973
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.090560903832278e-05,
+ "loss": 0.4328,
+ "step": 974
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.088948062291783e-05,
+ "loss": 0.4566,
+ "step": 975
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.087334987495364e-05,
+ "loss": 0.4559,
+ "step": 976
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0857216836731221e-05,
+ "loss": 0.5433,
+ "step": 977
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0841081550557577e-05,
+ "loss": 0.4789,
+ "step": 978
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0824944058745623e-05,
+ "loss": 0.4684,
+ "step": 979
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0808804403614044e-05,
+ "loss": 0.4446,
+ "step": 980
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0792662627487207e-05,
+ "loss": 0.4864,
+ "step": 981
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0776518772695035e-05,
+ "loss": 0.4218,
+ "step": 982
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0760372881572904e-05,
+ "loss": 0.4892,
+ "step": 983
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0744224996461541e-05,
+ "loss": 0.4257,
+ "step": 984
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0728075159706881e-05,
+ "loss": 0.5109,
+ "step": 985
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0711923413659995e-05,
+ "loss": 0.4799,
+ "step": 986
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.069576980067695e-05,
+ "loss": 0.4825,
+ "step": 987
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0679614363118718e-05,
+ "loss": 0.4711,
+ "step": 988
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0663457143351044e-05,
+ "loss": 0.528,
+ "step": 989
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 1.0647298183744359e-05,
+ "loss": 0.4973,
+ "step": 990
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0631137526673647e-05,
+ "loss": 0.5144,
+ "step": 991
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.061497521451835e-05,
+ "loss": 0.5067,
+ "step": 992
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0598811289662243e-05,
+ "loss": 0.5282,
+ "step": 993
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0582645794493337e-05,
+ "loss": 0.4935,
+ "step": 994
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0566478771403763e-05,
+ "loss": 0.4667,
+ "step": 995
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.055031026278965e-05,
+ "loss": 0.4847,
+ "step": 996
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0534140311051026e-05,
+ "loss": 0.4582,
+ "step": 997
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0517968958591705e-05,
+ "loss": 0.502,
+ "step": 998
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0501796247819176e-05,
+ "loss": 0.4594,
+ "step": 999
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0485622221144485e-05,
+ "loss": 0.49,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5,
+ "eval_code_gate_load": [
+ 209.5,
+ 175.9,
+ 182.2,
+ 159.9,
+ 172.05,
+ 178.75,
+ 190.7,
+ 171.0
+ ],
+ "eval_code_loss": 0.35546875,
+ "eval_code_runtime": 1.7833,
+ "eval_code_samples_per_second": 560.749,
+ "eval_code_steps_per_second": 35.327,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5,
+ "eval_orca_gate_load": [
+ 501.25,
+ 343.4,
+ 403.4,
+ 400.1,
+ 346.3,
+ 423.95,
+ 366.8,
+ 357.1
+ ],
+ "eval_orca_loss": 0.452880859375,
+ "eval_orca_runtime": 2.0036,
+ "eval_orca_samples_per_second": 499.113,
+ "eval_orca_steps_per_second": 31.444,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5,
+ "eval_math_gate_load": [
+ 322.65,
+ 220.6,
+ 229.0,
+ 235.25,
+ 246.65,
+ 247.5,
+ 277.5,
+ 253.55
+ ],
+ "eval_math_loss": 0.4518066346645355,
+ "eval_math_runtime": 1.847,
+ "eval_math_samples_per_second": 541.417,
+ "eval_math_steps_per_second": 34.109,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5,
+ "eval_sharegpt_gate_load": [
+ 1521.05,
+ 1084.45,
+ 1307.85,
+ 1203.0,
+ 1140.15,
+ 1363.05,
+ 1179.85,
+ 1112.2
+ ],
+ "eval_sharegpt_loss": 0.5404297113418579,
+ "eval_sharegpt_runtime": 3.0138,
+ "eval_sharegpt_samples_per_second": 331.805,
+ "eval_sharegpt_steps_per_second": 20.904,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.046944692098213e-05,
+ "loss": 0.487,
+ "step": 1001
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0453270389749956e-05,
+ "loss": 0.4748,
+ "step": 1002
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0437092669869025e-05,
+ "loss": 0.4349,
+ "step": 1003
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0420913803763522e-05,
+ "loss": 0.4664,
+ "step": 1004
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0404733833860639e-05,
+ "loss": 0.4951,
+ "step": 1005
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0388552802590461e-05,
+ "loss": 0.4668,
+ "step": 1006
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0372370752385854e-05,
+ "loss": 0.4626,
+ "step": 1007
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0356187725682359e-05,
+ "loss": 0.4278,
+ "step": 1008
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 1.0340003764918078e-05,
+ "loss": 0.5119,
+ "step": 1009
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0323818912533561e-05,
+ "loss": 0.4633,
+ "step": 1010
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0307633210971697e-05,
+ "loss": 0.461,
+ "step": 1011
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0291446702677598e-05,
+ "loss": 0.5217,
+ "step": 1012
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0275259430098502e-05,
+ "loss": 0.4573,
+ "step": 1013
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0259071435683636e-05,
+ "loss": 0.5034,
+ "step": 1014
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0242882761884132e-05,
+ "loss": 0.4356,
+ "step": 1015
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.02266934511529e-05,
+ "loss": 0.4805,
+ "step": 1016
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0210503545944522e-05,
+ "loss": 0.4934,
+ "step": 1017
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0194313088715135e-05,
+ "loss": 0.4913,
+ "step": 1018
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0178122121922324e-05,
+ "loss": 0.4323,
+ "step": 1019
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0161930688025018e-05,
+ "loss": 0.4759,
+ "step": 1020
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0145738829483354e-05,
+ "loss": 0.4667,
+ "step": 1021
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0129546588758605e-05,
+ "loss": 0.4452,
+ "step": 1022
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0113354008313025e-05,
+ "loss": 0.438,
+ "step": 1023
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0097161130609774e-05,
+ "loss": 0.4632,
+ "step": 1024
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0080967998112787e-05,
+ "loss": 0.4721,
+ "step": 1025
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0064774653286662e-05,
+ "loss": 0.4787,
+ "step": 1026
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.0048581138596563e-05,
+ "loss": 0.479,
+ "step": 1027
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.003238749650809e-05,
+ "loss": 0.4629,
+ "step": 1028
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 1.001619376948718e-05,
+ "loss": 0.4641,
+ "step": 1029
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 1e-05,
+ "loss": 0.4645,
+ "step": 1030
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.98380623051282e-06,
+ "loss": 0.4603,
+ "step": 1031
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.967612503491915e-06,
+ "loss": 0.4245,
+ "step": 1032
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.95141886140344e-06,
+ "loss": 0.5036,
+ "step": 1033
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.935225346713341e-06,
+ "loss": 0.4646,
+ "step": 1034
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.919032001887215e-06,
+ "loss": 0.4433,
+ "step": 1035
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.90283886939023e-06,
+ "loss": 0.4786,
+ "step": 1036
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.886645991686977e-06,
+ "loss": 0.5202,
+ "step": 1037
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.870453411241399e-06,
+ "loss": 0.4868,
+ "step": 1038
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.854261170516648e-06,
+ "loss": 0.4462,
+ "step": 1039
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.838069311974986e-06,
+ "loss": 0.4161,
+ "step": 1040
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.821877878077678e-06,
+ "loss": 0.4759,
+ "step": 1041
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.805686911284867e-06,
+ "loss": 0.5145,
+ "step": 1042
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.789496454055482e-06,
+ "loss": 0.4849,
+ "step": 1043
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.773306548847102e-06,
+ "loss": 0.4507,
+ "step": 1044
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.757117238115871e-06,
+ "loss": 0.4427,
+ "step": 1045
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.740928564316369e-06,
+ "loss": 0.4745,
+ "step": 1046
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.724740569901503e-06,
+ "loss": 0.4943,
+ "step": 1047
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.708553297322407e-06,
+ "loss": 0.4544,
+ "step": 1048
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 9.692366789028308e-06,
+ "loss": 0.4622,
+ "step": 1049
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.676181087466444e-06,
+ "loss": 0.4576,
+ "step": 1050
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.659996235081926e-06,
+ "loss": 0.4764,
+ "step": 1051
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.643812274317644e-06,
+ "loss": 0.5082,
+ "step": 1052
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.627629247614151e-06,
+ "loss": 0.4467,
+ "step": 1053
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.611447197409544e-06,
+ "loss": 0.4574,
+ "step": 1054
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.595266166139366e-06,
+ "loss": 0.4667,
+ "step": 1055
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.579086196236483e-06,
+ "loss": 0.4444,
+ "step": 1056
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.562907330130981e-06,
+ "loss": 0.4719,
+ "step": 1057
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.54672961025005e-06,
+ "loss": 0.4315,
+ "step": 1058
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.530553079017872e-06,
+ "loss": 0.5312,
+ "step": 1059
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.514377778855521e-06,
+ "loss": 0.4925,
+ "step": 1060
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.498203752180827e-06,
+ "loss": 0.4775,
+ "step": 1061
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.482031041408296e-06,
+ "loss": 0.4874,
+ "step": 1062
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.465859688948977e-06,
+ "loss": 0.5414,
+ "step": 1063
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.449689737210352e-06,
+ "loss": 0.479,
+ "step": 1064
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.433521228596237e-06,
+ "loss": 0.4297,
+ "step": 1065
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.417354205506663e-06,
+ "loss": 0.5173,
+ "step": 1066
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.401188710337757e-06,
+ "loss": 0.4412,
+ "step": 1067
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.385024785481653e-06,
+ "loss": 0.4954,
+ "step": 1068
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 9.368862473326355e-06,
+ "loss": 0.4635,
+ "step": 1069
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.352701816255643e-06,
+ "loss": 0.5658,
+ "step": 1070
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.336542856648958e-06,
+ "loss": 0.478,
+ "step": 1071
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.320385636881283e-06,
+ "loss": 0.4725,
+ "step": 1072
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.30423019932305e-06,
+ "loss": 0.5015,
+ "step": 1073
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.288076586340005e-06,
+ "loss": 0.5177,
+ "step": 1074
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.27192484029312e-06,
+ "loss": 0.4999,
+ "step": 1075
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.255775003538462e-06,
+ "loss": 0.4933,
+ "step": 1076
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.239627118427098e-06,
+ "loss": 0.4579,
+ "step": 1077
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.22348122730497e-06,
+ "loss": 0.4491,
+ "step": 1078
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.207337372512797e-06,
+ "loss": 0.4519,
+ "step": 1079
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.19119559638596e-06,
+ "loss": 0.4515,
+ "step": 1080
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.17505594125438e-06,
+ "loss": 0.4672,
+ "step": 1081
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.158918449442425e-06,
+ "loss": 0.4807,
+ "step": 1082
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.142783163268782e-06,
+ "loss": 0.5425,
+ "step": 1083
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.126650125046361e-06,
+ "loss": 0.4717,
+ "step": 1084
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.110519377082174e-06,
+ "loss": 0.5154,
+ "step": 1085
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.094390961677223e-06,
+ "loss": 0.4954,
+ "step": 1086
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.078264921126405e-06,
+ "loss": 0.43,
+ "step": 1087
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.062141297718372e-06,
+ "loss": 0.4305,
+ "step": 1088
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 9.046020133735455e-06,
+ "loss": 0.409,
+ "step": 1089
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 9.02990147145352e-06,
+ "loss": 0.4316,
+ "step": 1090
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 9.013785353141887e-06,
+ "loss": 0.4189,
+ "step": 1091
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.99767182106319e-06,
+ "loss": 0.5003,
+ "step": 1092
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.981560917473292e-06,
+ "loss": 0.4231,
+ "step": 1093
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.965452684621164e-06,
+ "loss": 0.4326,
+ "step": 1094
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.949347164748761e-06,
+ "loss": 0.4282,
+ "step": 1095
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.933244400090937e-06,
+ "loss": 0.4409,
+ "step": 1096
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.91714443287531e-06,
+ "loss": 0.5219,
+ "step": 1097
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.901047305322172e-06,
+ "loss": 0.4599,
+ "step": 1098
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.88495305964436e-06,
+ "loss": 0.4843,
+ "step": 1099
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.868861738047158e-06,
+ "loss": 0.4487,
+ "step": 1100
+ },
+ {
+ "epoch": 0.55,
+ "eval_code_gate_load": [
+ 204.55,
+ 178.2,
+ 181.25,
+ 158.8,
+ 174.15,
+ 180.85,
+ 194.9,
+ 167.3
+ ],
+ "eval_code_loss": 0.3551269471645355,
+ "eval_code_runtime": 1.8096,
+ "eval_code_samples_per_second": 552.622,
+ "eval_code_steps_per_second": 34.815,
+ "step": 1100
+ },
+ {
+ "epoch": 0.55,
+ "eval_orca_gate_load": [
+ 497.9,
+ 347.05,
+ 405.0,
+ 400.25,
+ 349.7,
+ 422.7,
+ 367.5,
+ 352.2
+ ],
+ "eval_orca_loss": 0.45805662870407104,
+ "eval_orca_runtime": 1.9956,
+ "eval_orca_samples_per_second": 501.094,
+ "eval_orca_steps_per_second": 31.569,
+ "step": 1100
+ },
+ {
+ "epoch": 0.55,
+ "eval_math_gate_load": [
+ 309.9,
+ 222.05,
+ 237.3,
+ 231.3,
+ 253.5,
+ 249.15,
+ 277.95,
+ 251.55
+ ],
+ "eval_math_loss": 0.32890623807907104,
+ "eval_math_runtime": 1.861,
+ "eval_math_samples_per_second": 537.357,
+ "eval_math_steps_per_second": 33.853,
+ "step": 1100
+ },
+ {
+ "epoch": 0.55,
+ "eval_sharegpt_gate_load": [
+ 1510.0,
+ 1101.7,
+ 1304.1,
+ 1211.55,
+ 1140.2,
+ 1358.85,
+ 1183.65,
+ 1101.55
+ ],
+ "eval_sharegpt_loss": 0.539843738079071,
+ "eval_sharegpt_runtime": 3.0143,
+ "eval_sharegpt_samples_per_second": 331.753,
+ "eval_sharegpt_steps_per_second": 20.9,
+ "step": 1100
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.852773382728184e-06,
+ "loss": 0.4398,
+ "step": 1101
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.836688035877268e-06,
+ "loss": 0.4278,
+ "step": 1102
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.820605739676363e-06,
+ "loss": 0.4476,
+ "step": 1103
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.804526536299413e-06,
+ "loss": 0.4136,
+ "step": 1104
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.788450467912254e-06,
+ "loss": 0.445,
+ "step": 1105
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.772377576672502e-06,
+ "loss": 0.4633,
+ "step": 1106
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.75630790472944e-06,
+ "loss": 0.4523,
+ "step": 1107
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.740241494223911e-06,
+ "loss": 0.4346,
+ "step": 1108
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 8.724178387288202e-06,
+ "loss": 0.4706,
+ "step": 1109
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.708118626045939e-06,
+ "loss": 0.4377,
+ "step": 1110
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.692062252611973e-06,
+ "loss": 0.4326,
+ "step": 1111
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.676009309092273e-06,
+ "loss": 0.4979,
+ "step": 1112
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.659959837583808e-06,
+ "loss": 0.4885,
+ "step": 1113
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.643913880174449e-06,
+ "loss": 0.424,
+ "step": 1114
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.62787147894285e-06,
+ "loss": 0.4862,
+ "step": 1115
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.611832675958335e-06,
+ "loss": 0.4256,
+ "step": 1116
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.595797513280799e-06,
+ "loss": 0.4172,
+ "step": 1117
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.579766032960582e-06,
+ "loss": 0.4389,
+ "step": 1118
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.563738277038376e-06,
+ "loss": 0.421,
+ "step": 1119
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.5477142875451e-06,
+ "loss": 0.4555,
+ "step": 1120
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.531694106501796e-06,
+ "loss": 0.4244,
+ "step": 1121
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.515677775919528e-06,
+ "loss": 0.4084,
+ "step": 1122
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.499665337799254e-06,
+ "loss": 0.4406,
+ "step": 1123
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.48365683413172e-06,
+ "loss": 0.4274,
+ "step": 1124
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.46765230689737e-06,
+ "loss": 0.3716,
+ "step": 1125
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.451651798066203e-06,
+ "loss": 0.459,
+ "step": 1126
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.43565534959769e-06,
+ "loss": 0.4536,
+ "step": 1127
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.419663003440657e-06,
+ "loss": 0.4558,
+ "step": 1128
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.40367480153316e-06,
+ "loss": 0.4123,
+ "step": 1129
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 8.387690785802403e-06,
+ "loss": 0.4408,
+ "step": 1130
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.371710998164595e-06,
+ "loss": 0.481,
+ "step": 1131
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.355735480524874e-06,
+ "loss": 0.3822,
+ "step": 1132
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.339764274777165e-06,
+ "loss": 0.4389,
+ "step": 1133
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.3237974228041e-06,
+ "loss": 0.403,
+ "step": 1134
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.307834966476885e-06,
+ "loss": 0.4676,
+ "step": 1135
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.291876947655197e-06,
+ "loss": 0.4541,
+ "step": 1136
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.275923408187086e-06,
+ "loss": 0.4605,
+ "step": 1137
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.259974389908842e-06,
+ "loss": 0.4276,
+ "step": 1138
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.244029934644916e-06,
+ "loss": 0.4232,
+ "step": 1139
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.228090084207773e-06,
+ "loss": 0.4284,
+ "step": 1140
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.212154880397817e-06,
+ "loss": 0.3999,
+ "step": 1141
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.196224365003267e-06,
+ "loss": 0.4149,
+ "step": 1142
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.180298579800034e-06,
+ "loss": 0.5071,
+ "step": 1143
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.16437756655164e-06,
+ "loss": 0.4423,
+ "step": 1144
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.148461367009081e-06,
+ "loss": 0.4525,
+ "step": 1145
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.132550022910737e-06,
+ "loss": 0.4204,
+ "step": 1146
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.116643575982254e-06,
+ "loss": 0.4082,
+ "step": 1147
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.100742067936432e-06,
+ "loss": 0.4174,
+ "step": 1148
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.084845540473127e-06,
+ "loss": 0.4162,
+ "step": 1149
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 8.068954035279121e-06,
+ "loss": 0.4527,
+ "step": 1150
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 8.053067594028044e-06,
+ "loss": 0.4525,
+ "step": 1151
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 8.037186258380226e-06,
+ "loss": 0.4433,
+ "step": 1152
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 8.021310069982624e-06,
+ "loss": 0.5178,
+ "step": 1153
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 8.005439070468692e-06,
+ "loss": 0.4018,
+ "step": 1154
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.989573301458274e-06,
+ "loss": 0.4235,
+ "step": 1155
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.9737128045575e-06,
+ "loss": 0.4846,
+ "step": 1156
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.957857621358674e-06,
+ "loss": 0.4281,
+ "step": 1157
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.942007793440165e-06,
+ "loss": 0.4087,
+ "step": 1158
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.9261633623663e-06,
+ "loss": 0.4426,
+ "step": 1159
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.91032436968725e-06,
+ "loss": 0.4638,
+ "step": 1160
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.894490856938931e-06,
+ "loss": 0.4776,
+ "step": 1161
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.87866286564288e-06,
+ "loss": 0.4447,
+ "step": 1162
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.862840437306165e-06,
+ "loss": 0.3879,
+ "step": 1163
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.847023613421251e-06,
+ "loss": 0.4486,
+ "step": 1164
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.831212435465925e-06,
+ "loss": 0.4983,
+ "step": 1165
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.815406944903148e-06,
+ "loss": 0.4229,
+ "step": 1166
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.799607183180981e-06,
+ "loss": 0.4204,
+ "step": 1167
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.78381319173246e-06,
+ "loss": 0.3881,
+ "step": 1168
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.768025011975481e-06,
+ "loss": 0.4176,
+ "step": 1169
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 7.752242685312709e-06,
+ "loss": 0.4191,
+ "step": 1170
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.736466253131451e-06,
+ "loss": 0.4608,
+ "step": 1171
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.720695756803569e-06,
+ "loss": 0.446,
+ "step": 1172
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.704931237685342e-06,
+ "loss": 0.3986,
+ "step": 1173
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.689172737117389e-06,
+ "loss": 0.438,
+ "step": 1174
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.673420296424541e-06,
+ "loss": 0.4122,
+ "step": 1175
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.657673956915735e-06,
+ "loss": 0.4458,
+ "step": 1176
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.641933759883913e-06,
+ "loss": 0.4381,
+ "step": 1177
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.6261997466059035e-06,
+ "loss": 0.4505,
+ "step": 1178
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.610471958342326e-06,
+ "loss": 0.4123,
+ "step": 1179
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.594750436337467e-06,
+ "loss": 0.4655,
+ "step": 1180
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.579035221819188e-06,
+ "loss": 0.4205,
+ "step": 1181
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.5633263559988035e-06,
+ "loss": 0.4072,
+ "step": 1182
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.547623880070992e-06,
+ "loss": 0.4442,
+ "step": 1183
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.531927835213657e-06,
+ "loss": 0.4164,
+ "step": 1184
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.516238262587851e-06,
+ "loss": 0.4247,
+ "step": 1185
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.500555203337647e-06,
+ "loss": 0.4238,
+ "step": 1186
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.48487869859004e-06,
+ "loss": 0.4493,
+ "step": 1187
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.469208789454838e-06,
+ "loss": 0.4132,
+ "step": 1188
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.4535455170245476e-06,
+ "loss": 0.4364,
+ "step": 1189
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 7.4378889223742766e-06,
+ "loss": 0.4417,
+ "step": 1190
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.422239046561619e-06,
+ "loss": 0.3977,
+ "step": 1191
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.40659593062655e-06,
+ "loss": 0.4288,
+ "step": 1192
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.390959615591315e-06,
+ "loss": 0.4328,
+ "step": 1193
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.375330142460331e-06,
+ "loss": 0.3894,
+ "step": 1194
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.35970755222007e-06,
+ "loss": 0.4406,
+ "step": 1195
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.344091885838949e-06,
+ "loss": 0.4512,
+ "step": 1196
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.328483184267236e-06,
+ "loss": 0.3937,
+ "step": 1197
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.312881488436928e-06,
+ "loss": 0.4402,
+ "step": 1198
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.297286839261659e-06,
+ "loss": 0.4219,
+ "step": 1199
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.2816992776365714e-06,
+ "loss": 0.4174,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6,
+ "eval_code_gate_load": [
+ 207.85,
+ 176.65,
+ 176.75,
+ 155.35,
+ 175.4,
+ 184.75,
+ 195.35,
+ 167.9
+ ],
+ "eval_code_loss": 0.26713865995407104,
+ "eval_code_runtime": 1.7771,
+ "eval_code_samples_per_second": 562.706,
+ "eval_code_steps_per_second": 35.45,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6,
+ "eval_orca_gate_load": [
+ 500.65,
+ 345.65,
+ 405.4,
+ 401.1,
+ 345.55,
+ 420.1,
+ 369.5,
+ 354.35
+ ],
+ "eval_orca_loss": 0.4488769471645355,
+ "eval_orca_runtime": 2.0075,
+ "eval_orca_samples_per_second": 498.135,
+ "eval_orca_steps_per_second": 31.383,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6,
+ "eval_math_gate_load": [
+ 324.85,
+ 216.9,
+ 232.55,
+ 225.15,
+ 251.7,
+ 253.15,
+ 278.7,
+ 249.7
+ ],
+ "eval_math_loss": 0.30810546875,
+ "eval_math_runtime": 1.8431,
+ "eval_math_samples_per_second": 542.566,
+ "eval_math_steps_per_second": 34.182,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6,
+ "eval_sharegpt_gate_load": [
+ 1528.05,
+ 1087.15,
+ 1303.2,
+ 1211.5,
+ 1131.85,
+ 1353.75,
+ 1187.35,
+ 1108.75
+ ],
+ "eval_sharegpt_loss": 0.537792980670929,
+ "eval_sharegpt_runtime": 2.9986,
+ "eval_sharegpt_samples_per_second": 333.485,
+ "eval_sharegpt_steps_per_second": 21.01,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.2661188444382345e-06,
+ "loss": 0.3947,
+ "step": 1201
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.250545580524515e-06,
+ "loss": 0.446,
+ "step": 1202
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.234979526734482e-06,
+ "loss": 0.4238,
+ "step": 1203
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.219420723888301e-06,
+ "loss": 0.4102,
+ "step": 1204
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.203869212787112e-06,
+ "loss": 0.4027,
+ "step": 1205
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.188325034212944e-06,
+ "loss": 0.3781,
+ "step": 1206
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.1727882289285915e-06,
+ "loss": 0.3997,
+ "step": 1207
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.157258837677514e-06,
+ "loss": 0.4436,
+ "step": 1208
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.1417369011837355e-06,
+ "loss": 0.4706,
+ "step": 1209
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 7.126222460151719e-06,
+ "loss": 0.4107,
+ "step": 1210
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 7.110715555266281e-06,
+ "loss": 0.3836,
+ "step": 1211
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 7.095216227192467e-06,
+ "loss": 0.4006,
+ "step": 1212
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 7.0797245165754654e-06,
+ "loss": 0.4064,
+ "step": 1213
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 7.064240464040472e-06,
+ "loss": 0.3926,
+ "step": 1214
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 7.048764110192618e-06,
+ "loss": 0.4111,
+ "step": 1215
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 7.033295495616834e-06,
+ "loss": 0.379,
+ "step": 1216
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 7.017834660877756e-06,
+ "loss": 0.3834,
+ "step": 1217
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 7.002381646519625e-06,
+ "loss": 0.4477,
+ "step": 1218
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.986936493066165e-06,
+ "loss": 0.4177,
+ "step": 1219
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.971499241020495e-06,
+ "loss": 0.4571,
+ "step": 1220
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.956069930865005e-06,
+ "loss": 0.4066,
+ "step": 1221
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.940648603061263e-06,
+ "loss": 0.4223,
+ "step": 1222
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.925235298049906e-06,
+ "loss": 0.4069,
+ "step": 1223
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.909830056250527e-06,
+ "loss": 0.3334,
+ "step": 1224
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.894432918061579e-06,
+ "loss": 0.404,
+ "step": 1225
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.8790439238602576e-06,
+ "loss": 0.403,
+ "step": 1226
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.863663114002411e-06,
+ "loss": 0.3739,
+ "step": 1227
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.848290528822417e-06,
+ "loss": 0.3936,
+ "step": 1228
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.8329262086330864e-06,
+ "loss": 0.4142,
+ "step": 1229
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 6.8175701937255645e-06,
+ "loss": 0.4611,
+ "step": 1230
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.802222524369202e-06,
+ "loss": 0.4569,
+ "step": 1231
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.786883240811479e-06,
+ "loss": 0.44,
+ "step": 1232
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.771552383277875e-06,
+ "loss": 0.3902,
+ "step": 1233
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.756229991971779e-06,
+ "loss": 0.467,
+ "step": 1234
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.740916107074372e-06,
+ "loss": 0.389,
+ "step": 1235
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.725610768744535e-06,
+ "loss": 0.4507,
+ "step": 1236
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.710314017118734e-06,
+ "loss": 0.4448,
+ "step": 1237
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.695025892310913e-06,
+ "loss": 0.4476,
+ "step": 1238
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.6797464344124045e-06,
+ "loss": 0.4785,
+ "step": 1239
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.664475683491797e-06,
+ "loss": 0.4106,
+ "step": 1240
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.649213679594859e-06,
+ "loss": 0.4091,
+ "step": 1241
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.633960462744415e-06,
+ "loss": 0.4373,
+ "step": 1242
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.618716072940248e-06,
+ "loss": 0.4227,
+ "step": 1243
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.603480550158995e-06,
+ "loss": 0.4241,
+ "step": 1244
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.588253934354039e-06,
+ "loss": 0.3771,
+ "step": 1245
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.5730362654554015e-06,
+ "loss": 0.4489,
+ "step": 1246
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.5578275833696485e-06,
+ "loss": 0.4633,
+ "step": 1247
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.542627927979772e-06,
+ "loss": 0.3883,
+ "step": 1248
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.527437339145097e-06,
+ "loss": 0.4436,
+ "step": 1249
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 6.5122558567011775e-06,
+ "loss": 0.4504,
+ "step": 1250
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.497083520459674e-06,
+ "loss": 0.3626,
+ "step": 1251
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.481920370208274e-06,
+ "loss": 0.4091,
+ "step": 1252
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.466766445710568e-06,
+ "loss": 0.3863,
+ "step": 1253
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.4516217867059615e-06,
+ "loss": 0.4081,
+ "step": 1254
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.43648643290955e-06,
+ "loss": 0.3932,
+ "step": 1255
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.421360424012039e-06,
+ "loss": 0.4043,
+ "step": 1256
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.406243799679625e-06,
+ "loss": 0.4142,
+ "step": 1257
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.39113659955389e-06,
+ "loss": 0.3747,
+ "step": 1258
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.376038863251706e-06,
+ "loss": 0.3748,
+ "step": 1259
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.360950630365126e-06,
+ "loss": 0.4162,
+ "step": 1260
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.345871940461282e-06,
+ "loss": 0.4242,
+ "step": 1261
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.33080283308228e-06,
+ "loss": 0.3819,
+ "step": 1262
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.315743347745098e-06,
+ "loss": 0.4009,
+ "step": 1263
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.300693523941481e-06,
+ "loss": 0.3622,
+ "step": 1264
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.2856534011378365e-06,
+ "loss": 0.4283,
+ "step": 1265
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.270623018775135e-06,
+ "loss": 0.4575,
+ "step": 1266
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.255602416268799e-06,
+ "loss": 0.4358,
+ "step": 1267
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.2405916330086106e-06,
+ "loss": 0.3554,
+ "step": 1268
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 6.225590708358596e-06,
+ "loss": 0.4037,
+ "step": 1269
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.210599681656933e-06,
+ "loss": 0.3967,
+ "step": 1270
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.1956185922158445e-06,
+ "loss": 0.4144,
+ "step": 1271
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.180647479321484e-06,
+ "loss": 0.4534,
+ "step": 1272
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.165686382233856e-06,
+ "loss": 0.4557,
+ "step": 1273
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.150735340186689e-06,
+ "loss": 0.3985,
+ "step": 1274
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.135794392387353e-06,
+ "loss": 0.4285,
+ "step": 1275
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.120863578016736e-06,
+ "loss": 0.3867,
+ "step": 1276
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.1059429362291615e-06,
+ "loss": 0.4173,
+ "step": 1277
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.091032506152274e-06,
+ "loss": 0.3591,
+ "step": 1278
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.076132326886934e-06,
+ "loss": 0.4235,
+ "step": 1279
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.061242437507131e-06,
+ "loss": 0.3599,
+ "step": 1280
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.0463628770598574e-06,
+ "loss": 0.3687,
+ "step": 1281
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.0314936845650296e-06,
+ "loss": 0.4174,
+ "step": 1282
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.016634899015369e-06,
+ "loss": 0.3781,
+ "step": 1283
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 6.00178655937631e-06,
+ "loss": 0.3717,
+ "step": 1284
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 5.986948704585895e-06,
+ "loss": 0.3987,
+ "step": 1285
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 5.972121373554665e-06,
+ "loss": 0.3913,
+ "step": 1286
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 5.957304605165567e-06,
+ "loss": 0.4595,
+ "step": 1287
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 5.942498438273849e-06,
+ "loss": 0.375,
+ "step": 1288
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 5.927702911706961e-06,
+ "loss": 0.4359,
+ "step": 1289
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.912918064264441e-06,
+ "loss": 0.3799,
+ "step": 1290
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.898143934717831e-06,
+ "loss": 0.4056,
+ "step": 1291
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.8833805618105635e-06,
+ "loss": 0.4361,
+ "step": 1292
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.868627984257862e-06,
+ "loss": 0.4293,
+ "step": 1293
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.853886240746643e-06,
+ "loss": 0.3594,
+ "step": 1294
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.839155369935407e-06,
+ "loss": 0.4376,
+ "step": 1295
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.82443541045415e-06,
+ "loss": 0.3927,
+ "step": 1296
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.809726400904242e-06,
+ "loss": 0.4547,
+ "step": 1297
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.795028379858355e-06,
+ "loss": 0.3932,
+ "step": 1298
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.780341385860333e-06,
+ "loss": 0.4543,
+ "step": 1299
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.765665457425102e-06,
+ "loss": 0.409,
+ "step": 1300
+ },
+ {
+ "epoch": 0.65,
+ "eval_code_gate_load": [
+ 206.65,
+ 175.25,
+ 174.65,
+ 157.15,
+ 173.1,
+ 187.8,
+ 194.85,
+ 170.55
+ ],
+ "eval_code_loss": 0.2615722715854645,
+ "eval_code_runtime": 1.7786,
+ "eval_code_samples_per_second": 562.233,
+ "eval_code_steps_per_second": 35.421,
+ "step": 1300
+ },
+ {
+ "epoch": 0.65,
+ "eval_orca_gate_load": [
+ 499.85,
+ 346.7,
+ 403.75,
+ 402.4,
+ 350.05,
+ 418.15,
+ 370.1,
+ 351.3
+ ],
+ "eval_orca_loss": 0.4473632872104645,
+ "eval_orca_runtime": 1.9968,
+ "eval_orca_samples_per_second": 500.804,
+ "eval_orca_steps_per_second": 31.551,
+ "step": 1300
+ },
+ {
+ "epoch": 0.65,
+ "eval_math_gate_load": [
+ 312.9,
+ 224.8,
+ 227.45,
+ 234.4,
+ 250.9,
+ 253.95,
+ 276.6,
+ 251.7
+ ],
+ "eval_math_loss": 0.3106445372104645,
+ "eval_math_runtime": 1.8419,
+ "eval_math_samples_per_second": 542.921,
+ "eval_math_steps_per_second": 34.204,
+ "step": 1300
+ },
+ {
+ "epoch": 0.65,
+ "eval_sharegpt_gate_load": [
+ 1519.25,
+ 1096.15,
+ 1298.05,
+ 1219.5,
+ 1133.1,
+ 1354.6,
+ 1183.8,
+ 1107.15
+ ],
+ "eval_sharegpt_loss": 0.533984363079071,
+ "eval_sharegpt_runtime": 2.9971,
+ "eval_sharegpt_samples_per_second": 333.658,
+ "eval_sharegpt_steps_per_second": 21.02,
+ "step": 1300
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.751000633038573e-06,
+ "loss": 0.4107,
+ "step": 1301
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.736346951157544e-06,
+ "loss": 0.3936,
+ "step": 1302
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.721704450209581e-06,
+ "loss": 0.3587,
+ "step": 1303
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.707073168592943e-06,
+ "loss": 0.3612,
+ "step": 1304
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.692453144676451e-06,
+ "loss": 0.3779,
+ "step": 1305
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.677844416799424e-06,
+ "loss": 0.4237,
+ "step": 1306
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.663247023271543e-06,
+ "loss": 0.4095,
+ "step": 1307
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.648661002372769e-06,
+ "loss": 0.3707,
+ "step": 1308
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 5.63408639235324e-06,
+ "loss": 0.4192,
+ "step": 1309
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.619523231433177e-06,
+ "loss": 0.4076,
+ "step": 1310
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.604971557802769e-06,
+ "loss": 0.3707,
+ "step": 1311
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.590431409622081e-06,
+ "loss": 0.3618,
+ "step": 1312
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.575902825020962e-06,
+ "loss": 0.4583,
+ "step": 1313
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.56138584209893e-06,
+ "loss": 0.3753,
+ "step": 1314
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.546880498925079e-06,
+ "loss": 0.4015,
+ "step": 1315
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.5323868335379775e-06,
+ "loss": 0.4733,
+ "step": 1316
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.517904883945577e-06,
+ "loss": 0.4065,
+ "step": 1317
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.503434688125104e-06,
+ "loss": 0.391,
+ "step": 1318
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.488976284022953e-06,
+ "loss": 0.413,
+ "step": 1319
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.4745297095546125e-06,
+ "loss": 0.4226,
+ "step": 1320
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.460095002604533e-06,
+ "loss": 0.4065,
+ "step": 1321
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.445672201026054e-06,
+ "loss": 0.3748,
+ "step": 1322
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.431261342641287e-06,
+ "loss": 0.3484,
+ "step": 1323
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.416862465241033e-06,
+ "loss": 0.3729,
+ "step": 1324
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.40247560658467e-06,
+ "loss": 0.4519,
+ "step": 1325
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.3881008044000495e-06,
+ "loss": 0.3754,
+ "step": 1326
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.373738096383423e-06,
+ "loss": 0.399,
+ "step": 1327
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.359387520199317e-06,
+ "loss": 0.3878,
+ "step": 1328
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 5.3450491134804416e-06,
+ "loss": 0.3375,
+ "step": 1329
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.330722913827594e-06,
+ "loss": 0.3478,
+ "step": 1330
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.3164089588095705e-06,
+ "loss": 0.4146,
+ "step": 1331
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.302107285963045e-06,
+ "loss": 0.4672,
+ "step": 1332
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.287817932792485e-06,
+ "loss": 0.389,
+ "step": 1333
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.273540936770059e-06,
+ "loss": 0.3968,
+ "step": 1334
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.259276335335522e-06,
+ "loss": 0.4544,
+ "step": 1335
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.245024165896126e-06,
+ "loss": 0.4088,
+ "step": 1336
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.2307844658265236e-06,
+ "loss": 0.3966,
+ "step": 1337
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.216557272468675e-06,
+ "loss": 0.3967,
+ "step": 1338
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.202342623131731e-06,
+ "loss": 0.4211,
+ "step": 1339
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.18814055509195e-06,
+ "loss": 0.3969,
+ "step": 1340
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.173951105592605e-06,
+ "loss": 0.3714,
+ "step": 1341
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.1597743118438725e-06,
+ "loss": 0.4332,
+ "step": 1342
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.145610211022738e-06,
+ "loss": 0.4368,
+ "step": 1343
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.131458840272905e-06,
+ "loss": 0.3699,
+ "step": 1344
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.117320236704697e-06,
+ "loss": 0.3549,
+ "step": 1345
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.103194437394952e-06,
+ "loss": 0.3848,
+ "step": 1346
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.089081479386928e-06,
+ "loss": 0.4111,
+ "step": 1347
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.074981399690219e-06,
+ "loss": 0.4432,
+ "step": 1348
+ },
+ {
+ "epoch": 0.67,
+ "learning_rate": 5.060894235280637e-06,
+ "loss": 0.4486,
+ "step": 1349
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 5.046820023100129e-06,
+ "loss": 0.3844,
+ "step": 1350
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 5.03275880005667e-06,
+ "loss": 0.393,
+ "step": 1351
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 5.018710603024187e-06,
+ "loss": 0.3565,
+ "step": 1352
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 5.004675468842436e-06,
+ "loss": 0.3473,
+ "step": 1353
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.990653434316915e-06,
+ "loss": 0.4119,
+ "step": 1354
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.976644536218783e-06,
+ "loss": 0.3434,
+ "step": 1355
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.9626488112847384e-06,
+ "loss": 0.3702,
+ "step": 1356
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.948666296216938e-06,
+ "loss": 0.4028,
+ "step": 1357
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.934697027682894e-06,
+ "loss": 0.3527,
+ "step": 1358
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.9207410423153925e-06,
+ "loss": 0.3867,
+ "step": 1359
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.9067983767123736e-06,
+ "loss": 0.4232,
+ "step": 1360
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.8928690674368495e-06,
+ "loss": 0.4539,
+ "step": 1361
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.878953151016816e-06,
+ "loss": 0.4188,
+ "step": 1362
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.8650506639451385e-06,
+ "loss": 0.4357,
+ "step": 1363
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.851161642679466e-06,
+ "loss": 0.3801,
+ "step": 1364
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.837286123642141e-06,
+ "loss": 0.4286,
+ "step": 1365
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.823424143220097e-06,
+ "loss": 0.411,
+ "step": 1366
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.809575737764759e-06,
+ "loss": 0.4003,
+ "step": 1367
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.795740943591955e-06,
+ "loss": 0.4079,
+ "step": 1368
+ },
+ {
+ "epoch": 0.68,
+ "learning_rate": 4.781919796981818e-06,
+ "loss": 0.4187,
+ "step": 1369
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.7681123341787e-06,
+ "loss": 0.4027,
+ "step": 1370
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.754318591391057e-06,
+ "loss": 0.3967,
+ "step": 1371
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.740538604791371e-06,
+ "loss": 0.3976,
+ "step": 1372
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.726772410516055e-06,
+ "loss": 0.4034,
+ "step": 1373
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.713020044665348e-06,
+ "loss": 0.4319,
+ "step": 1374
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.699281543303222e-06,
+ "loss": 0.4228,
+ "step": 1375
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.685556942457296e-06,
+ "loss": 0.3096,
+ "step": 1376
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.67184627811874e-06,
+ "loss": 0.4083,
+ "step": 1377
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.65814958624217e-06,
+ "loss": 0.3911,
+ "step": 1378
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.6444669027455615e-06,
+ "loss": 0.4363,
+ "step": 1379
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.630798263510162e-06,
+ "loss": 0.3926,
+ "step": 1380
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.617143704380382e-06,
+ "loss": 0.3413,
+ "step": 1381
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.60350326116371e-06,
+ "loss": 0.414,
+ "step": 1382
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.589876969630616e-06,
+ "loss": 0.3942,
+ "step": 1383
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.576264865514467e-06,
+ "loss": 0.4239,
+ "step": 1384
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.562666984511416e-06,
+ "loss": 0.4214,
+ "step": 1385
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.549083362280318e-06,
+ "loss": 0.3739,
+ "step": 1386
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.535514034442644e-06,
+ "loss": 0.372,
+ "step": 1387
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.521959036582372e-06,
+ "loss": 0.3932,
+ "step": 1388
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.508418404245903e-06,
+ "loss": 0.3606,
+ "step": 1389
+ },
+ {
+ "epoch": 0.69,
+ "learning_rate": 4.494892172941965e-06,
+ "loss": 0.4025,
+ "step": 1390
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.481380378141528e-06,
+ "loss": 0.3945,
+ "step": 1391
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.467883055277696e-06,
+ "loss": 0.4146,
+ "step": 1392
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.454400239745619e-06,
+ "loss": 0.4055,
+ "step": 1393
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.440931966902419e-06,
+ "loss": 0.3951,
+ "step": 1394
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.427478272067066e-06,
+ "loss": 0.4238,
+ "step": 1395
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.414039190520308e-06,
+ "loss": 0.4598,
+ "step": 1396
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.400614757504565e-06,
+ "loss": 0.4238,
+ "step": 1397
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.3872050082238535e-06,
+ "loss": 0.3732,
+ "step": 1398
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.373809977843676e-06,
+ "loss": 0.4321,
+ "step": 1399
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.360429701490935e-06,
+ "loss": 0.4115,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7,
+ "eval_code_gate_load": [
+ 205.5,
+ 171.95,
+ 176.4,
+ 155.0,
+ 176.15,
+ 185.45,
+ 196.15,
+ 173.4
+ ],
+ "eval_code_loss": 0.2625488340854645,
+ "eval_code_runtime": 1.7811,
+ "eval_code_samples_per_second": 561.449,
+ "eval_code_steps_per_second": 35.371,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7,
+ "eval_orca_gate_load": [
+ 499.7,
+ 344.5,
+ 406.5,
+ 399.4,
+ 350.35,
+ 416.7,
+ 369.6,
+ 355.55
+ ],
+ "eval_orca_loss": 0.39360350370407104,
+ "eval_orca_runtime": 1.9989,
+ "eval_orca_samples_per_second": 500.287,
+ "eval_orca_steps_per_second": 31.518,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7,
+ "eval_math_gate_load": [
+ 315.6,
+ 219.15,
+ 229.9,
+ 225.75,
+ 253.55,
+ 256.25,
+ 275.3,
+ 257.2
+ ],
+ "eval_math_loss": 0.30537110567092896,
+ "eval_math_runtime": 1.8428,
+ "eval_math_samples_per_second": 542.647,
+ "eval_math_steps_per_second": 34.187,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7,
+ "eval_sharegpt_gate_load": [
+ 1527.85,
+ 1080.7,
+ 1307.7,
+ 1213.2,
+ 1131.55,
+ 1358.05,
+ 1182.35,
+ 1110.2
+ ],
+ "eval_sharegpt_loss": 0.53271484375,
+ "eval_sharegpt_runtime": 3.0058,
+ "eval_sharegpt_samples_per_second": 332.695,
+ "eval_sharegpt_steps_per_second": 20.96,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.34706421425385e-06,
+ "loss": 0.3626,
+ "step": 1401
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.3337135511818514e-06,
+ "loss": 0.3482,
+ "step": 1402
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.320377747285497e-06,
+ "loss": 0.3597,
+ "step": 1403
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.307056837536373e-06,
+ "loss": 0.4391,
+ "step": 1404
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.2937508568670194e-06,
+ "loss": 0.4752,
+ "step": 1405
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.280459840170818e-06,
+ "loss": 0.4315,
+ "step": 1406
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.267183822301903e-06,
+ "loss": 0.4123,
+ "step": 1407
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.2539228380750955e-06,
+ "loss": 0.3806,
+ "step": 1408
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.240676922265774e-06,
+ "loss": 0.3826,
+ "step": 1409
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 4.2274461096098085e-06,
+ "loss": 0.4211,
+ "step": 1410
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.21423043480346e-06,
+ "loss": 0.382,
+ "step": 1411
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.201029932503303e-06,
+ "loss": 0.415,
+ "step": 1412
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.18784463732611e-06,
+ "loss": 0.4246,
+ "step": 1413
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.17467458384878e-06,
+ "loss": 0.4311,
+ "step": 1414
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.1615198066082475e-06,
+ "loss": 0.4051,
+ "step": 1415
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.14838034010138e-06,
+ "loss": 0.3679,
+ "step": 1416
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.135256218784896e-06,
+ "loss": 0.3997,
+ "step": 1417
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.12214747707527e-06,
+ "loss": 0.4115,
+ "step": 1418
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.1090541493486555e-06,
+ "loss": 0.413,
+ "step": 1419
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.095976269940777e-06,
+ "loss": 0.3356,
+ "step": 1420
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.082913873146842e-06,
+ "loss": 0.3641,
+ "step": 1421
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.069866993221473e-06,
+ "loss": 0.3875,
+ "step": 1422
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.056835664378585e-06,
+ "loss": 0.3374,
+ "step": 1423
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.043819920791322e-06,
+ "loss": 0.3604,
+ "step": 1424
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.03081979659195e-06,
+ "loss": 0.3292,
+ "step": 1425
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.017835325871781e-06,
+ "loss": 0.3759,
+ "step": 1426
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 4.004866542681079e-06,
+ "loss": 0.4933,
+ "step": 1427
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 3.991913481028965e-06,
+ "loss": 0.343,
+ "step": 1428
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 3.978976174883329e-06,
+ "loss": 0.3699,
+ "step": 1429
+ },
+ {
+ "epoch": 0.71,
+ "learning_rate": 3.966054658170754e-06,
+ "loss": 0.3808,
+ "step": 1430
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.953148964776408e-06,
+ "loss": 0.3943,
+ "step": 1431
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.940259128543967e-06,
+ "loss": 0.4061,
+ "step": 1432
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.927385183275522e-06,
+ "loss": 0.3791,
+ "step": 1433
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.914527162731498e-06,
+ "loss": 0.3934,
+ "step": 1434
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.901685100630554e-06,
+ "loss": 0.3913,
+ "step": 1435
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.888859030649498e-06,
+ "loss": 0.3934,
+ "step": 1436
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.876048986423207e-06,
+ "loss": 0.4283,
+ "step": 1437
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.863255001544526e-06,
+ "loss": 0.3086,
+ "step": 1438
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.8504771095641905e-06,
+ "loss": 0.358,
+ "step": 1439
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.837715343990727e-06,
+ "loss": 0.3836,
+ "step": 1440
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.824969738290386e-06,
+ "loss": 0.351,
+ "step": 1441
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.81224032588703e-06,
+ "loss": 0.3598,
+ "step": 1442
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.7995271401620548e-06,
+ "loss": 0.4081,
+ "step": 1443
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.7868302144543146e-06,
+ "loss": 0.3872,
+ "step": 1444
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.7741495820600128e-06,
+ "loss": 0.3478,
+ "step": 1445
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.7614852762326303e-06,
+ "loss": 0.4235,
+ "step": 1446
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.7488373301828296e-06,
+ "loss": 0.3883,
+ "step": 1447
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.736205777078381e-06,
+ "loss": 0.3884,
+ "step": 1448
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.7235906500440576e-06,
+ "loss": 0.4015,
+ "step": 1449
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 3.7109919821615546e-06,
+ "loss": 0.3689,
+ "step": 1450
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.6984098064694174e-06,
+ "loss": 0.3799,
+ "step": 1451
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.685844155962931e-06,
+ "loss": 0.3901,
+ "step": 1452
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.673295063594049e-06,
+ "loss": 0.4099,
+ "step": 1453
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.6607625622713005e-06,
+ "loss": 0.3802,
+ "step": 1454
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.6482466848597164e-06,
+ "loss": 0.3853,
+ "step": 1455
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.63574746418072e-06,
+ "loss": 0.3828,
+ "step": 1456
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.6232649330120608e-06,
+ "loss": 0.3936,
+ "step": 1457
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.610799124087725e-06,
+ "loss": 0.3601,
+ "step": 1458
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.5983500700978425e-06,
+ "loss": 0.3573,
+ "step": 1459
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.585917803688603e-06,
+ "loss": 0.4001,
+ "step": 1460
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.5735023574621765e-06,
+ "loss": 0.3791,
+ "step": 1461
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.5611037639766267e-06,
+ "loss": 0.3672,
+ "step": 1462
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.548722055745818e-06,
+ "loss": 0.4283,
+ "step": 1463
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.536357265239333e-06,
+ "loss": 0.3697,
+ "step": 1464
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.5240094248824e-06,
+ "loss": 0.3795,
+ "step": 1465
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.511678567055786e-06,
+ "loss": 0.3783,
+ "step": 1466
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.4993647240957307e-06,
+ "loss": 0.3738,
+ "step": 1467
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.487067928293848e-06,
+ "loss": 0.3223,
+ "step": 1468
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.4747882118970565e-06,
+ "loss": 0.4047,
+ "step": 1469
+ },
+ {
+ "epoch": 0.73,
+ "learning_rate": 3.4625256071074776e-06,
+ "loss": 0.3838,
+ "step": 1470
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.450280146082361e-06,
+ "loss": 0.3525,
+ "step": 1471
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.4380518609340076e-06,
+ "loss": 0.4043,
+ "step": 1472
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.4258407837296635e-06,
+ "loss": 0.3922,
+ "step": 1473
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.413646946491458e-06,
+ "loss": 0.4094,
+ "step": 1474
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.4014703811963024e-06,
+ "loss": 0.4139,
+ "step": 1475
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.3893111197758276e-06,
+ "loss": 0.3647,
+ "step": 1476
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.3771691941162755e-06,
+ "loss": 0.3618,
+ "step": 1477
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.3650446360584276e-06,
+ "loss": 0.4106,
+ "step": 1478
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.35293747739753e-06,
+ "loss": 0.3881,
+ "step": 1479
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.3408477498831917e-06,
+ "loss": 0.4383,
+ "step": 1480
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.3287754852193143e-06,
+ "loss": 0.3771,
+ "step": 1481
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.3167207150640003e-06,
+ "loss": 0.3858,
+ "step": 1482
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.304683471029485e-06,
+ "loss": 0.3392,
+ "step": 1483
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.2926637846820366e-06,
+ "loss": 0.4236,
+ "step": 1484
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.280661687541876e-06,
+ "loss": 0.3373,
+ "step": 1485
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.268677211083109e-06,
+ "loss": 0.424,
+ "step": 1486
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.256710386733629e-06,
+ "loss": 0.4352,
+ "step": 1487
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.2447612458750365e-06,
+ "loss": 0.3318,
+ "step": 1488
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.2328298198425556e-06,
+ "loss": 0.3844,
+ "step": 1489
+ },
+ {
+ "epoch": 0.74,
+ "learning_rate": 3.2209161399249677e-06,
+ "loss": 0.3987,
+ "step": 1490
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.209020237364505e-06,
+ "loss": 0.3508,
+ "step": 1491
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.197142143356787e-06,
+ "loss": 0.3412,
+ "step": 1492
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.1852818890507255e-06,
+ "loss": 0.3601,
+ "step": 1493
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.1734395055484623e-06,
+ "loss": 0.3728,
+ "step": 1494
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.1616150239052647e-06,
+ "loss": 0.4014,
+ "step": 1495
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.1498084751294523e-06,
+ "loss": 0.3344,
+ "step": 1496
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.1380198901823313e-06,
+ "loss": 0.3488,
+ "step": 1497
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.126249299978086e-06,
+ "loss": 0.3459,
+ "step": 1498
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.1144967353837196e-06,
+ "loss": 0.4079,
+ "step": 1499
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.1027622272189572e-06,
+ "loss": 0.3557,
+ "step": 1500
+ },
+ {
+ "epoch": 0.75,
+ "eval_code_gate_load": [
+ 207.05,
+ 170.35,
+ 182.3,
+ 156.9,
+ 179.05,
+ 184.35,
+ 193.1,
+ 166.9
+ ],
+ "eval_code_loss": 0.2623046934604645,
+ "eval_code_runtime": 1.7777,
+ "eval_code_samples_per_second": 562.517,
+ "eval_code_steps_per_second": 35.439,
+ "step": 1500
+ },
+ {
+ "epoch": 0.75,
+ "eval_orca_gate_load": [
+ 501.2,
+ 343.6,
+ 407.1,
+ 397.6,
+ 349.35,
+ 418.55,
+ 371.3,
+ 353.6
+ ],
+ "eval_orca_loss": 0.35224610567092896,
+ "eval_orca_runtime": 2.0086,
+ "eval_orca_samples_per_second": 497.848,
+ "eval_orca_steps_per_second": 31.364,
+ "step": 1500
+ },
+ {
+ "epoch": 0.75,
+ "eval_math_gate_load": [
+ 318.45,
+ 216.6,
+ 235.85,
+ 226.0,
+ 251.4,
+ 258.6,
+ 275.95,
+ 249.85
+ ],
+ "eval_math_loss": 0.3041015565395355,
+ "eval_math_runtime": 1.8624,
+ "eval_math_samples_per_second": 536.934,
+ "eval_math_steps_per_second": 33.827,
+ "step": 1500
+ },
+ {
+ "epoch": 0.75,
+ "eval_sharegpt_gate_load": [
+ 1534.6,
+ 1081.35,
+ 1305.0,
+ 1210.7,
+ 1133.0,
+ 1357.35,
+ 1191.1,
+ 1098.5
+ ],
+ "eval_sharegpt_loss": 0.530078113079071,
+ "eval_sharegpt_runtime": 2.9848,
+ "eval_sharegpt_samples_per_second": 335.033,
+ "eval_sharegpt_steps_per_second": 21.107,
+ "step": 1500
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.0910458062561865e-06,
+ "loss": 0.3699,
+ "step": 1501
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.0793475032203513e-06,
+ "loss": 0.4025,
+ "step": 1502
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.0676673487888854e-06,
+ "loss": 0.3846,
+ "step": 1503
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.0560053735916372e-06,
+ "loss": 0.3701,
+ "step": 1504
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.0443616082107753e-06,
+ "loss": 0.4251,
+ "step": 1505
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.032736083180716e-06,
+ "loss": 0.3944,
+ "step": 1506
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.0211288289880404e-06,
+ "loss": 0.3492,
+ "step": 1507
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 3.009539876071427e-06,
+ "loss": 0.3408,
+ "step": 1508
+ },
+ {
+ "epoch": 0.75,
+ "learning_rate": 2.997969254821548e-06,
+ "loss": 0.3257,
+ "step": 1509
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.9864169955810085e-06,
+ "loss": 0.3895,
+ "step": 1510
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.974883128644266e-06,
+ "loss": 0.367,
+ "step": 1511
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.9633676842575386e-06,
+ "loss": 0.3883,
+ "step": 1512
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.951870692618739e-06,
+ "loss": 0.3662,
+ "step": 1513
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.940392183877382e-06,
+ "loss": 0.338,
+ "step": 1514
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.9289321881345257e-06,
+ "loss": 0.3624,
+ "step": 1515
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.9174907354426696e-06,
+ "loss": 0.3262,
+ "step": 1516
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.9060678558056876e-06,
+ "loss": 0.3512,
+ "step": 1517
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.8946635791787546e-06,
+ "loss": 0.4026,
+ "step": 1518
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.883277935468254e-06,
+ "loss": 0.3844,
+ "step": 1519
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.8719109545317102e-06,
+ "loss": 0.3645,
+ "step": 1520
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.8605626661776995e-06,
+ "loss": 0.3448,
+ "step": 1521
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.849233100165795e-06,
+ "loss": 0.405,
+ "step": 1522
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.837922286206457e-06,
+ "loss": 0.3771,
+ "step": 1523
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.8266302539609747e-06,
+ "loss": 0.354,
+ "step": 1524
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.8153570330413925e-06,
+ "loss": 0.3807,
+ "step": 1525
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.8041026530104144e-06,
+ "loss": 0.3702,
+ "step": 1526
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.7928671433813392e-06,
+ "loss": 0.3836,
+ "step": 1527
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.78165053361798e-06,
+ "loss": 0.3682,
+ "step": 1528
+ },
+ {
+ "epoch": 0.76,
+ "learning_rate": 2.770452853134593e-06,
+ "loss": 0.3843,
+ "step": 1529
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.759274131295787e-06,
+ "loss": 0.4317,
+ "step": 1530
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.7481143974164548e-06,
+ "loss": 0.3398,
+ "step": 1531
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.736973680761702e-06,
+ "loss": 0.3638,
+ "step": 1532
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.7258520105467566e-06,
+ "loss": 0.3795,
+ "step": 1533
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.714749415936904e-06,
+ "loss": 0.3695,
+ "step": 1534
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.7036659260473973e-06,
+ "loss": 0.3323,
+ "step": 1535
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.692601569943407e-06,
+ "loss": 0.3591,
+ "step": 1536
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.6815563766399122e-06,
+ "loss": 0.3609,
+ "step": 1537
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.670530375101641e-06,
+ "loss": 0.3286,
+ "step": 1538
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.6595235942430044e-06,
+ "loss": 0.347,
+ "step": 1539
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.648536062927999e-06,
+ "loss": 0.3776,
+ "step": 1540
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.637567809970143e-06,
+ "loss": 0.469,
+ "step": 1541
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.6266188641324e-06,
+ "loss": 0.3424,
+ "step": 1542
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.6156892541271083e-06,
+ "loss": 0.384,
+ "step": 1543
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.604779008615895e-06,
+ "loss": 0.3755,
+ "step": 1544
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.593888156209603e-06,
+ "loss": 0.37,
+ "step": 1545
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.583016725468226e-06,
+ "loss": 0.3906,
+ "step": 1546
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.572164744900827e-06,
+ "loss": 0.3445,
+ "step": 1547
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.5613322429654573e-06,
+ "loss": 0.3819,
+ "step": 1548
+ },
+ {
+ "epoch": 0.77,
+ "learning_rate": 2.5505192480690865e-06,
+ "loss": 0.305,
+ "step": 1549
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.5397257885675396e-06,
+ "loss": 0.2964,
+ "step": 1550
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.528951892765402e-06,
+ "loss": 0.3686,
+ "step": 1551
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.5181975889159615e-06,
+ "loss": 0.3582,
+ "step": 1552
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.507462905221122e-06,
+ "loss": 0.3419,
+ "step": 1553
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.496747869831345e-06,
+ "loss": 0.3447,
+ "step": 1554
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.48605251084556e-06,
+ "loss": 0.3513,
+ "step": 1555
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.475376856311097e-06,
+ "loss": 0.3515,
+ "step": 1556
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.464720934223619e-06,
+ "loss": 0.347,
+ "step": 1557
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.4540847725270376e-06,
+ "loss": 0.3591,
+ "step": 1558
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.4434683991134476e-06,
+ "loss": 0.3858,
+ "step": 1559
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.432871841823047e-06,
+ "loss": 0.3432,
+ "step": 1560
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.4222951284440776e-06,
+ "loss": 0.3379,
+ "step": 1561
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.411738286712735e-06,
+ "loss": 0.3651,
+ "step": 1562
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.401201344313102e-06,
+ "loss": 0.411,
+ "step": 1563
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.390684328877089e-06,
+ "loss": 0.3929,
+ "step": 1564
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.3801872679843384e-06,
+ "loss": 0.335,
+ "step": 1565
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.36971018916217e-06,
+ "loss": 0.347,
+ "step": 1566
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.3592531198854974e-06,
+ "loss": 0.36,
+ "step": 1567
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.3488160875767717e-06,
+ "loss": 0.3374,
+ "step": 1568
+ },
+ {
+ "epoch": 0.78,
+ "learning_rate": 2.3383991196058918e-06,
+ "loss": 0.287,
+ "step": 1569
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.328002243290138e-06,
+ "loss": 0.3322,
+ "step": 1570
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.317625485894113e-06,
+ "loss": 0.324,
+ "step": 1571
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.307268874629649e-06,
+ "loss": 0.3286,
+ "step": 1572
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.296932436655752e-06,
+ "loss": 0.3224,
+ "step": 1573
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.2866161990785228e-06,
+ "loss": 0.3541,
+ "step": 1574
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.2763201889510987e-06,
+ "loss": 0.3138,
+ "step": 1575
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.266044433273562e-06,
+ "loss": 0.3317,
+ "step": 1576
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.2557889589928815e-06,
+ "loss": 0.3571,
+ "step": 1577
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.245553793002849e-06,
+ "loss": 0.3136,
+ "step": 1578
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.23533896214399e-06,
+ "loss": 0.3203,
+ "step": 1579
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.2251444932035094e-06,
+ "loss": 0.3453,
+ "step": 1580
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.2149704129152083e-06,
+ "loss": 0.4132,
+ "step": 1581
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.204816747959434e-06,
+ "loss": 0.368,
+ "step": 1582
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.194683524962986e-06,
+ "loss": 0.3031,
+ "step": 1583
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.184570770499056e-06,
+ "loss": 0.3408,
+ "step": 1584
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.1744785110871713e-06,
+ "loss": 0.3161,
+ "step": 1585
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.1644067731931005e-06,
+ "loss": 0.2998,
+ "step": 1586
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.1543555832288056e-06,
+ "loss": 0.3601,
+ "step": 1587
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.1443249675523536e-06,
+ "loss": 0.343,
+ "step": 1588
+ },
+ {
+ "epoch": 0.79,
+ "learning_rate": 2.134314952467873e-06,
+ "loss": 0.306,
+ "step": 1589
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.124325564225458e-06,
+ "loss": 0.3214,
+ "step": 1590
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.1143568290211115e-06,
+ "loss": 0.3413,
+ "step": 1591
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.1044087729966856e-06,
+ "loss": 0.3369,
+ "step": 1592
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.0944814222397948e-06,
+ "loss": 0.3431,
+ "step": 1593
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.0845748027837585e-06,
+ "loss": 0.2723,
+ "step": 1594
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.074688940607529e-06,
+ "loss": 0.326,
+ "step": 1595
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.064823861635633e-06,
+ "loss": 0.2964,
+ "step": 1596
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.0549795917380867e-06,
+ "loss": 0.3481,
+ "step": 1597
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.0451561567303378e-06,
+ "loss": 0.3613,
+ "step": 1598
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.0353535823732053e-06,
+ "loss": 0.3455,
+ "step": 1599
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.025571894372794e-06,
+ "loss": 0.2833,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8,
+ "eval_code_gate_load": [
+ 206.95,
+ 170.4,
+ 179.9,
+ 153.6,
+ 179.0,
+ 186.85,
+ 195.1,
+ 168.2
+ ],
+ "eval_code_loss": 0.261962890625,
+ "eval_code_runtime": 1.7896,
+ "eval_code_samples_per_second": 558.77,
+ "eval_code_steps_per_second": 35.203,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8,
+ "eval_orca_gate_load": [
+ 498.0,
+ 341.05,
+ 410.9,
+ 396.4,
+ 351.45,
+ 419.25,
+ 369.95,
+ 355.3
+ ],
+ "eval_orca_loss": 0.3511718809604645,
+ "eval_orca_runtime": 2.0018,
+ "eval_orca_samples_per_second": 499.561,
+ "eval_orca_steps_per_second": 31.472,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8,
+ "eval_math_gate_load": [
+ 314.8,
+ 213.8,
+ 239.55,
+ 221.0,
+ 254.25,
+ 260.1,
+ 278.1,
+ 251.1
+ ],
+ "eval_math_loss": 0.24335937201976776,
+ "eval_math_runtime": 1.8471,
+ "eval_math_samples_per_second": 541.403,
+ "eval_math_steps_per_second": 34.108,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8,
+ "eval_sharegpt_gate_load": [
+ 1525.0,
+ 1074.95,
+ 1316.25,
+ 1205.0,
+ 1138.7,
+ 1360.4,
+ 1181.75,
+ 1109.55
+ ],
+ "eval_sharegpt_loss": 0.48750001192092896,
+ "eval_sharegpt_runtime": 2.9974,
+ "eval_sharegpt_samples_per_second": 333.624,
+ "eval_sharegpt_steps_per_second": 21.018,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.0158111183804407e-06,
+ "loss": 0.3204,
+ "step": 1601
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 2.0060712799926407e-06,
+ "loss": 0.2845,
+ "step": 1602
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 1.9963524047509898e-06,
+ "loss": 0.3192,
+ "step": 1603
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 1.9866545181421016e-06,
+ "loss": 0.321,
+ "step": 1604
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 1.976977645597552e-06,
+ "loss": 0.3466,
+ "step": 1605
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 1.967321812493813e-06,
+ "loss": 0.341,
+ "step": 1606
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 1.9576870441521834e-06,
+ "loss": 0.2962,
+ "step": 1607
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 1.9480733658387175e-06,
+ "loss": 0.3237,
+ "step": 1608
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 1.9384808027641666e-06,
+ "loss": 0.3164,
+ "step": 1609
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.9289093800839067e-06,
+ "loss": 0.3583,
+ "step": 1610
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.9193591228978815e-06,
+ "loss": 0.2978,
+ "step": 1611
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.9098300562505266e-06,
+ "loss": 0.2947,
+ "step": 1612
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.9003222051307046e-06,
+ "loss": 0.2788,
+ "step": 1613
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.8908355944716516e-06,
+ "loss": 0.3126,
+ "step": 1614
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.8813702491508956e-06,
+ "loss": 0.3048,
+ "step": 1615
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.8719261939902023e-06,
+ "loss": 0.2466,
+ "step": 1616
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.862503453755502e-06,
+ "loss": 0.3255,
+ "step": 1617
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.8531020531568377e-06,
+ "loss": 0.3581,
+ "step": 1618
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.8437220168482839e-06,
+ "loss": 0.3437,
+ "step": 1619
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.8343633694278895e-06,
+ "loss": 0.3263,
+ "step": 1620
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.825026135437622e-06,
+ "loss": 0.323,
+ "step": 1621
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.8157103393632869e-06,
+ "loss": 0.2819,
+ "step": 1622
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.8064160056344714e-06,
+ "loss": 0.2967,
+ "step": 1623
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.7971431586244814e-06,
+ "loss": 0.3171,
+ "step": 1624
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.7878918226502816e-06,
+ "loss": 0.3275,
+ "step": 1625
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.7786620219724205e-06,
+ "loss": 0.3351,
+ "step": 1626
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.7694537807949707e-06,
+ "loss": 0.2992,
+ "step": 1627
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.7602671232654755e-06,
+ "loss": 0.3353,
+ "step": 1628
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.751102073474873e-06,
+ "loss": 0.3463,
+ "step": 1629
+ },
+ {
+ "epoch": 0.81,
+ "learning_rate": 1.7419586554574364e-06,
+ "loss": 0.3037,
+ "step": 1630
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.7328368931907114e-06,
+ "loss": 0.2644,
+ "step": 1631
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.723736810595461e-06,
+ "loss": 0.2871,
+ "step": 1632
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.7146584315355886e-06,
+ "loss": 0.3261,
+ "step": 1633
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.7056017798180824e-06,
+ "loss": 0.3192,
+ "step": 1634
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.69656687919296e-06,
+ "loss": 0.3082,
+ "step": 1635
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.687553753353195e-06,
+ "loss": 0.3397,
+ "step": 1636
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.6785624259346556e-06,
+ "loss": 0.3451,
+ "step": 1637
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.669592920516049e-06,
+ "loss": 0.3263,
+ "step": 1638
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.660645260618864e-06,
+ "loss": 0.2625,
+ "step": 1639
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.6517194697072903e-06,
+ "loss": 0.2828,
+ "step": 1640
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.6428155711881722e-06,
+ "loss": 0.3153,
+ "step": 1641
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.633933588410952e-06,
+ "loss": 0.3264,
+ "step": 1642
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.6250735446675914e-06,
+ "loss": 0.2978,
+ "step": 1643
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.6162354631925203e-06,
+ "loss": 0.3242,
+ "step": 1644
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.607419367162577e-06,
+ "loss": 0.3276,
+ "step": 1645
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.5986252796969482e-06,
+ "loss": 0.3074,
+ "step": 1646
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.589853223857103e-06,
+ "loss": 0.3643,
+ "step": 1647
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.5811032226467304e-06,
+ "loss": 0.3557,
+ "step": 1648
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.5723752990116948e-06,
+ "loss": 0.3167,
+ "step": 1649
+ },
+ {
+ "epoch": 0.82,
+ "learning_rate": 1.5636694758399563e-06,
+ "loss": 0.2939,
+ "step": 1650
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.5549857759615195e-06,
+ "loss": 0.2991,
+ "step": 1651
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.5463242221483742e-06,
+ "loss": 0.3062,
+ "step": 1652
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.5376848371144404e-06,
+ "loss": 0.2834,
+ "step": 1653
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.5290676435154949e-06,
+ "loss": 0.3087,
+ "step": 1654
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.520472663949122e-06,
+ "loss": 0.3508,
+ "step": 1655
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.511899920954656e-06,
+ "loss": 0.2915,
+ "step": 1656
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.5033494370131162e-06,
+ "loss": 0.2971,
+ "step": 1657
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.4948212345471492e-06,
+ "loss": 0.3288,
+ "step": 1658
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.4863153359209693e-06,
+ "loss": 0.3216,
+ "step": 1659
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.4778317634403082e-06,
+ "loss": 0.3173,
+ "step": 1660
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.469370539352345e-06,
+ "loss": 0.3572,
+ "step": 1661
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.460931685845649e-06,
+ "loss": 0.3071,
+ "step": 1662
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.4525152250501362e-06,
+ "loss": 0.3556,
+ "step": 1663
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.4441211790369892e-06,
+ "loss": 0.294,
+ "step": 1664
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.4357495698186186e-06,
+ "loss": 0.2905,
+ "step": 1665
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.427400419348588e-06,
+ "loss": 0.3211,
+ "step": 1666
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.4190737495215746e-06,
+ "loss": 0.2948,
+ "step": 1667
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.4107695821733026e-06,
+ "loss": 0.2951,
+ "step": 1668
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.402487939080479e-06,
+ "loss": 0.2877,
+ "step": 1669
+ },
+ {
+ "epoch": 0.83,
+ "learning_rate": 1.3942288419607476e-06,
+ "loss": 0.3135,
+ "step": 1670
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3859923124726283e-06,
+ "loss": 0.2892,
+ "step": 1671
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3777783722154603e-06,
+ "loss": 0.286,
+ "step": 1672
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.369587042729341e-06,
+ "loss": 0.2768,
+ "step": 1673
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3614183454950824e-06,
+ "loss": 0.3297,
+ "step": 1674
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3532723019341376e-06,
+ "loss": 0.2959,
+ "step": 1675
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3451489334085555e-06,
+ "loss": 0.3296,
+ "step": 1676
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3370482612209224e-06,
+ "loss": 0.3014,
+ "step": 1677
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3289703066143112e-06,
+ "loss": 0.3268,
+ "step": 1678
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3209150907722124e-06,
+ "loss": 0.2981,
+ "step": 1679
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3128826348184886e-06,
+ "loss": 0.2967,
+ "step": 1680
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.3048729598173248e-06,
+ "loss": 0.2178,
+ "step": 1681
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.296886086773157e-06,
+ "loss": 0.2973,
+ "step": 1682
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.2889220366306276e-06,
+ "loss": 0.3261,
+ "step": 1683
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.2809808302745298e-06,
+ "loss": 0.2954,
+ "step": 1684
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.2730624885297537e-06,
+ "loss": 0.3454,
+ "step": 1685
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.2651670321612264e-06,
+ "loss": 0.3073,
+ "step": 1686
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.2572944818738587e-06,
+ "loss": 0.2906,
+ "step": 1687
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.249444858312502e-06,
+ "loss": 0.3021,
+ "step": 1688
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.2416181820618745e-06,
+ "loss": 0.2942,
+ "step": 1689
+ },
+ {
+ "epoch": 0.84,
+ "learning_rate": 1.233814473646524e-06,
+ "loss": 0.3381,
+ "step": 1690
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.226033753530763e-06,
+ "loss": 0.3046,
+ "step": 1691
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.218276042118629e-06,
+ "loss": 0.3362,
+ "step": 1692
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.2105413597538107e-06,
+ "loss": 0.3149,
+ "step": 1693
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.202829726719611e-06,
+ "loss": 0.2529,
+ "step": 1694
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.195141163238892e-06,
+ "loss": 0.3058,
+ "step": 1695
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1874756894740137e-06,
+ "loss": 0.3242,
+ "step": 1696
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1798333255267857e-06,
+ "loss": 0.2882,
+ "step": 1697
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1722140914384162e-06,
+ "loss": 0.2735,
+ "step": 1698
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1646180071894608e-06,
+ "loss": 0.2994,
+ "step": 1699
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1570450926997657e-06,
+ "loss": 0.2764,
+ "step": 1700
+ },
+ {
+ "epoch": 0.85,
+ "eval_code_gate_load": [
+ 207.4,
+ 170.6,
+ 179.4,
+ 154.8,
+ 177.85,
+ 186.5,
+ 195.0,
+ 168.45
+ ],
+ "eval_code_loss": 0.26079100370407104,
+ "eval_code_runtime": 1.7851,
+ "eval_code_samples_per_second": 560.183,
+ "eval_code_steps_per_second": 35.292,
+ "step": 1700
+ },
+ {
+ "epoch": 0.85,
+ "eval_orca_gate_load": [
+ 499.15,
+ 341.7,
+ 406.8,
+ 396.0,
+ 351.7,
+ 421.4,
+ 368.95,
+ 356.6
+ ],
+ "eval_orca_loss": 0.35004884004592896,
+ "eval_orca_runtime": 2.0049,
+ "eval_orca_samples_per_second": 498.779,
+ "eval_orca_steps_per_second": 31.423,
+ "step": 1700
+ },
+ {
+ "epoch": 0.85,
+ "eval_math_gate_load": [
+ 314.35,
+ 215.9,
+ 240.2,
+ 220.65,
+ 253.5,
+ 262.05,
+ 277.05,
+ 249.0
+ ],
+ "eval_math_loss": 0.2432861328125,
+ "eval_math_runtime": 1.8844,
+ "eval_math_samples_per_second": 530.665,
+ "eval_math_steps_per_second": 33.432,
+ "step": 1700
+ },
+ {
+ "epoch": 0.85,
+ "eval_sharegpt_gate_load": [
+ 1523.75,
+ 1077.65,
+ 1308.6,
+ 1203.3,
+ 1140.45,
+ 1368.2,
+ 1179.25,
+ 1110.4
+ ],
+ "eval_sharegpt_loss": 0.4883789122104645,
+ "eval_sharegpt_runtime": 3.003,
+ "eval_sharegpt_samples_per_second": 333.001,
+ "eval_sharegpt_steps_per_second": 20.979,
+ "step": 1700
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1494953678284105e-06,
+ "loss": 0.2901,
+ "step": 1701
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1419688523736761e-06,
+ "loss": 0.3298,
+ "step": 1702
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1344655660729676e-06,
+ "loss": 0.3355,
+ "step": 1703
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1269855286027798e-06,
+ "loss": 0.323,
+ "step": 1704
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1195287595786352e-06,
+ "loss": 0.3004,
+ "step": 1705
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1120952785550477e-06,
+ "loss": 0.258,
+ "step": 1706
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.1046851050254504e-06,
+ "loss": 0.2928,
+ "step": 1707
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.0972982584221592e-06,
+ "loss": 0.294,
+ "step": 1708
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.0899347581163222e-06,
+ "loss": 0.3543,
+ "step": 1709
+ },
+ {
+ "epoch": 0.85,
+ "learning_rate": 1.0825946234178575e-06,
+ "loss": 0.3076,
+ "step": 1710
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.0752778735754121e-06,
+ "loss": 0.2811,
+ "step": 1711
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.067984527776309e-06,
+ "loss": 0.2797,
+ "step": 1712
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.0607146051465011e-06,
+ "loss": 0.2882,
+ "step": 1713
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.0534681247505107e-06,
+ "loss": 0.2896,
+ "step": 1714
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.0462451055913847e-06,
+ "loss": 0.2878,
+ "step": 1715
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.0390455666106547e-06,
+ "loss": 0.3366,
+ "step": 1716
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.0318695266882696e-06,
+ "loss": 0.2932,
+ "step": 1717
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.024717004642557e-06,
+ "loss": 0.3224,
+ "step": 1718
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.0175880192301713e-06,
+ "loss": 0.3163,
+ "step": 1719
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.010482589146048e-06,
+ "loss": 0.3059,
+ "step": 1720
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 1.0034007330233487e-06,
+ "loss": 0.3024,
+ "step": 1721
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.963424694334122e-07,
+ "loss": 0.31,
+ "step": 1722
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.893078168857173e-07,
+ "loss": 0.2936,
+ "step": 1723
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.822967938278172e-07,
+ "loss": 0.244,
+ "step": 1724
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.753094186453028e-07,
+ "loss": 0.362,
+ "step": 1725
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.683457096617487e-07,
+ "loss": 0.3021,
+ "step": 1726
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.614056851386743e-07,
+ "loss": 0.2698,
+ "step": 1727
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.544893632754816e-07,
+ "loss": 0.3254,
+ "step": 1728
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.475967622094207e-07,
+ "loss": 0.3472,
+ "step": 1729
+ },
+ {
+ "epoch": 0.86,
+ "learning_rate": 9.407279000155311e-07,
+ "loss": 0.3558,
+ "step": 1730
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 9.338827947066076e-07,
+ "loss": 0.2826,
+ "step": 1731
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 9.270614642331377e-07,
+ "loss": 0.287,
+ "step": 1732
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 9.202639264832669e-07,
+ "loss": 0.3271,
+ "step": 1733
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 9.134901992827427e-07,
+ "loss": 0.3017,
+ "step": 1734
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 9.067403003948783e-07,
+ "loss": 0.2957,
+ "step": 1735
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 9.000142475204965e-07,
+ "loss": 0.2881,
+ "step": 1736
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.933120582978827e-07,
+ "loss": 0.3096,
+ "step": 1737
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.866337503027523e-07,
+ "loss": 0.3048,
+ "step": 1738
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.799793410481871e-07,
+ "loss": 0.2805,
+ "step": 1739
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.733488479845997e-07,
+ "loss": 0.3241,
+ "step": 1740
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.667422884996823e-07,
+ "loss": 0.2841,
+ "step": 1741
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.60159679918372e-07,
+ "loss": 0.3075,
+ "step": 1742
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.536010395027905e-07,
+ "loss": 0.3022,
+ "step": 1743
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.470663844522053e-07,
+ "loss": 0.2859,
+ "step": 1744
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.405557319029911e-07,
+ "loss": 0.2964,
+ "step": 1745
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.340690989285727e-07,
+ "loss": 0.2967,
+ "step": 1746
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.276065025393909e-07,
+ "loss": 0.3191,
+ "step": 1747
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.211679596828481e-07,
+ "loss": 0.27,
+ "step": 1748
+ },
+ {
+ "epoch": 0.87,
+ "learning_rate": 8.147534872432761e-07,
+ "loss": 0.3067,
+ "step": 1749
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 8.083631020418792e-07,
+ "loss": 0.3003,
+ "step": 1750
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 8.019968208366958e-07,
+ "loss": 0.3156,
+ "step": 1751
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.956546603225601e-07,
+ "loss": 0.2852,
+ "step": 1752
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.893366371310463e-07,
+ "loss": 0.3485,
+ "step": 1753
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.830427678304353e-07,
+ "loss": 0.3002,
+ "step": 1754
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.767730689256614e-07,
+ "loss": 0.3188,
+ "step": 1755
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.705275568582848e-07,
+ "loss": 0.3151,
+ "step": 1756
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.643062480064301e-07,
+ "loss": 0.2763,
+ "step": 1757
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.581091586847522e-07,
+ "loss": 0.3074,
+ "step": 1758
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.519363051443996e-07,
+ "loss": 0.2607,
+ "step": 1759
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.457877035729588e-07,
+ "loss": 0.333,
+ "step": 1760
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.3966337009442e-07,
+ "loss": 0.3025,
+ "step": 1761
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.335633207691362e-07,
+ "loss": 0.2867,
+ "step": 1762
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.274875715937746e-07,
+ "loss": 0.2828,
+ "step": 1763
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.21436138501278e-07,
+ "loss": 0.2814,
+ "step": 1764
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.154090373608236e-07,
+ "loss": 0.2994,
+ "step": 1765
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.094062839777838e-07,
+ "loss": 0.2821,
+ "step": 1766
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 7.03427894093679e-07,
+ "loss": 0.2824,
+ "step": 1767
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 6.974738833861383e-07,
+ "loss": 0.2706,
+ "step": 1768
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 6.915442674688633e-07,
+ "loss": 0.2929,
+ "step": 1769
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.856390618915775e-07,
+ "loss": 0.3311,
+ "step": 1770
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.797582821399973e-07,
+ "loss": 0.31,
+ "step": 1771
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.739019436357774e-07,
+ "loss": 0.3079,
+ "step": 1772
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.680700617364877e-07,
+ "loss": 0.307,
+ "step": 1773
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.622626517355557e-07,
+ "loss": 0.3126,
+ "step": 1774
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.564797288622371e-07,
+ "loss": 0.3571,
+ "step": 1775
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.507213082815745e-07,
+ "loss": 0.277,
+ "step": 1776
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.449874050943549e-07,
+ "loss": 0.3365,
+ "step": 1777
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.392780343370686e-07,
+ "loss": 0.2917,
+ "step": 1778
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.335932109818754e-07,
+ "loss": 0.2717,
+ "step": 1779
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.279329499365649e-07,
+ "loss": 0.2477,
+ "step": 1780
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.222972660445082e-07,
+ "loss": 0.3076,
+ "step": 1781
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.166861740846297e-07,
+ "loss": 0.2606,
+ "step": 1782
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.11099688771366e-07,
+ "loss": 0.2695,
+ "step": 1783
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.055378247546217e-07,
+ "loss": 0.3152,
+ "step": 1784
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 6.000005966197387e-07,
+ "loss": 0.268,
+ "step": 1785
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 5.94488018887448e-07,
+ "loss": 0.3017,
+ "step": 1786
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 5.890001060138484e-07,
+ "loss": 0.3279,
+ "step": 1787
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 5.835368723903456e-07,
+ "loss": 0.2949,
+ "step": 1788
+ },
+ {
+ "epoch": 0.89,
+ "learning_rate": 5.780983323436374e-07,
+ "loss": 0.3345,
+ "step": 1789
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.726845001356573e-07,
+ "loss": 0.3232,
+ "step": 1790
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.672953899635524e-07,
+ "loss": 0.2993,
+ "step": 1791
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.619310159596358e-07,
+ "loss": 0.3065,
+ "step": 1792
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.565913921913513e-07,
+ "loss": 0.2778,
+ "step": 1793
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.51276532661238e-07,
+ "loss": 0.2822,
+ "step": 1794
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.459864513068991e-07,
+ "loss": 0.3144,
+ "step": 1795
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.407211620009545e-07,
+ "loss": 0.3007,
+ "step": 1796
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.354806785510113e-07,
+ "loss": 0.2625,
+ "step": 1797
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.30265014699628e-07,
+ "loss": 0.293,
+ "step": 1798
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.250741841242735e-07,
+ "loss": 0.2848,
+ "step": 1799
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.199082004372958e-07,
+ "loss": 0.2789,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9,
+ "eval_code_gate_load": [
+ 206.3,
+ 171.0,
+ 178.4,
+ 153.6,
+ 178.55,
+ 188.05,
+ 195.05,
+ 169.05
+ ],
+ "eval_code_loss": 0.24262695014476776,
+ "eval_code_runtime": 1.786,
+ "eval_code_samples_per_second": 559.908,
+ "eval_code_steps_per_second": 35.274,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9,
+ "eval_orca_gate_load": [
+ 498.15,
+ 341.45,
+ 405.0,
+ 396.0,
+ 352.45,
+ 421.95,
+ 368.95,
+ 358.35
+ ],
+ "eval_orca_loss": 0.3505859375,
+ "eval_orca_runtime": 2.0051,
+ "eval_orca_samples_per_second": 498.718,
+ "eval_orca_steps_per_second": 31.419,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9,
+ "eval_math_gate_load": [
+ 314.35,
+ 216.7,
+ 239.55,
+ 220.1,
+ 253.35,
+ 261.95,
+ 275.8,
+ 250.9
+ ],
+ "eval_math_loss": 0.24282225966453552,
+ "eval_math_runtime": 1.865,
+ "eval_math_samples_per_second": 536.202,
+ "eval_math_steps_per_second": 33.781,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9,
+ "eval_sharegpt_gate_load": [
+ 1525.35,
+ 1079.5,
+ 1303.65,
+ 1203.5,
+ 1140.4,
+ 1370.1,
+ 1178.8,
+ 1110.3
+ ],
+ "eval_sharegpt_loss": 0.48857420682907104,
+ "eval_sharegpt_runtime": 2.9886,
+ "eval_sharegpt_samples_per_second": 334.604,
+ "eval_sharegpt_steps_per_second": 21.08,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.147670771858848e-07,
+ "loss": 0.3078,
+ "step": 1801
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.096508278520385e-07,
+ "loss": 0.3373,
+ "step": 1802
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 5.045594658525232e-07,
+ "loss": 0.3121,
+ "step": 1803
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 4.994930045388414e-07,
+ "loss": 0.2671,
+ "step": 1804
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 4.944514571971981e-07,
+ "loss": 0.3293,
+ "step": 1805
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 4.894348370484648e-07,
+ "loss": 0.3154,
+ "step": 1806
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 4.844431572481412e-07,
+ "loss": 0.2988,
+ "step": 1807
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 4.794764308863242e-07,
+ "loss": 0.2988,
+ "step": 1808
+ },
+ {
+ "epoch": 0.9,
+ "learning_rate": 4.745346709876786e-07,
+ "loss": 0.292,
+ "step": 1809
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.696178905113913e-07,
+ "loss": 0.2773,
+ "step": 1810
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.6472610235114513e-07,
+ "loss": 0.2962,
+ "step": 1811
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.5985931933508757e-07,
+ "loss": 0.2758,
+ "step": 1812
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.550175542257862e-07,
+ "loss": 0.3309,
+ "step": 1813
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.502008197202068e-07,
+ "loss": 0.3072,
+ "step": 1814
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.454091284496731e-07,
+ "loss": 0.3043,
+ "step": 1815
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.406424929798403e-07,
+ "loss": 0.2964,
+ "step": 1816
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.3590092581065055e-07,
+ "loss": 0.2578,
+ "step": 1817
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.3118443937631094e-07,
+ "loss": 0.2903,
+ "step": 1818
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.26493046045261e-07,
+ "loss": 0.2921,
+ "step": 1819
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.218267581201296e-07,
+ "loss": 0.3116,
+ "step": 1820
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.17185587837714e-07,
+ "loss": 0.2662,
+ "step": 1821
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.125695473689406e-07,
+ "loss": 0.2796,
+ "step": 1822
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.0797864881883977e-07,
+ "loss": 0.3705,
+ "step": 1823
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 4.034129042265067e-07,
+ "loss": 0.265,
+ "step": 1824
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 3.988723255650728e-07,
+ "loss": 0.3237,
+ "step": 1825
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 3.943569247416801e-07,
+ "loss": 0.2654,
+ "step": 1826
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 3.8986671359743767e-07,
+ "loss": 0.3055,
+ "step": 1827
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 3.8540170390740097e-07,
+ "loss": 0.2912,
+ "step": 1828
+ },
+ {
+ "epoch": 0.91,
+ "learning_rate": 3.8096190738053815e-07,
+ "loss": 0.2577,
+ "step": 1829
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.7654733565969826e-07,
+ "loss": 0.3198,
+ "step": 1830
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.721580003215808e-07,
+ "loss": 0.3015,
+ "step": 1831
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.67793912876705e-07,
+ "loss": 0.2554,
+ "step": 1832
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.6345508476938296e-07,
+ "loss": 0.2986,
+ "step": 1833
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.591415273776855e-07,
+ "loss": 0.2911,
+ "step": 1834
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.548532520134129e-07,
+ "loss": 0.2769,
+ "step": 1835
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.5059026992206645e-07,
+ "loss": 0.3182,
+ "step": 1836
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.4635259228282256e-07,
+ "loss": 0.2591,
+ "step": 1837
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.421402302084953e-07,
+ "loss": 0.3009,
+ "step": 1838
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.379531947455128e-07,
+ "loss": 0.2904,
+ "step": 1839
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.3379149687388866e-07,
+ "loss": 0.3221,
+ "step": 1840
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.2965514750718964e-07,
+ "loss": 0.3366,
+ "step": 1841
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.255441574925089e-07,
+ "loss": 0.2937,
+ "step": 1842
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.2145853761043844e-07,
+ "loss": 0.3,
+ "step": 1843
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.1739829857504235e-07,
+ "loss": 0.2915,
+ "step": 1844
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.133634510338235e-07,
+ "loss": 0.2622,
+ "step": 1845
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.093540055676958e-07,
+ "loss": 0.2865,
+ "step": 1846
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.053699726909676e-07,
+ "loss": 0.2946,
+ "step": 1847
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 3.0141136285129825e-07,
+ "loss": 0.3349,
+ "step": 1848
+ },
+ {
+ "epoch": 0.92,
+ "learning_rate": 2.974781864296783e-07,
+ "loss": 0.2425,
+ "step": 1849
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.935704537404083e-07,
+ "loss": 0.2605,
+ "step": 1850
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.8968817503105984e-07,
+ "loss": 0.2733,
+ "step": 1851
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.8583136048245697e-07,
+ "loss": 0.2423,
+ "step": 1852
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.820000202086459e-07,
+ "loss": 0.2926,
+ "step": 1853
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.781941642568686e-07,
+ "loss": 0.242,
+ "step": 1854
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.744138026075405e-07,
+ "loss": 0.3168,
+ "step": 1855
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.706589451742181e-07,
+ "loss": 0.3208,
+ "step": 1856
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.669296018035772e-07,
+ "loss": 0.2808,
+ "step": 1857
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.632257822753881e-07,
+ "loss": 0.2936,
+ "step": 1858
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.5954749630248355e-07,
+ "loss": 0.2861,
+ "step": 1859
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.5589475353073987e-07,
+ "loss": 0.2806,
+ "step": 1860
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.5226756353904925e-07,
+ "loss": 0.3313,
+ "step": 1861
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.486659358392951e-07,
+ "loss": 0.288,
+ "step": 1862
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.450898798763268e-07,
+ "loss": 0.2649,
+ "step": 1863
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.4153940502793185e-07,
+ "loss": 0.2993,
+ "step": 1864
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.380145206048201e-07,
+ "loss": 0.2831,
+ "step": 1865
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.3451523585058756e-07,
+ "loss": 0.269,
+ "step": 1866
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.3104155994170042e-07,
+ "loss": 0.2863,
+ "step": 1867
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.2759350198746978e-07,
+ "loss": 0.2895,
+ "step": 1868
+ },
+ {
+ "epoch": 0.93,
+ "learning_rate": 2.24171071030026e-07,
+ "loss": 0.3511,
+ "step": 1869
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 2.2077427604429435e-07,
+ "loss": 0.3163,
+ "step": 1870
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 2.1740312593797274e-07,
+ "loss": 0.2954,
+ "step": 1871
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 2.1405762955151178e-07,
+ "loss": 0.3207,
+ "step": 1872
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 2.1073779565808471e-07,
+ "loss": 0.2869,
+ "step": 1873
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 2.0744363296356872e-07,
+ "loss": 0.296,
+ "step": 1874
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 2.0417515010652032e-07,
+ "loss": 0.3008,
+ "step": 1875
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 2.009323556581566e-07,
+ "loss": 0.2822,
+ "step": 1876
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.977152581223274e-07,
+ "loss": 0.2678,
+ "step": 1877
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.9452386593549534e-07,
+ "loss": 0.3009,
+ "step": 1878
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.9135818746671587e-07,
+ "loss": 0.2525,
+ "step": 1879
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.8821823101760949e-07,
+ "loss": 0.342,
+ "step": 1880
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.8510400482234848e-07,
+ "loss": 0.3283,
+ "step": 1881
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.8201551704762453e-07,
+ "loss": 0.3051,
+ "step": 1882
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.7895277579264015e-07,
+ "loss": 0.2982,
+ "step": 1883
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.7591578908907724e-07,
+ "loss": 0.2743,
+ "step": 1884
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.7290456490107522e-07,
+ "loss": 0.2717,
+ "step": 1885
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.699191111252241e-07,
+ "loss": 0.3157,
+ "step": 1886
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.6695943559052463e-07,
+ "loss": 0.3009,
+ "step": 1887
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.6402554605838173e-07,
+ "loss": 0.2954,
+ "step": 1888
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.6111745022257873e-07,
+ "loss": 0.2454,
+ "step": 1889
+ },
+ {
+ "epoch": 0.94,
+ "learning_rate": 1.5823515570925763e-07,
+ "loss": 0.325,
+ "step": 1890
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.5537867007690111e-07,
+ "loss": 0.3027,
+ "step": 1891
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.5254800081630828e-07,
+ "loss": 0.2612,
+ "step": 1892
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.4974315535058016e-07,
+ "loss": 0.3325,
+ "step": 1893
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.469641410350964e-07,
+ "loss": 0.2978,
+ "step": 1894
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.4421096515749855e-07,
+ "loss": 0.284,
+ "step": 1895
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.4148363493766803e-07,
+ "loss": 0.3069,
+ "step": 1896
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.3878215752771264e-07,
+ "loss": 0.3225,
+ "step": 1897
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.361065400119399e-07,
+ "loss": 0.3019,
+ "step": 1898
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.3345678940684615e-07,
+ "loss": 0.3103,
+ "step": 1899
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.30832912661093e-07,
+ "loss": 0.2691,
+ "step": 1900
+ },
+ {
+ "epoch": 0.95,
+ "eval_code_gate_load": [
+ 205.55,
+ 171.4,
+ 178.65,
+ 153.4,
+ 178.2,
+ 188.1,
+ 195.95,
+ 168.75
+ ],
+ "eval_code_loss": 0.24282225966453552,
+ "eval_code_runtime": 1.7813,
+ "eval_code_samples_per_second": 561.397,
+ "eval_code_steps_per_second": 35.368,
+ "step": 1900
+ },
+ {
+ "epoch": 0.95,
+ "eval_orca_gate_load": [
+ 497.35,
+ 342.7,
+ 406.6,
+ 396.1,
+ 351.0,
+ 421.1,
+ 369.95,
+ 357.5
+ ],
+ "eval_orca_loss": 0.34912109375,
+ "eval_orca_runtime": 2.0183,
+ "eval_orca_samples_per_second": 495.476,
+ "eval_orca_steps_per_second": 31.215,
+ "step": 1900
+ },
+ {
+ "epoch": 0.95,
+ "eval_math_gate_load": [
+ 314.85,
+ 216.8,
+ 238.95,
+ 220.8,
+ 253.1,
+ 261.85,
+ 276.2,
+ 250.15
+ ],
+ "eval_math_loss": 0.24270018935203552,
+ "eval_math_runtime": 1.8503,
+ "eval_math_samples_per_second": 540.439,
+ "eval_math_steps_per_second": 34.048,
+ "step": 1900
+ },
+ {
+ "epoch": 0.95,
+ "eval_sharegpt_gate_load": [
+ 1522.5,
+ 1078.8,
+ 1304.65,
+ 1205.55,
+ 1138.55,
+ 1369.8,
+ 1181.35,
+ 1110.4
+ ],
+ "eval_sharegpt_loss": 0.4888671934604645,
+ "eval_sharegpt_runtime": 3.0127,
+ "eval_sharegpt_samples_per_second": 331.925,
+ "eval_sharegpt_steps_per_second": 20.911,
+ "step": 1900
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.2823491665549193e-07,
+ "loss": 0.3058,
+ "step": 1901
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.2566280820298427e-07,
+ "loss": 0.3011,
+ "step": 1902
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.231165940486234e-07,
+ "loss": 0.2873,
+ "step": 1903
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.2059628086956044e-07,
+ "loss": 0.3044,
+ "step": 1904
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.1810187527502182e-07,
+ "loss": 0.2932,
+ "step": 1905
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.1563338380629618e-07,
+ "loss": 0.3145,
+ "step": 1906
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.1319081293671541e-07,
+ "loss": 0.3078,
+ "step": 1907
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.1077416907163573e-07,
+ "loss": 0.2436,
+ "step": 1908
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.0838345854842447e-07,
+ "loss": 0.2853,
+ "step": 1909
+ },
+ {
+ "epoch": 0.95,
+ "learning_rate": 1.0601868763643997e-07,
+ "loss": 0.3035,
+ "step": 1910
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 1.0367986253701945e-07,
+ "loss": 0.3109,
+ "step": 1911
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 1.0136698938346012e-07,
+ "loss": 0.2506,
+ "step": 1912
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 9.90800742410003e-08,
+ "loss": 0.3005,
+ "step": 1913
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 9.68191231068083e-08,
+ "loss": 0.284,
+ "step": 1914
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 9.45841419099669e-08,
+ "loss": 0.2738,
+ "step": 1915
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 9.237513651145224e-08,
+ "loss": 0.3183,
+ "step": 1916
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 9.019211270412275e-08,
+ "loss": 0.3705,
+ "step": 1917
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 8.80350762127058e-08,
+ "loss": 0.2912,
+ "step": 1918
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 8.590403269377656e-08,
+ "loss": 0.334,
+ "step": 1919
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 8.379898773574924e-08,
+ "loss": 0.3033,
+ "step": 1920
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 8.171994685885698e-08,
+ "loss": 0.2956,
+ "step": 1921
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 7.966691551514527e-08,
+ "loss": 0.3074,
+ "step": 1922
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 7.763989908844749e-08,
+ "loss": 0.3667,
+ "step": 1923
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 7.563890289437825e-08,
+ "loss": 0.328,
+ "step": 1924
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 7.366393218031564e-08,
+ "loss": 0.317,
+ "step": 1925
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 7.171499212539124e-08,
+ "loss": 0.263,
+ "step": 1926
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 6.979208784047454e-08,
+ "loss": 0.2675,
+ "step": 1927
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 6.78952243681541e-08,
+ "loss": 0.3353,
+ "step": 1928
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 6.602440668273758e-08,
+ "loss": 0.2805,
+ "step": 1929
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 6.417963969022389e-08,
+ "loss": 0.2993,
+ "step": 1930
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 6.236092822829887e-08,
+ "loss": 0.2835,
+ "step": 1931
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 6.056827706632185e-08,
+ "loss": 0.2756,
+ "step": 1932
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 5.880169090531351e-08,
+ "loss": 0.2952,
+ "step": 1933
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 5.7061174377937015e-08,
+ "loss": 0.3181,
+ "step": 1934
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 5.534673204849572e-08,
+ "loss": 0.2758,
+ "step": 1935
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 5.365836841291439e-08,
+ "loss": 0.3501,
+ "step": 1936
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 5.199608789873134e-08,
+ "loss": 0.285,
+ "step": 1937
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 5.035989486508075e-08,
+ "loss": 0.3217,
+ "step": 1938
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 4.874979360268928e-08,
+ "loss": 0.3108,
+ "step": 1939
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 4.716578833386054e-08,
+ "loss": 0.2918,
+ "step": 1940
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 4.56078832124629e-08,
+ "loss": 0.2769,
+ "step": 1941
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 4.4076082323920576e-08,
+ "loss": 0.2785,
+ "step": 1942
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 4.257038968520366e-08,
+ "loss": 0.3826,
+ "step": 1943
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 4.109080924481479e-08,
+ "loss": 0.3202,
+ "step": 1944
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 3.963734488278248e-08,
+ "loss": 0.3273,
+ "step": 1945
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 3.82100004106456e-08,
+ "loss": 0.2841,
+ "step": 1946
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 3.680877957145112e-08,
+ "loss": 0.2521,
+ "step": 1947
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 3.543368603973529e-08,
+ "loss": 0.2639,
+ "step": 1948
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 3.408472342152136e-08,
+ "loss": 0.2813,
+ "step": 1949
+ },
+ {
+ "epoch": 0.97,
+ "learning_rate": 3.2761895254306285e-08,
+ "loss": 0.2856,
+ "step": 1950
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 3.1465205007052965e-08,
+ "loss": 0.2849,
+ "step": 1951
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 3.019465608018024e-08,
+ "loss": 0.313,
+ "step": 1952
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 2.8950251805553997e-08,
+ "loss": 0.2965,
+ "step": 1953
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 2.773199544648164e-08,
+ "loss": 0.2648,
+ "step": 1954
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 2.6539890197695428e-08,
+ "loss": 0.3016,
+ "step": 1955
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 2.537393918535358e-08,
+ "loss": 0.2957,
+ "step": 1956
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 2.423414546702807e-08,
+ "loss": 0.2827,
+ "step": 1957
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 2.312051203169352e-08,
+ "loss": 0.3606,
+ "step": 1958
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 2.2033041799723877e-08,
+ "loss": 0.3286,
+ "step": 1959
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 2.0971737622883515e-08,
+ "loss": 0.2846,
+ "step": 1960
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.9936602284318375e-08,
+ "loss": 0.3424,
+ "step": 1961
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.8927638498551502e-08,
+ "loss": 0.3063,
+ "step": 1962
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.7944848911470857e-08,
+ "loss": 0.2941,
+ "step": 1963
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.698823610032929e-08,
+ "loss": 0.3948,
+ "step": 1964
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.605780257373124e-08,
+ "loss": 0.3256,
+ "step": 1965
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.5153550771630498e-08,
+ "loss": 0.2806,
+ "step": 1966
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.4275483065321338e-08,
+ "loss": 0.2948,
+ "step": 1967
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.3423601757436289e-08,
+ "loss": 0.2675,
+ "step": 1968
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.2597909081931702e-08,
+ "loss": 0.2999,
+ "step": 1969
+ },
+ {
+ "epoch": 0.98,
+ "learning_rate": 1.179840720409331e-08,
+ "loss": 0.2986,
+ "step": 1970
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 1.102509822051845e-08,
+ "loss": 0.3122,
+ "step": 1971
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 1.0277984159122734e-08,
+ "loss": 0.3172,
+ "step": 1972
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 9.557066979123398e-09,
+ "loss": 0.315,
+ "step": 1973
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 8.862348571043733e-09,
+ "loss": 0.3141,
+ "step": 1974
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 8.193830756699773e-09,
+ "loss": 0.3203,
+ "step": 1975
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 7.551515289203615e-09,
+ "loss": 0.3238,
+ "step": 1976
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 6.935403852950107e-09,
+ "loss": 0.2921,
+ "step": 1977
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 6.345498063622391e-09,
+ "loss": 0.3407,
+ "step": 1978
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 5.781799468177473e-09,
+ "loss": 0.2725,
+ "step": 1979
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 5.2443095448506674e-09,
+ "loss": 0.3079,
+ "step": 1980
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 4.733029703146708e-09,
+ "loss": 0.3236,
+ "step": 1981
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 4.247961283835311e-09,
+ "loss": 0.3236,
+ "step": 1982
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 3.789105558954509e-09,
+ "loss": 0.3095,
+ "step": 1983
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 3.3564637317984318e-09,
+ "loss": 0.3494,
+ "step": 1984
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 2.9500369369195313e-09,
+ "loss": 0.3102,
+ "step": 1985
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 2.5698262401263607e-09,
+ "loss": 0.2826,
+ "step": 1986
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 2.215832638474691e-09,
+ "loss": 0.2811,
+ "step": 1987
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 1.888057060274173e-09,
+ "loss": 0.2664,
+ "step": 1988
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 1.5865003650761268e-09,
+ "loss": 0.3725,
+ "step": 1989
+ },
+ {
+ "epoch": 0.99,
+ "learning_rate": 1.3111633436779792e-09,
+ "loss": 0.3254,
+ "step": 1990
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 1.062046718121046e-09,
+ "loss": 0.3177,
+ "step": 1991
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 8.391511416816489e-10,
+ "loss": 0.2967,
+ "step": 1992
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 6.424771988788881e-10,
+ "loss": 0.3657,
+ "step": 1993
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 4.720254054679796e-10,
+ "loss": 0.2909,
+ "step": 1994
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 3.277962084369257e-10,
+ "loss": 0.3087,
+ "step": 1995
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 2.0978998601206558e-10,
+ "loss": 0.3516,
+ "step": 1996
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 1.1800704765030367e-10,
+ "loss": 0.3215,
+ "step": 1997
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 5.244763404133046e-11,
+ "loss": 0.3044,
+ "step": 1998
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 1.311191710651194e-11,
+ "loss": 0.3368,
+ "step": 1999
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 0.0,
+ "loss": 0.2367,
+ "step": 2000
+ },
+ {
+ "epoch": 1.0,
+ "eval_code_gate_load": [
+ 205.2,
+ 169.7,
+ 178.05,
+ 154.0,
+ 178.95,
+ 188.05,
+ 195.85,
+ 170.2
+ ],
+ "eval_code_loss": 0.2437744140625,
+ "eval_code_runtime": 1.7981,
+ "eval_code_samples_per_second": 556.137,
+ "eval_code_steps_per_second": 35.037,
+ "step": 2000
+ },
+ {
+ "epoch": 1.0,
+ "eval_orca_gate_load": [
+ 497.05,
+ 341.55,
+ 406.6,
+ 396.8,
+ 351.5,
+ 422.2,
+ 370.25,
+ 356.35
+ ],
+ "eval_orca_loss": 0.34916990995407104,
+ "eval_orca_runtime": 2.0032,
+ "eval_orca_samples_per_second": 499.21,
+ "eval_orca_steps_per_second": 31.45,
+ "step": 2000
+ },
+ {
+ "epoch": 1.0,
+ "eval_math_gate_load": [
+ 314.55,
+ 216.95,
+ 239.45,
+ 221.2,
+ 253.25,
+ 261.05,
+ 276.5,
+ 249.75
+ ],
+ "eval_math_loss": 0.24287109076976776,
+ "eval_math_runtime": 1.8382,
+ "eval_math_samples_per_second": 544.019,
+ "eval_math_steps_per_second": 34.273,
+ "step": 2000
+ },
+ {
+ "epoch": 1.0,
+ "eval_sharegpt_gate_load": [
+ 1524.05,
+ 1079.85,
+ 1307.8,
+ 1203.45,
+ 1136.45,
+ 1369.95,
+ 1180.15,
+ 1109.9
+ ],
+ "eval_sharegpt_loss": 0.4881835877895355,
+ "eval_sharegpt_runtime": 2.9976,
+ "eval_sharegpt_samples_per_second": 333.601,
+ "eval_sharegpt_steps_per_second": 21.017,
+ "step": 2000
+ },
+ {
+ "epoch": 1.0,
+ "step": 2000,
+ "total_flos": 2.079150933069005e+19,
+ "train_loss": 0.5031886091232299,
+ "train_runtime": 25719.4214,
+ "train_samples_per_second": 9.954,
+ "train_steps_per_second": 0.078
+ }
+ ],
+ "logging_steps": 1.0,
+ "max_steps": 2000,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 9223372036854775807,
+ "save_steps": 9999999999999,
+ "total_flos": 2.079150933069005e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}