JW17 committed on Dec 18, 2024

Commit

f10c1bf

verified ·

1 Parent(s): 10c981c

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

checkpoint-1575/config.json +33 -0
checkpoint-1575/generation_config.json +6 -0
checkpoint-1575/latest +1 -0
checkpoint-1575/special_tokens_map.json +16 -0
checkpoint-1575/tokenizer.json +0 -0
checkpoint-1575/tokenizer_config.json +239 -0
checkpoint-1575/trainer_state.json +0 -0
checkpoint-1575/zero_to_fp32.py +674 -0
checkpoint-20450/config.json +33 -0
checkpoint-20450/generation_config.json +6 -0
checkpoint-20450/latest +1 -0
checkpoint-20450/special_tokens_map.json +16 -0
checkpoint-20450/tokenizer.json +0 -0
checkpoint-20450/tokenizer_config.json +239 -0
checkpoint-20450/trainer_state.json +0 -0
checkpoint-20450/zero_to_fp32.py +674 -0
checkpoint-20925/config.json +33 -0
checkpoint-20925/generation_config.json +6 -0
checkpoint-20925/latest +1 -0
checkpoint-20925/special_tokens_map.json +16 -0
checkpoint-20925/tokenizer.json +0 -0
checkpoint-20925/tokenizer_config.json +239 -0
checkpoint-20925/trainer_state.json +0 -0
checkpoint-20925/zero_to_fp32.py +674 -0
checkpoint-26025/config.json +33 -0
checkpoint-26025/generation_config.json +6 -0
checkpoint-26025/latest +1 -0
checkpoint-26025/special_tokens_map.json +16 -0
checkpoint-26025/tokenizer.json +0 -0
checkpoint-26025/tokenizer_config.json +239 -0
checkpoint-26025/trainer_state.json +0 -0
checkpoint-26025/zero_to_fp32.py +674 -0
checkpoint-45525/config.json +33 -0
checkpoint-45525/generation_config.json +6 -0
checkpoint-45525/special_tokens_map.json +16 -0
checkpoint-45525/tokenizer.json +0 -0
checkpoint-45525/tokenizer_config.json +239 -0
checkpoint-45525/trainer_state.json +0 -0
checkpoint-45525/zero_to_fp32.py +674 -0
checkpoint-475/config.json +33 -0
checkpoint-475/generation_config.json +6 -0
checkpoint-475/latest +1 -0
checkpoint-475/special_tokens_map.json +16 -0
checkpoint-475/tokenizer.json +0 -0
checkpoint-475/tokenizer_config.json +239 -0
checkpoint-475/trainer_state.json +3358 -0
checkpoint-475/zero_to_fp32.py +674 -0
checkpoint-49875/config.json +33 -0
checkpoint-49875/generation_config.json +6 -0
checkpoint-49875/latest +1 -0

checkpoint-1575/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "JW17/SmolLM-14m-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "flash_attn": true,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "is_llama_config": true,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "vocab_size": 50280
+}

checkpoint-1575/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.3"
+}

checkpoint-1575/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step1575

checkpoint-1575/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1575/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1575/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

checkpoint-1575/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1575/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,674 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import json
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+debug = 0  # set to a truthy value to enable the verbose `if debug:` prints used throughout this script
+# every checkpoint tensor is loaded onto the CPU (this is passed as map_location to torch.load)
+device = torch.device('cpu')
+def atoi(text):  # natural-sort helper: digit-only strings become ints, anything else is returned unchanged
+    return int(text) if text.isdigit() else text
+def natural_keys(text):  # sort key that splits text into digit / non-digit chunks so digit runs compare as ints
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]  # the capture group keeps the digit runs in the split result
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):  # list files in checkpoint_dir matching glob_pattern, in natural order
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)  # natural order: digit runs compare as ints
+    if len(ckpt_files) == 0:  # an empty match is treated as an error instead of returning []
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):  # naturally sorted "*_optim_states.pt" files; raises FileNotFoundError when none match
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):  # naturally sorted "*_model_states.pt" files; raises FileNotFoundError when none match
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):  # build one zero_model_state record per model-states file, in the order given
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)  # NOTE(review): torch.load unpickles the file - only run on trusted checkpoints
+        if BUFFER_NAMES not in state_dict:  # a missing BUFFER_NAMES key means this is not a model-states file
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []  # NOTE(review): only accumulated below; nothing in this function reads it afterwards
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)  # None when the checkpoint has no such key
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]  # mapping flattened into [key, value] pairs
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):  # returns (zero_stage, world_size, fp32_flat_groups) read from the optimizer-state files
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)  # NOTE(review): torch.load unpickles the file - only run on trusted checkpoints
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:  # only the first file is inspected for stage and partition count
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:  # one optimizer-state file is expected per partition
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]  # kept as stored, one entry per file
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):  # exclude_frozen_parameters is forwarded to the stage-specific merger
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:  # parse_optim_states has already raised for any stage that is neither <= 2 nor 3
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):  # copies the first model state's frozen params into state_dict, in place
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return  # nothing frozen was recorded, so state_dict is left untouched
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]  # the stored fragment is used as-is, without concatenation or reshaping
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):  # True when obj has an attribute named fn and that attribute is callable
+    attr = getattr(obj, fn, None)  # a missing attribute yields None, which callable() rejects
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):  # fills state_dict in place with the trainable params
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]  # group i's partition from every file, in file order
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)  # joined into one flat vector per param group
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):  # one shapes mapping per merged group vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()  # rebinds avail_numel to this group's element count
+        for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)  # shape may lack numel(); fall back to the product of its dims
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)  # slice this param out of the flat vector and reshape it
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):  # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):  # returns an OrderedDict: buffers, then frozen (optional), then trainable params
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers  # buffers are taken from the first model state only
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:  # pairs whose second name was not reconstructed are skipped
+            state_dict[pair[0]] = state_dict[pair[1]]  # the first name becomes an alias of the second name's tensor
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):  # returns (elements per partition, padding elements)
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0  # elements needed to reach the next multiple of world_size
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)  # per-partition share, rounded up
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):  # rebuilds each frozen param from all model states' fragments, in place
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return  # nothing frozen was recorded, so state_dict is left untouched
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)  # this param's fragment from every model state, in order
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)  # concatenate, keep only the first unpartitioned_numel elements, reshape
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)  # only used by the debug print below
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):  # fills state_dict in place with the trainable params
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size  # computed from the first flat group's size only
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0  # position inside each flat group; the same offset is applied to every group
+    total_numel = 0
+    total_params = 0
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)  # the second narrow drops the elements beyond unpartitioned_numel
+        offset += partitioned_numel
+    offset *= world_size  # scale the per-group offset so it is comparable with avail_numel
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):  # returns an OrderedDict: buffers, then frozen (optional), then trainable params
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers  # buffers are taken from the first model state only
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:  # pairs whose second name was not reconstructed are skipped
+            state_dict[pair[0]] = state_dict[pair[1]]  # the first name becomes an alias of the second name's tensor
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:  # no explicit tag: fall back to the one stored in the 'latest' file
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()  # surrounding whitespace, including a trailing newline, is removed
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)  # the tag names the sub-folder holding the checkpoint files
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into consolidated fp32 ``state_dict`` file(s) written to ``output_dir``.
+    The weights are saved either as one file or, when they exceed ``max_shard_size``, as several shards
+    plus a ``*.index.json`` file that maps every tensor name to the shard holding it. The ``.bin`` output
+    can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files; it is created if it does not exist
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` disables sharding
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError``: if ``safetensors`` / ``huggingface_hub`` is required by the chosen options but not installed
+    """
+    # Dependency pre-check: fail before the (potentially slow) conversion starts
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        state_dict_split = split_torch_state_dict_into_shards(state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # No sharding requested: mimic the two attributes of the splitter's result that are used below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model
+    # The output directory may not exist yet (e.g. ``python zero_to_fp32.py . output_dir/``); without
+    # this the first save below fails because the parent folder is missing
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, output_path)
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    Overwrite the weights of ``model`` with the fp32 weights recovered from a ZeRO checkpoint:
+    1. Reconstruct a single fp32 consolidated ``state_dict`` from the ZeRO 2 or 3 checkpoint
+    2. Move the provided model to cpu
+    3. Load the ``state_dict`` into it (with ``strict=False``)
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model``: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    cpu_model.load_state_dict(fp32_weights, strict=False)
+    return cpu_model
+if __name__ == "__main__":
+    # Command-line entry point: parse the arguments and run the offline conversion.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    # NOTE: adjacent string literals are concatenated verbatim, so every continued line of a help
+    # text must end with an explicit space
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    # rebinds the module-level ``debug`` flag that the helper functions consult
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

checkpoint-20450/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "JW17/SmolLM-14m-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "flash_attn": true,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "is_llama_config": true,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "vocab_size": 50280
+}

checkpoint-20450/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.3"
+}

checkpoint-20450/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step20450

checkpoint-20450/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-20450/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-20450/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

checkpoint-20450/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-20450/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,674 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import json
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    # Per-rank metadata extracted from one ``*_model_states.pt`` file by ``parse_model_states``.
+    # The annotations are plain types (the original used ``dict()`` instances, which are not types).
+    # buffer name -> buffer tensor converted to fp32
+    buffers: dict
+    # sequence of {param name: shape} dicts; the merge helpers pair one dict with each param group
+    param_shapes: list
+    # list of [name, other_name] pairs; ``name`` is set to the tensor stored under ``other_name``
+    shared_params: list
+    # DeepSpeed version recorded in the checkpoint, ``None`` when the checkpoint has no such entry
+    # NOTE(review): annotated as int, but it is only ever printed - confirm the actual type
+    ds_version: int
+    # frozen param name -> full shape, or ``None`` when the checkpoint has no frozen params entry
+    frozen_param_shapes: dict
+    # frozen param name -> tensor fragment held by this rank, or ``None`` when absent
+    frozen_param_fragments: dict
+# Module-level debug switch: when truthy, the helper functions in this script print extra diagnostics.
+debug = 0
+# load to cpu: this device is passed as ``map_location`` to every ``torch.load`` call in this script
+device = torch.device('cpu')
+def atoi(text):
+    """Return ``int(text)`` when ``text`` consists only of digits, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+def natural_keys(text):
+    '''
+    Sort key for "human" ordering: ``alist.sort(key=natural_keys)`` orders embedded numbers by value.
+    The text is split into alternating non-digit / digit runs and every run is passed through ``atoi``.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    chunks = re.split(r'(\d+)', text)
+    return list(map(atoi, chunks))
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the rank-0 model states file inside ``checkpoint_dir``.
+    Args:
+        - ``checkpoint_dir``: folder that holds the ``*_model_states.pt`` files
+        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the expected file name
+    Raises:
+        - ``FileNotFoundError``: if the folder or the expected file doesn't exist
+        - ``ValueError``: if ``zero_stage`` is greater than 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch ``file`` stays unbound and the check below dies with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` that match ``glob_pattern``, in natural ("human") order.
+    Raises:
+        - ``FileNotFoundError``: if nothing matches
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    search_path = os.path.join(checkpoint_dir, glob_pattern)
+    matches = glob.glob(search_path)
+    matches.sort(key=natural_keys)
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return matches
+def get_optim_files(checkpoint_dir):
+    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
+    optim_glob = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_glob)
+def get_model_state_files(checkpoint_dir):
+    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
+    model_states_glob = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_states_glob)
+def parse_model_states(files):
+    """
+    Load every model states file and keep only the metadata needed to rebuild the fp32 weights.
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files
+    Returns:
+        - list with one ``zero_model_state`` per file, in the order of ``files``
+    Raises:
+        - ``ValueError``: if a file has no buffer names entry, i.e. it is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(security): ``torch.load`` unpickles the file - only run this on checkpoints you trust
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # frozen parameters are optional: both entries are ``None`` when the checkpoint has none
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if debug and frozen_param_shapes is not None:
+            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer states files and pull out the flattened fp32 groups of every rank.
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank
+        - ``ds_checkpoint_dir``: the folder the files come from (only used in the error message)
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` has one entry per file -
+          for stage <= 2 the stored groups as they are, for stage 3 all groups concatenated into one tensor
+    Raises:
+        - ``ValueError``: if the first file is not a zero checkpoint, if the number of files differs
+          from the recorded partition count, or if the zero stage is unknown
+    """
+    total_files = len(files)
+    state_dicts = []
+    for optim_file in files:
+        loaded = torch.load(optim_file, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        loaded["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(loaded)
+    rank0_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in rank0_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = rank0_optim_state[ZERO_STAGE]
+    world_size = rank0_optim_state[PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Rebuild the fp32 ``state_dict`` from the files of one deepspeed checkpoint folder.
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: passed on to the stage specific reconstruction helper
+    Returns:
+        - the ``state_dict`` built by the ZeRO-2 helper (stage <= 2) or the ZeRO-3 helper (stage 3)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    # pick the stage specific reconstruction routine
+    if zero_stage <= 2:
+        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters recorded in the first model state into ``state_dict`` (in place).
+    Each frozen param fragment is stored as is under its name. Nothing happens when the first
+    model state has no frozen param shapes.
+    """
+    rank0_state = zero_model_states[0]
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+    if debug:
+        num_elem = sum(shape.numel() for shape in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(shape.numel() for shape in frozen_param_shapes.values())
+        avail_numel = sum([fragment.numel() for fragment in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    """Tell whether ``obj`` has an attribute named ``fn`` that can be called."""
+    return callable(getattr(obj, fn, None))
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict`` (in place).
+    For every param group the per-rank flat partitions are concatenated into one vector, which is then
+    cut into the individual parameters following the order and shapes recorded in ``param_shapes``.
+    Args:
+        - ``state_dict``: dict that receives ``name -> tensor`` entries
+        - ``world_size``: number of ranks; used for the debug dump and for the alignment check
+        - ``fp32_flat_groups``: indexed as ``[rank][param group]``, the flat fp32 partition of each group
+        - ``zero_model_states``: parsed model states; only the ``param_shapes`` of the first one are used
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed element count of a group differs from the available one
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather the i-th group's partition from every rank and join them into one flat vector
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    # one ``shapes`` dict per param group, paired with that group's merged flat vector
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # a shape without a ``numel`` method is treated as a plain sequence of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # slice this parameter's elements out of the flat vector and give them their shape back
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        # rounds ``x`` up to the next multiple of ``align_to``
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the fp32 ``state_dict`` of a ZeRO stage <= 2 checkpoint.
+    The result is filled in this order: buffers of the first model state, frozen params (unless
+    ``exclude_frozen_parameters`` is set), trainable params, and finally the shared params, which
+    are pointed at the tensor they share whenever that tensor is present.
+    Returns:
+        - ``OrderedDict`` mapping names to tensors
+    """
+    rank0_state = zero_model_states[0]
+    state_dict = OrderedDict()
+    # buffers
+    state_dict.update(rank0_state.buffers)
+    if debug:
+        print(f"added {len(rank0_state.buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in rank0_state.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Compute how a parameter of ``unpartitioned_numel`` elements is split across ``world_size`` ranks.
+    Returns:
+        - ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and the
+          number of elements missing to make ``unpartitioned_numel`` a multiple of ``world_size``
+    """
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    per_rank_numel = math.ceil(unpartitioned_numel / world_size)
+    return per_rank_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO-3 checkpoint and store them in ``state_dict`` (in place).
+    For every frozen param the fragments of all model states are concatenated, cut down to the
+    param's element count and viewed with its full shape. Nothing happens when the first model
+    state has no frozen param shapes.
+    """
+    rank0_shapes = zero_model_states[0].frozen_param_shapes
+    if rank0_shapes is None or len(rank0_shapes) == 0:
+        return
+    if debug:
+        for rank in range(world_size):
+            rank_fragments = zero_model_states[rank].frozen_param_fragments
+            num_elem = sum(fragment.numel() for fragment in rank_fragments.values())
+            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(rank0_shapes)
+        wanted_numel = sum(shape.numel() for shape in rank0_shapes.values())
+        rank0_numel = sum([fragment.numel() for fragment in zero_model_states[0].frozen_param_fragments.values()])
+        avail_numel = rank0_numel * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(rank0_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # one fragment per model state, joined in rank order; elements past the param's size are dropped
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        joined = torch.cat(param_frags, 0)
+        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild every trainable parameter of a ZeRO-3 checkpoint and store it in ``state_dict``.
+    Args:
+        - ``state_dict``: mapping updated in place with ``name -> tensor`` entries
+        - ``world_size``: number of ranks; ``fp32_flat_groups`` is indexed ``0..world_size-1``
+        - ``fp32_flat_groups``: one flat tensor per rank (sliced along dim 0 below)
+        - ``zero_model_states``: per-rank model states; only rank 0's ``param_shapes`` is read
+    Raises:
+        - ``ValueError``: if the elements consumed across all ranks differ from the elements available
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # total elements across ranks, assuming every rank's flat tensor has rank 0's length
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # ``offset`` is the read position shared by all ranks' flat tensors
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        # take the same [offset, offset + partitioned_numel) slice from every rank, join the slices in
+        # rank order, then keep only the first ``unpartitioned_numel`` elements and restore the shape
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    # convert the per-rank read position into the element count consumed across all ranks
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        state_dict_split = split_torch_state_dict_into_shards(state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, output_path)
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

checkpoint-20925/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "JW17/SmolLM-14m-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "flash_attn": true,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "is_llama_config": true,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "vocab_size": 50280
+}

checkpoint-20925/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.3"
+}

checkpoint-20925/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step20925

checkpoint-20925/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-20925/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-20925/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

checkpoint-20925/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-20925/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,674 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import json
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# Module-level verbosity switch: helpers in this file print extra diagnostics when it is truthy.
+# NOTE(review): presumably overwritten by the -d/--debug command-line flag - confirm in the
+# script's entry point.
+debug = 0
+# load to cpu
+# (used as ``map_location`` for every ``torch.load`` call in this file)
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    """
+    Load every model states file and keep only the pieces needed for weight reconstruction.
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files
+    Returns:
+        - list of ``zero_model_state`` objects, one per entry of ``files`` and in the same order
+    Raises:
+        - ``ValueError``: a file has no BUFFER_NAMES entry, i.e. it is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(review): torch.load unpickles the file; only use it on checkpoints you trust
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        # NOTE(review): ``param_names`` is filled here and below but never read afterwards in this function
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        # stored as a list of [key, value] pairs
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        # both entries are optional and default to None when absent
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state files and extract the fp32 weight partitions of every rank.
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank
+        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)``; for stages up to 2 ``fp32_flat_groups`` holds
+          each rank's stored fp32 groups entry as is, for stage 3 each rank's groups are concatenated
+          into a single tensor
+    Raises:
+        - ``ValueError``: the first file is not a zero checkpoint, the number of files differs from the
+          recorded partition count, or the zero stage is unknown
+    """
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        # NOTE(review): torch.load unpickles the file; only use it on checkpoints you trust
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    # stage and partition count are read from the first file only
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        # keep the per-rank entry untouched
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    # Rebuild every trainable parameter of a ZeRO-3 checkpoint from the per-rank flat fp32 tensors
+    # and store it in ``state_dict`` (updated in place) under the parameter's name.
+    # ``fp32_flat_groups`` is indexed by rank; only ``zero_model_states[0].param_shapes`` is read
+    # from the model states. Raises ValueError when the number of consumed elements does not match
+    # the number of available ones.
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        # (same value as the ``avail_numel`` computed at the top of the function)
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # ``offset`` is a position inside ONE rank's flat tensor; the same offset is applied to every rank
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        # take this param's slice from every rank, join the slices in rank order, then trim the
+        # joined tensor down to the real element count before reshaping
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    # convert the per-rank offset into a total over all ranks so it is comparable with avail_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        state_dict_split = split_torch_state_dict_into_shards(state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, output_path)
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

checkpoint-26025/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "JW17/SmolLM-14m-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "flash_attn": true,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "is_llama_config": true,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "vocab_size": 50280
+}

checkpoint-26025/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.3"
+}

checkpoint-26025/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step26025

checkpoint-26025/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-26025/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-26025/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

checkpoint-26025/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-26025/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,674 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import json
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# module-wide verbosity switch: a truthy value enables the diagnostic prints guarded by ``if debug:``
+debug = 0
+# load to cpu
+# (passed as ``map_location`` to ``torch.load`` when the checkpoint files are read)
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+# Rebuilds the trainable parameters of a zero stage <= 2 checkpoint and stores them in ``state_dict``
+# (in place). ``fp32_flat_groups`` is indexed as [rank][param group]; the shapes come from the first
+# entry of ``zero_model_states``. Raises ValueError when the consumed element count of a param group
+# does not match the size of its merged vector after alignment.
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    # for every param group, concatenate that group's partition from each rank into one flat vector
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    # param_shapes holds one name->shape dict per param group, paired here with that group's merged vector
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shapes without a callable ``numel`` are treated as plain sequences of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # parameters sit back to back in the merged vector, in the iteration order of ``shapes``
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        # rounds x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+# Rebuilds the trainable parameters of a zero stage 3 checkpoint and stores them in ``state_dict``
+# (in place). ``fp32_flat_groups`` holds one flat tensor per rank; the shapes come from the first
+# entry of ``zero_model_states``. Raises ValueError when the consumed element count does not match
+# the first rank's flat tensor size times ``world_size``.
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # offset is the read position inside each rank's flat tensor, the same for every rank
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        # take this param's slice from every rank, join the slices in rank order, then cut the result
+        # back to unpartitioned_numel elements before reshaping
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    # scale the per-rank position up so it is comparable with avail_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        state_dict_split = split_torch_state_dict_into_shards(state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, output_path)
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

checkpoint-45525/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "JW17/SmolLM-14m-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "flash_attn": true,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "is_llama_config": true,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "vocab_size": 50280
+}

checkpoint-45525/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.3"
+}

checkpoint-45525/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-45525/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-45525/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

checkpoint-45525/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-45525/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,674 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import json
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    """Fields extracted from one ``*_model_states.pt`` file by ``parse_model_states``."""
+    # name -> tensor for the entries of the checkpoint's "module" dict listed as buffers (cast with .float())
+    buffers: dict
+    # value stored under PARAM_SHAPES; the merge helpers iterate it as a sequence of name -> shape mappings
+    param_shapes: list
+    # list of [key, value] pairs built from the checkpoint's "shared_params" mapping
+    shared_params: list
+    # NOTE(review): read with state_dict.get(DS_VERSION, None), so this may be None; the annotation is not enforced
+    ds_version: int
+    # name -> shape for frozen parameters, or None when the checkpoint has no such entry
+    frozen_param_shapes: dict
+    # name -> tensor fragment for frozen parameters, or None when the checkpoint has no such entry
+    frozen_param_fragments: dict
+# verbosity switch read by the helpers below; the script entry point overwrites it from --debug
+debug = 0
+# load to cpu (passed as map_location to every torch.load call in this file)
+device = torch.device('cpu')
+def atoi(text):
+    """Return ``int(text)`` when ``text.isdigit()`` is true, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+def natural_keys(text):
+    '''
+    Build a sort key from ``text`` by splitting it on runs of digits and passing each piece through ``atoi``.
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    key = []
+    # the capturing group keeps the digit runs themselves in the split result
+    for piece in re.split(r'(\d+)', text):
+        key.append(atoi(piece))
+    return key
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the rank-0 model states file inside ``checkpoint_dir``.
+    Args:
+        - ``checkpoint_dir``: directory expected to contain the model states file
+        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the expected file name
+    Raises:
+        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
+        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch `file` stays unbound and the check below dies with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, ordered with ``natural_keys``.
+    Raises ``FileNotFoundError`` when nothing matches.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    if not matches:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    matches.sort(key=natural_keys)
+    return matches
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+def parse_model_states(files):
+    """
+    Load every model states file and reduce it to a ``zero_model_state`` record.
+    Args:
+        - ``files``: paths of ``*_model_states.pt`` files
+    Returns:
+        - list with one ``zero_model_state`` per entry of ``files``, in the same order
+    Raises:
+        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(review): torch.load unpickles the file, so only run this on trusted checkpoints
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        # (param_names is only accumulated here; nothing in this function reads it afterwards)
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        # the remaining entries are optional, so they default to None when absent
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state files and pull out the fp32 flat groups of every rank.
+    Args:
+        - ``files``: paths of ``*_optim_states.pt`` files, one per rank
+        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per file
+    Raises:
+        - ``ValueError``: the first file has no ``ZERO_STAGE`` entry, the number of files differs from the
+          recorded partition count, or the stage is neither ``<= 2`` nor ``3``
+    """
+    total_files = len(files)
+    state_dicts = []
+    for path in files:
+        loaded = torch.load(path, map_location=device)
+        # immediately discard the two potentially huge optimizer states as we only care for fp32 master
+        # weights; pop with a default also handles the case where another helper script already removed them
+        loaded["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(loaded)
+    first_optim = state_dicts[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in first_optim:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = first_optim[ZERO_STAGE]
+    world_size = first_optim[PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: when true, frozen parameters are left out of the result
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    # pick the stage-specific merger; both take the same arguments
+    if zero_stage <= 2:
+        merge = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        merge = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return merge(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameter tensors recorded by the first model state into ``state_dict``.
+    Returns without touching ``state_dict`` when that model state has no frozen parameter shapes.
+    """
+    rank0 = zero_model_states[0]
+    if rank0.frozen_param_shapes is None or len(rank0.frozen_param_shapes) == 0:
+        return
+    shapes = rank0.frozen_param_shapes
+    fragments = rank0.frozen_param_fragments
+    if debug:
+        wanted_numel = sum(s.numel() for s in shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {wanted_numel}')
+        wanted_params = len(shapes)
+        avail_numel = sum(p.numel() for p in fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_params += 1
+        total_numel += unpartitioned_numel
+        # the fragment is stored as-is, without being reshaped to `shape`
+        state_dict[name] = fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    return callable(getattr(obj, fn, None))
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters (ZeRO stage <= 2 path) and store them in ``state_dict``.
+    For each param group the per-rank partitions are concatenated into one flat tensor, then each
+    parameter is carved out of it in ``param_shapes`` order and viewed with its recorded shape.
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries
+        - ``world_size``: number of ranks; also determines the alignment used by the sanity check
+        - ``fp32_flat_groups``: per rank, a sequence holding one flat tensor per param group
+        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is used
+    Raises:
+        - ``ValueError``: after alignment, the consumed element count differs from the available one
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather group i from every rank, in rank order, and join into one flat vector
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        # from here on avail_numel is per group, replacing the all-groups total computed above
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shapes exposing numel() are asked directly; anything else is treated as an iterable of dims
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the consolidated state_dict for a ZeRO stage <= 2 checkpoint: buffers first, then frozen
+    parameters (unless excluded), then trainable parameters, then aliases for shared parameters.
+    """
+    rank0 = zero_model_states[0]
+    state_dict = OrderedDict()
+    # buffers
+    state_dict.update(rank0.buffers)
+    if debug:
+        print(f"added {len(rank0.buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in rank0.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for a parameter split across ``world_size`` ranks:
+    the per-rank element count (rounded up) and the number of elements needed to reach the next
+    multiple of ``world_size``.
+    """
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    per_rank_numel = math.ceil(unpartitioned_numel / world_size)
+    return per_rank_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters (ZeRO stage 3 path) and store them in ``state_dict``.
+    Each parameter's fragments are taken from every model state in order, concatenated, trimmed to
+    the parameter's element count and viewed with its recorded shape.
+    Returns without touching ``state_dict`` when the first model state has no frozen parameter shapes.
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries
+        - ``world_size``: number of ranks; used for the debug output and the partition info
+        - ``zero_model_states``: parsed model states, one per rank
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        # available count is extrapolated from the first model state's fragments times world_size
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        # narrow() drops whatever trails the first unpartitioned_numel elements (padding, if any)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        # computed after the merge; only the debug print below reads these two values
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters (ZeRO stage 3 path) and store them in ``state_dict``.
+    Walks the parameters in ``param_shapes`` order with a running ``offset`` shared by all ranks: for
+    each parameter the same ``[offset, offset + partitioned_numel)`` slice is taken from every rank's
+    flat tensor, the slices are concatenated in rank order, trimmed to the parameter's element count
+    and viewed with its recorded shape.
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries
+        - ``world_size``: number of ranks
+        - ``fp32_flat_groups``: one flat tensor per rank
+        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is used
+    Raises:
+        - ``ValueError``: ``offset * world_size`` differs from the element count of the first rank's
+          flat tensor times ``world_size``
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        # (same value as the avail_numel already computed at the top of the function)
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        # advance by the per-rank slice length, not by the full parameter size
+        offset += partitioned_numel
+    # offset counted per-rank elements; scale it so it is comparable with avail_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the consolidated state_dict for a ZeRO stage 3 checkpoint: buffers first, then frozen
+    parameters (unless excluded), then trainable parameters, then aliases for shared parameters.
+    """
+    rank0 = zero_model_states[0]
+    state_dict = OrderedDict()
+    # buffers
+    state_dict.update(rank0.buffers)
+    if debug:
+        print(f"added {len(rank0.buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in rank0.shared_params:
+        alias, source = pair[0], pair[1]
+        if source in state_dict:
+            state_dict[alias] = state_dict[source]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g., ``global_step14``. If not provided, the tag is read from the ``latest`` file inside ``checkpoint_dir``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Raises:
+        - ``ValueError``: no ``tag`` was given and ``checkpoint_dir`` has no ``latest`` file
+        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` folder doesn't exist
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if os.path.isdir(ds_checkpoint_dir):
+        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files; created if it doesn't exist
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError``: ``safetensors`` / ``huggingface_hub`` is required by the chosen options but not installed
+    """
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        state_dict_split = split_torch_state_dict_into_shards(state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # mimic the two attributes of the shard-split result that the code below reads
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model
+    # neither torch.save nor save_file creates missing parent directories, so make sure the target exists
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, output_path)
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    2. Put the provided model to cpu
+    3. Load the state_dict into the provided model (non-strict)
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model``: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    # strict=False: key mismatches between the model and the checkpoint do not raise
+    model.load_state_dict(fp32_weights, strict=False)
+    return model
+# Command-line entry point: parse the arguments and run the checkpoint conversion.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    # adjacent string literals are joined verbatim, so every fragment below ends with an explicit space
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    # rebinds the module-level flag that the helper functions check
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

checkpoint-475/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "JW17/SmolLM-14m-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "flash_attn": true,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "is_llama_config": true,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "vocab_size": 50280
+}

checkpoint-475/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.3"
+}

checkpoint-475/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step475

checkpoint-475/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|padding|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-475/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-475/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}

checkpoint-475/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3358 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.00475,
+  "eval_steps": 500,
+  "global_step": 475,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1e-05,
+      "grad_norm": 1.7129806011276105,
+      "learning_rate": 3e-06,
+      "loss": 10.8348,
+      "step": 1
+    },
+    {
+      "epoch": 2e-05,
+      "grad_norm": 1.6872753303603527,
+      "learning_rate": 6e-06,
+      "loss": 10.8356,
+      "step": 2
+    },
+    {
+      "epoch": 3e-05,
+      "grad_norm": 1.6970020903903387,
+      "learning_rate": 9e-06,
+      "loss": 10.834,
+      "step": 3
+    },
+    {
+      "epoch": 4e-05,
+      "grad_norm": 1.690199421583159,
+      "learning_rate": 1.2e-05,
+      "loss": 10.8334,
+      "step": 4
+    },
+    {
+      "epoch": 5e-05,
+      "grad_norm": 1.6936208883930068,
+      "learning_rate": 1.5e-05,
+      "loss": 10.8294,
+      "step": 5
+    },
+    {
+      "epoch": 6e-05,
+      "grad_norm": 1.6935154610161474,
+      "learning_rate": 1.8e-05,
+      "loss": 10.8281,
+      "step": 6
+    },
+    {
+      "epoch": 7e-05,
+      "grad_norm": 1.6333694173725648,
+      "learning_rate": 2.1000000000000002e-05,
+      "loss": 10.8133,
+      "step": 7
+    },
+    {
+      "epoch": 8e-05,
+      "grad_norm": 1.4463755011186001,
+      "learning_rate": 2.4e-05,
+      "loss": 10.783,
+      "step": 8
+    },
+    {
+      "epoch": 9e-05,
+      "grad_norm": 1.3815123169360315,
+      "learning_rate": 2.7e-05,
+      "loss": 10.7779,
+      "step": 9
+    },
+    {
+      "epoch": 0.0001,
+      "grad_norm": 1.3507621465484316,
+      "learning_rate": 3e-05,
+      "loss": 10.7629,
+      "step": 10
+    },
+    {
+      "epoch": 0.00011,
+      "grad_norm": 1.257508561634155,
+      "learning_rate": 3.2999999999999996e-05,
+      "loss": 10.7454,
+      "step": 11
+    },
+    {
+      "epoch": 0.00012,
+      "grad_norm": 1.224298046820689,
+      "learning_rate": 3.6e-05,
+      "loss": 10.7321,
+      "step": 12
+    },
+    {
+      "epoch": 0.00013,
+      "grad_norm": 1.1609107458726389,
+      "learning_rate": 3.9e-05,
+      "loss": 10.7098,
+      "step": 13
+    },
+    {
+      "epoch": 0.00014,
+      "grad_norm": 1.1251765756585856,
+      "learning_rate": 4.2000000000000004e-05,
+      "loss": 10.6986,
+      "step": 14
+    },
+    {
+      "epoch": 0.00015,
+      "grad_norm": 1.1021031797679595,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 10.6882,
+      "step": 15
+    },
+    {
+      "epoch": 0.00016,
+      "grad_norm": 1.05231707077907,
+      "learning_rate": 4.8e-05,
+      "loss": 10.6681,
+      "step": 16
+    },
+    {
+      "epoch": 0.00017,
+      "grad_norm": 1.0082613504118885,
+      "learning_rate": 5.1000000000000006e-05,
+      "loss": 10.6513,
+      "step": 17
+    },
+    {
+      "epoch": 0.00018,
+      "grad_norm": 0.9840324393168475,
+      "learning_rate": 5.4e-05,
+      "loss": 10.6344,
+      "step": 18
+    },
+    {
+      "epoch": 0.00019,
+      "grad_norm": 0.953923239589578,
+      "learning_rate": 5.7e-05,
+      "loss": 10.6196,
+      "step": 19
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 0.9458057853193742,
+      "learning_rate": 6e-05,
+      "loss": 10.6069,
+      "step": 20
+    },
+    {
+      "epoch": 0.00021,
+      "grad_norm": 0.9177457999897578,
+      "learning_rate": 6.3e-05,
+      "loss": 10.5922,
+      "step": 21
+    },
+    {
+      "epoch": 0.00022,
+      "grad_norm": 0.9134874433162486,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 10.576,
+      "step": 22
+    },
+    {
+      "epoch": 0.00023,
+      "grad_norm": 0.908696989628468,
+      "learning_rate": 6.9e-05,
+      "loss": 10.5635,
+      "step": 23
+    },
+    {
+      "epoch": 0.00024,
+      "grad_norm": 0.9051143007426985,
+      "learning_rate": 7.2e-05,
+      "loss": 10.5499,
+      "step": 24
+    },
+    {
+      "epoch": 0.00025,
+      "grad_norm": 0.9082451576693834,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 10.5361,
+      "step": 25
+    },
+    {
+      "epoch": 0.00026,
+      "grad_norm": 0.9099344486243927,
+      "learning_rate": 7.8e-05,
+      "loss": 10.521,
+      "step": 26
+    },
+    {
+      "epoch": 0.00027,
+      "grad_norm": 0.9053293550746107,
+      "learning_rate": 8.1e-05,
+      "loss": 10.5103,
+      "step": 27
+    },
+    {
+      "epoch": 0.00028,
+      "grad_norm": 0.9002471153364864,
+      "learning_rate": 8.400000000000001e-05,
+      "loss": 10.4955,
+      "step": 28
+    },
+    {
+      "epoch": 0.00029,
+      "grad_norm": 0.9068699186733776,
+      "learning_rate": 8.7e-05,
+      "loss": 10.4811,
+      "step": 29
+    },
+    {
+      "epoch": 0.0003,
+      "grad_norm": 0.9095271030063902,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 10.4648,
+      "step": 30
+    },
+    {
+      "epoch": 0.00031,
+      "grad_norm": 0.9097010936405139,
+      "learning_rate": 9.3e-05,
+      "loss": 10.4503,
+      "step": 31
+    },
+    {
+      "epoch": 0.00032,
+      "grad_norm": 0.9047462396891427,
+      "learning_rate": 9.6e-05,
+      "loss": 10.4348,
+      "step": 32
+    },
+    {
+      "epoch": 0.00033,
+      "grad_norm": 0.9068703333942145,
+      "learning_rate": 9.900000000000001e-05,
+      "loss": 10.418,
+      "step": 33
+    },
+    {
+      "epoch": 0.00034,
+      "grad_norm": 0.9072965837486595,
+      "learning_rate": 0.00010200000000000001,
+      "loss": 10.4,
+      "step": 34
+    },
+    {
+      "epoch": 0.00035,
+      "grad_norm": 0.9110003633890357,
+      "learning_rate": 0.00010500000000000002,
+      "loss": 10.3835,
+      "step": 35
+    },
+    {
+      "epoch": 0.00036,
+      "grad_norm": 0.9049119959927198,
+      "learning_rate": 0.000108,
+      "loss": 10.3652,
+      "step": 36
+    },
+    {
+      "epoch": 0.00037,
+      "grad_norm": 0.8970709544624084,
+      "learning_rate": 0.000111,
+      "loss": 10.3479,
+      "step": 37
+    },
+    {
+      "epoch": 0.00038,
+      "grad_norm": 0.8959068278842482,
+      "learning_rate": 0.000114,
+      "loss": 10.3275,
+      "step": 38
+    },
+    {
+      "epoch": 0.00039,
+      "grad_norm": 0.9005947927478184,
+      "learning_rate": 0.000117,
+      "loss": 10.3069,
+      "step": 39
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 0.9014442598894896,
+      "learning_rate": 0.00012,
+      "loss": 10.2842,
+      "step": 40
+    },
+    {
+      "epoch": 0.00041,
+      "grad_norm": 0.8992939718171602,
+      "learning_rate": 0.000123,
+      "loss": 10.2657,
+      "step": 41
+    },
+    {
+      "epoch": 0.00042,
+      "grad_norm": 0.8994818536906172,
+      "learning_rate": 0.000126,
+      "loss": 10.2444,
+      "step": 42
+    },
+    {
+      "epoch": 0.00043,
+      "grad_norm": 0.9062946670458473,
+      "learning_rate": 0.000129,
+      "loss": 10.2208,
+      "step": 43
+    },
+    {
+      "epoch": 0.00044,
+      "grad_norm": 0.9072550424345267,
+      "learning_rate": 0.00013199999999999998,
+      "loss": 10.1985,
+      "step": 44
+    },
+    {
+      "epoch": 0.00045,
+      "grad_norm": 0.908308760029939,
+      "learning_rate": 0.000135,
+      "loss": 10.1758,
+      "step": 45
+    },
+    {
+      "epoch": 0.00046,
+      "grad_norm": 0.8994605508976834,
+      "learning_rate": 0.000138,
+      "loss": 10.1528,
+      "step": 46
+    },
+    {
+      "epoch": 0.00047,
+      "grad_norm": 0.904955141258698,
+      "learning_rate": 0.000141,
+      "loss": 10.1274,
+      "step": 47
+    },
+    {
+      "epoch": 0.00048,
+      "grad_norm": 0.9044693581157806,
+      "learning_rate": 0.000144,
+      "loss": 10.1031,
+      "step": 48
+    },
+    {
+      "epoch": 0.00049,
+      "grad_norm": 0.8992120995192336,
+      "learning_rate": 0.000147,
+      "loss": 10.0777,
+      "step": 49
+    },
+    {
+      "epoch": 0.0005,
+      "grad_norm": 0.905676588399281,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 10.0519,
+      "step": 50
+    },
+    {
+      "epoch": 0.00051,
+      "grad_norm": 0.9066841497261428,
+      "learning_rate": 0.000153,
+      "loss": 10.0251,
+      "step": 51
+    },
+    {
+      "epoch": 0.00052,
+      "grad_norm": 0.9046656683417261,
+      "learning_rate": 0.000156,
+      "loss": 9.9981,
+      "step": 52
+    },
+    {
+      "epoch": 0.00053,
+      "grad_norm": 0.8943714853313668,
+      "learning_rate": 0.000159,
+      "loss": 9.974,
+      "step": 53
+    },
+    {
+      "epoch": 0.00054,
+      "grad_norm": 0.9141658233846578,
+      "learning_rate": 0.000162,
+      "loss": 9.9419,
+      "step": 54
+    },
+    {
+      "epoch": 0.00055,
+      "grad_norm": 0.9035944774643171,
+      "learning_rate": 0.000165,
+      "loss": 9.9169,
+      "step": 55
+    },
+    {
+      "epoch": 0.00056,
+      "grad_norm": 0.895407870582166,
+      "learning_rate": 0.00016800000000000002,
+      "loss": 9.8872,
+      "step": 56
+    },
+    {
+      "epoch": 0.00057,
+      "grad_norm": 0.9021731997760362,
+      "learning_rate": 0.000171,
+      "loss": 9.8601,
+      "step": 57
+    },
+    {
+      "epoch": 0.00058,
+      "grad_norm": 0.8980871554912008,
+      "learning_rate": 0.000174,
+      "loss": 9.8343,
+      "step": 58
+    },
+    {
+      "epoch": 0.00059,
+      "grad_norm": 0.9073832283363998,
+      "learning_rate": 0.000177,
+      "loss": 9.8028,
+      "step": 59
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 0.8921071838486323,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 9.777,
+      "step": 60
+    },
+    {
+      "epoch": 0.00061,
+      "grad_norm": 0.8918001641348363,
+      "learning_rate": 0.000183,
+      "loss": 9.7484,
+      "step": 61
+    },
+    {
+      "epoch": 0.00062,
+      "grad_norm": 0.897401330332219,
+      "learning_rate": 0.000186,
+      "loss": 9.717,
+      "step": 62
+    },
+    {
+      "epoch": 0.00063,
+      "grad_norm": 0.8914516241190131,
+      "learning_rate": 0.000189,
+      "loss": 9.6894,
+      "step": 63
+    },
+    {
+      "epoch": 0.00064,
+      "grad_norm": 0.8896652156254973,
+      "learning_rate": 0.000192,
+      "loss": 9.6587,
+      "step": 64
+    },
+    {
+      "epoch": 0.00065,
+      "grad_norm": 0.8995447585153489,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 9.6261,
+      "step": 65
+    },
+    {
+      "epoch": 0.00066,
+      "grad_norm": 0.8896939041293862,
+      "learning_rate": 0.00019800000000000002,
+      "loss": 9.6026,
+      "step": 66
+    },
+    {
+      "epoch": 0.00067,
+      "grad_norm": 0.8935314234316469,
+      "learning_rate": 0.000201,
+      "loss": 9.5723,
+      "step": 67
+    },
+    {
+      "epoch": 0.00068,
+      "grad_norm": 0.8971584182008717,
+      "learning_rate": 0.00020400000000000003,
+      "loss": 9.5393,
+      "step": 68
+    },
+    {
+      "epoch": 0.00069,
+      "grad_norm": 0.8806455604370673,
+      "learning_rate": 0.00020700000000000002,
+      "loss": 9.5119,
+      "step": 69
+    },
+    {
+      "epoch": 0.0007,
+      "grad_norm": 0.892956094531968,
+      "learning_rate": 0.00021000000000000004,
+      "loss": 9.4751,
+      "step": 70
+    },
+    {
+      "epoch": 0.00071,
+      "grad_norm": 0.8848452972865632,
+      "learning_rate": 0.00021299999999999997,
+      "loss": 9.4495,
+      "step": 71
+    },
+    {
+      "epoch": 0.00072,
+      "grad_norm": 0.8831148731992822,
+      "learning_rate": 0.000216,
+      "loss": 9.4223,
+      "step": 72
+    },
+    {
+      "epoch": 0.00073,
+      "grad_norm": 0.887150899449638,
+      "learning_rate": 0.00021899999999999998,
+      "loss": 9.3879,
+      "step": 73
+    },
+    {
+      "epoch": 0.00074,
+      "grad_norm": 0.8878619769377328,
+      "learning_rate": 0.000222,
+      "loss": 9.3616,
+      "step": 74
+    },
+    {
+      "epoch": 0.00075,
+      "grad_norm": 0.8808154408936898,
+      "learning_rate": 0.000225,
+      "loss": 9.3275,
+      "step": 75
+    },
+    {
+      "epoch": 0.00076,
+      "grad_norm": 0.8908035269749474,
+      "learning_rate": 0.000228,
+      "loss": 9.3009,
+      "step": 76
+    },
+    {
+      "epoch": 0.00077,
+      "grad_norm": 0.884531047332737,
+      "learning_rate": 0.000231,
+      "loss": 9.2727,
+      "step": 77
+    },
+    {
+      "epoch": 0.00078,
+      "grad_norm": 0.8838664917591654,
+      "learning_rate": 0.000234,
+      "loss": 9.2422,
+      "step": 78
+    },
+    {
+      "epoch": 0.00079,
+      "grad_norm": 0.8858668201182466,
+      "learning_rate": 0.00023700000000000001,
+      "loss": 9.2056,
+      "step": 79
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.8856967305037482,
+      "learning_rate": 0.00024,
+      "loss": 9.1711,
+      "step": 80
+    },
+    {
+      "epoch": 0.00081,
+      "grad_norm": 0.8942846826675519,
+      "learning_rate": 0.00024300000000000002,
+      "loss": 9.1382,
+      "step": 81
+    },
+    {
+      "epoch": 0.00082,
+      "grad_norm": 0.897767651472895,
+      "learning_rate": 0.000246,
+      "loss": 9.1142,
+      "step": 82
+    },
+    {
+      "epoch": 0.00083,
+      "grad_norm": 0.8951752702012633,
+      "learning_rate": 0.00024900000000000004,
+      "loss": 9.0778,
+      "step": 83
+    },
+    {
+      "epoch": 0.00084,
+      "grad_norm": 0.8980395289622467,
+      "learning_rate": 0.000252,
+      "loss": 9.0469,
+      "step": 84
+    },
+    {
+      "epoch": 0.00085,
+      "grad_norm": 0.8894006576183595,
+      "learning_rate": 0.000255,
+      "loss": 9.0242,
+      "step": 85
+    },
+    {
+      "epoch": 0.00086,
+      "grad_norm": 0.8907945566480024,
+      "learning_rate": 0.000258,
+      "loss": 8.9886,
+      "step": 86
+    },
+    {
+      "epoch": 0.00087,
+      "grad_norm": 0.8869170795764568,
+      "learning_rate": 0.000261,
+      "loss": 8.9664,
+      "step": 87
+    },
+    {
+      "epoch": 0.00088,
+      "grad_norm": 0.8860541210154026,
+      "learning_rate": 0.00026399999999999997,
+      "loss": 8.9293,
+      "step": 88
+    },
+    {
+      "epoch": 0.00089,
+      "grad_norm": 0.8822605600899943,
+      "learning_rate": 0.000267,
+      "loss": 8.9037,
+      "step": 89
+    },
+    {
+      "epoch": 0.0009,
+      "grad_norm": 0.8817151929172502,
+      "learning_rate": 0.00027,
+      "loss": 8.8766,
+      "step": 90
+    },
+    {
+      "epoch": 0.00091,
+      "grad_norm": 0.877617615465877,
+      "learning_rate": 0.000273,
+      "loss": 8.8478,
+      "step": 91
+    },
+    {
+      "epoch": 0.00092,
+      "grad_norm": 0.8822716293479064,
+      "learning_rate": 0.000276,
+      "loss": 8.8156,
+      "step": 92
+    },
+    {
+      "epoch": 0.00093,
+      "grad_norm": 0.8823661552266111,
+      "learning_rate": 0.000279,
+      "loss": 8.7863,
+      "step": 93
+    },
+    {
+      "epoch": 0.00094,
+      "grad_norm": 0.8830384482321385,
+      "learning_rate": 0.000282,
+      "loss": 8.7609,
+      "step": 94
+    },
+    {
+      "epoch": 0.00095,
+      "grad_norm": 0.8735042737334501,
+      "learning_rate": 0.000285,
+      "loss": 8.7321,
+      "step": 95
+    },
+    {
+      "epoch": 0.00096,
+      "grad_norm": 0.8799214796836804,
+      "learning_rate": 0.000288,
+      "loss": 8.7028,
+      "step": 96
+    },
+    {
+      "epoch": 0.00097,
+      "grad_norm": 0.8704594748643596,
+      "learning_rate": 0.000291,
+      "loss": 8.6791,
+      "step": 97
+    },
+    {
+      "epoch": 0.00098,
+      "grad_norm": 0.8706415983834461,
+      "learning_rate": 0.000294,
+      "loss": 8.642,
+      "step": 98
+    },
+    {
+      "epoch": 0.00099,
+      "grad_norm": 0.8683426041650804,
+      "learning_rate": 0.000297,
+      "loss": 8.62,
+      "step": 99
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 0.8690594926543161,
+      "learning_rate": 0.00030000000000000003,
+      "loss": 8.5941,
+      "step": 100
+    },
+    {
+      "epoch": 0.00101,
+      "grad_norm": 0.8744725161423202,
+      "learning_rate": 0.00030300000000000005,
+      "loss": 8.5597,
+      "step": 101
+    },
+    {
+      "epoch": 0.00102,
+      "grad_norm": 0.8626538117869429,
+      "learning_rate": 0.000306,
+      "loss": 8.5407,
+      "step": 102
+    },
+    {
+      "epoch": 0.00103,
+      "grad_norm": 0.8630292491448714,
+      "learning_rate": 0.000309,
+      "loss": 8.5165,
+      "step": 103
+    },
+    {
+      "epoch": 0.00104,
+      "grad_norm": 0.8566580756350954,
+      "learning_rate": 0.000312,
+      "loss": 8.4948,
+      "step": 104
+    },
+    {
+      "epoch": 0.00105,
+      "grad_norm": 0.8588931967033124,
+      "learning_rate": 0.000315,
+      "loss": 8.4689,
+      "step": 105
+    },
+    {
+      "epoch": 0.00106,
+      "grad_norm": 0.8531867230327145,
+      "learning_rate": 0.000318,
+      "loss": 8.4397,
+      "step": 106
+    },
+    {
+      "epoch": 0.00107,
+      "grad_norm": 0.8474808010000593,
+      "learning_rate": 0.000321,
+      "loss": 8.4149,
+      "step": 107
+    },
+    {
+      "epoch": 0.00108,
+      "grad_norm": 0.858890949163445,
+      "learning_rate": 0.000324,
+      "loss": 8.3866,
+      "step": 108
+    },
+    {
+      "epoch": 0.00109,
+      "grad_norm": 0.862504115159085,
+      "learning_rate": 0.000327,
+      "loss": 8.3673,
+      "step": 109
+    },
+    {
+      "epoch": 0.0011,
+      "grad_norm": 0.8797254817902618,
+      "learning_rate": 0.00033,
+      "loss": 8.3453,
+      "step": 110
+    },
+    {
+      "epoch": 0.00111,
+      "grad_norm": 0.8938450574121157,
+      "learning_rate": 0.000333,
+      "loss": 8.3162,
+      "step": 111
+    },
+    {
+      "epoch": 0.00112,
+      "grad_norm": 0.8984693362159062,
+      "learning_rate": 0.00033600000000000004,
+      "loss": 8.2961,
+      "step": 112
+    },
+    {
+      "epoch": 0.00113,
+      "grad_norm": 0.8741969532880793,
+      "learning_rate": 0.000339,
+      "loss": 8.2543,
+      "step": 113
+    },
+    {
+      "epoch": 0.00114,
+      "grad_norm": 0.8263135137442741,
+      "learning_rate": 0.000342,
+      "loss": 8.2446,
+      "step": 114
+    },
+    {
+      "epoch": 0.00115,
+      "grad_norm": 0.8311105019716521,
+      "learning_rate": 0.00034500000000000004,
+      "loss": 8.2164,
+      "step": 115
+    },
+    {
+      "epoch": 0.00116,
+      "grad_norm": 0.8585271561560018,
+      "learning_rate": 0.000348,
+      "loss": 8.1938,
+      "step": 116
+    },
+    {
+      "epoch": 0.00117,
+      "grad_norm": 0.8687047969468357,
+      "learning_rate": 0.000351,
+      "loss": 8.1623,
+      "step": 117
+    },
+    {
+      "epoch": 0.00118,
+      "grad_norm": 0.8460500876754268,
+      "learning_rate": 0.000354,
+      "loss": 8.1456,
+      "step": 118
+    },
+    {
+      "epoch": 0.00119,
+      "grad_norm": 0.80734714043103,
+      "learning_rate": 0.000357,
+      "loss": 8.131,
+      "step": 119
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 0.7912470130916918,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 8.103,
+      "step": 120
+    },
+    {
+      "epoch": 0.00121,
+      "grad_norm": 0.811181199244069,
+      "learning_rate": 0.000363,
+      "loss": 8.0751,
+      "step": 121
+    },
+    {
+      "epoch": 0.00122,
+      "grad_norm": 0.8641427784894331,
+      "learning_rate": 0.000366,
+      "loss": 8.0581,
+      "step": 122
+    },
+    {
+      "epoch": 0.00123,
+      "grad_norm": 1.059706847686213,
+      "learning_rate": 0.000369,
+      "loss": 8.038,
+      "step": 123
+    },
+    {
+      "epoch": 0.00124,
+      "grad_norm": 1.1727027216994725,
+      "learning_rate": 0.000372,
+      "loss": 8.021,
+      "step": 124
+    },
+    {
+      "epoch": 0.00125,
+      "grad_norm": 0.8130002892684417,
+      "learning_rate": 0.000375,
+      "loss": 7.9874,
+      "step": 125
+    },
+    {
+      "epoch": 0.00126,
+      "grad_norm": 0.9195848585380069,
+      "learning_rate": 0.000378,
+      "loss": 7.9767,
+      "step": 126
+    },
+    {
+      "epoch": 0.00127,
+      "grad_norm": 1.0843433185909894,
+      "learning_rate": 0.000381,
+      "loss": 7.9572,
+      "step": 127
+    },
+    {
+      "epoch": 0.00128,
+      "grad_norm": 0.7822919696307823,
+      "learning_rate": 0.000384,
+      "loss": 7.9285,
+      "step": 128
+    },
+    {
+      "epoch": 0.00129,
+      "grad_norm": 0.8822428605457112,
+      "learning_rate": 0.00038700000000000003,
+      "loss": 7.9179,
+      "step": 129
+    },
+    {
+      "epoch": 0.0013,
+      "grad_norm": 0.844355538937723,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 7.8895,
+      "step": 130
+    },
+    {
+      "epoch": 0.00131,
+      "grad_norm": 0.7902535444057679,
+      "learning_rate": 0.000393,
+      "loss": 7.8528,
+      "step": 131
+    },
+    {
+      "epoch": 0.00132,
+      "grad_norm": 0.8777082094723105,
+      "learning_rate": 0.00039600000000000003,
+      "loss": 7.8441,
+      "step": 132
+    },
+    {
+      "epoch": 0.00133,
+      "grad_norm": 0.7973277807473979,
+      "learning_rate": 0.00039900000000000005,
+      "loss": 7.8195,
+      "step": 133
+    },
+    {
+      "epoch": 0.00134,
+      "grad_norm": 0.7889088832890946,
+      "learning_rate": 0.000402,
+      "loss": 7.8056,
+      "step": 134
+    },
+    {
+      "epoch": 0.00135,
+      "grad_norm": 0.7461125825498439,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 7.7735,
+      "step": 135
+    },
+    {
+      "epoch": 0.00136,
+      "grad_norm": 0.7727896835908762,
+      "learning_rate": 0.00040800000000000005,
+      "loss": 7.7579,
+      "step": 136
+    },
+    {
+      "epoch": 0.00137,
+      "grad_norm": 0.6932995987295251,
+      "learning_rate": 0.000411,
+      "loss": 7.7341,
+      "step": 137
+    },
+    {
+      "epoch": 0.00138,
+      "grad_norm": 0.758084762416224,
+      "learning_rate": 0.00041400000000000003,
+      "loss": 7.7117,
+      "step": 138
+    },
+    {
+      "epoch": 0.00139,
+      "grad_norm": 0.7171019453691133,
+      "learning_rate": 0.00041700000000000005,
+      "loss": 7.6963,
+      "step": 139
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 0.6814920611933867,
+      "learning_rate": 0.00042000000000000007,
+      "loss": 7.6775,
+      "step": 140
+    },
+    {
+      "epoch": 0.00141,
+      "grad_norm": 0.7091532995122851,
+      "learning_rate": 0.000423,
+      "loss": 7.6638,
+      "step": 141
+    },
+    {
+      "epoch": 0.00142,
+      "grad_norm": 0.6928279523561562,
+      "learning_rate": 0.00042599999999999995,
+      "loss": 7.6501,
+      "step": 142
+    },
+    {
+      "epoch": 0.00143,
+      "grad_norm": 0.6614572727332786,
+      "learning_rate": 0.00042899999999999997,
+      "loss": 7.6195,
+      "step": 143
+    },
+    {
+      "epoch": 0.00144,
+      "grad_norm": 0.6903462553659518,
+      "learning_rate": 0.000432,
+      "loss": 7.6015,
+      "step": 144
+    },
+    {
+      "epoch": 0.00145,
+      "grad_norm": 0.690019772183536,
+      "learning_rate": 0.000435,
+      "loss": 7.5953,
+      "step": 145
+    },
+    {
+      "epoch": 0.00146,
+      "grad_norm": 0.6908198257220046,
+      "learning_rate": 0.00043799999999999997,
+      "loss": 7.5557,
+      "step": 146
+    },
+    {
+      "epoch": 0.00147,
+      "grad_norm": 0.7009866965495668,
+      "learning_rate": 0.000441,
+      "loss": 7.5482,
+      "step": 147
+    },
+    {
+      "epoch": 0.00148,
+      "grad_norm": 0.6832764187147686,
+      "learning_rate": 0.000444,
+      "loss": 7.5366,
+      "step": 148
+    },
+    {
+      "epoch": 0.00149,
+      "grad_norm": 0.59797192318343,
+      "learning_rate": 0.00044699999999999997,
+      "loss": 7.5272,
+      "step": 149
+    },
+    {
+      "epoch": 0.0015,
+      "grad_norm": 0.6655702435683013,
+      "learning_rate": 0.00045,
+      "loss": 7.4963,
+      "step": 150
+    },
+    {
+      "epoch": 0.00151,
+      "grad_norm": 0.732396941583091,
+      "learning_rate": 0.000453,
+      "loss": 7.48,
+      "step": 151
+    },
+    {
+      "epoch": 0.00152,
+      "grad_norm": 0.5836278900992692,
+      "learning_rate": 0.000456,
+      "loss": 7.4694,
+      "step": 152
+    },
+    {
+      "epoch": 0.00153,
+      "grad_norm": 0.6777912087785298,
+      "learning_rate": 0.000459,
+      "loss": 7.4588,
+      "step": 153
+    },
+    {
+      "epoch": 0.00154,
+      "grad_norm": 0.727978180952039,
+      "learning_rate": 0.000462,
+      "loss": 7.442,
+      "step": 154
+    },
+    {
+      "epoch": 0.00155,
+      "grad_norm": 0.7368922682622268,
+      "learning_rate": 0.000465,
+      "loss": 7.4241,
+      "step": 155
+    },
+    {
+      "epoch": 0.00156,
+      "grad_norm": 0.8391325557731037,
+      "learning_rate": 0.000468,
+      "loss": 7.4013,
+      "step": 156
+    },
+    {
+      "epoch": 0.00157,
+      "grad_norm": 0.8289929528374833,
+      "learning_rate": 0.000471,
+      "loss": 7.3995,
+      "step": 157
+    },
+    {
+      "epoch": 0.00158,
+      "grad_norm": 0.5070337070851558,
+      "learning_rate": 0.00047400000000000003,
+      "loss": 7.3713,
+      "step": 158
+    },
+    {
+      "epoch": 0.00159,
+      "grad_norm": 0.783946493417518,
+      "learning_rate": 0.000477,
+      "loss": 7.3668,
+      "step": 159
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.6957053326984224,
+      "learning_rate": 0.00048,
+      "loss": 7.3475,
+      "step": 160
+    },
+    {
+      "epoch": 0.00161,
+      "grad_norm": 0.547833885334286,
+      "learning_rate": 0.00048300000000000003,
+      "loss": 7.3204,
+      "step": 161
+    },
+    {
+      "epoch": 0.00162,
+      "grad_norm": 0.8547649122041628,
+      "learning_rate": 0.00048600000000000005,
+      "loss": 7.3325,
+      "step": 162
+    },
+    {
+      "epoch": 0.00163,
+      "grad_norm": 0.8673949773728752,
+      "learning_rate": 0.0004890000000000001,
+      "loss": 7.316,
+      "step": 163
+    },
+    {
+      "epoch": 0.00164,
+      "grad_norm": 1.0619539506126108,
+      "learning_rate": 0.000492,
+      "loss": 7.3191,
+      "step": 164
+    },
+    {
+      "epoch": 0.00165,
+      "grad_norm": 0.6324744711420325,
+      "learning_rate": 0.000495,
+      "loss": 7.2864,
+      "step": 165
+    },
+    {
+      "epoch": 0.00166,
+      "grad_norm": 0.5662260261966024,
+      "learning_rate": 0.0004980000000000001,
+      "loss": 7.26,
+      "step": 166
+    },
+    {
+      "epoch": 0.00167,
+      "grad_norm": 0.7262900850309921,
+      "learning_rate": 0.000501,
+      "loss": 7.2554,
+      "step": 167
+    },
+    {
+      "epoch": 0.00168,
+      "grad_norm": 0.6121691436587496,
+      "learning_rate": 0.000504,
+      "loss": 7.2353,
+      "step": 168
+    },
+    {
+      "epoch": 0.00169,
+      "grad_norm": 0.5390794603769147,
+      "learning_rate": 0.0005070000000000001,
+      "loss": 7.2263,
+      "step": 169
+    },
+    {
+      "epoch": 0.0017,
+      "grad_norm": 0.5999036994585554,
+      "learning_rate": 0.00051,
+      "loss": 7.213,
+      "step": 170
+    },
+    {
+      "epoch": 0.00171,
+      "grad_norm": 0.4637320512013434,
+      "learning_rate": 0.000513,
+      "loss": 7.1933,
+      "step": 171
+    },
+    {
+      "epoch": 0.00172,
+      "grad_norm": 0.5250975302523401,
+      "learning_rate": 0.000516,
+      "loss": 7.1953,
+      "step": 172
+    },
+    {
+      "epoch": 0.00173,
+      "grad_norm": 0.40559164125903624,
+      "learning_rate": 0.0005189999999999999,
+      "loss": 7.1764,
+      "step": 173
+    },
+    {
+      "epoch": 0.00174,
+      "grad_norm": 0.4505921111310584,
+      "learning_rate": 0.000522,
+      "loss": 7.1953,
+      "step": 174
+    },
+    {
+      "epoch": 0.00175,
+      "grad_norm": 0.4234331150208657,
+      "learning_rate": 0.000525,
+      "loss": 7.1572,
+      "step": 175
+    },
+    {
+      "epoch": 0.00176,
+      "grad_norm": 0.3852967422981744,
+      "learning_rate": 0.0005279999999999999,
+      "loss": 7.1322,
+      "step": 176
+    },
+    {
+      "epoch": 0.00177,
+      "grad_norm": 0.3685443025565043,
+      "learning_rate": 0.000531,
+      "loss": 7.1378,
+      "step": 177
+    },
+    {
+      "epoch": 0.00178,
+      "grad_norm": 0.44280593644992733,
+      "learning_rate": 0.000534,
+      "loss": 7.1301,
+      "step": 178
+    },
+    {
+      "epoch": 0.00179,
+      "grad_norm": 0.3638226120256115,
+      "learning_rate": 0.000537,
+      "loss": 7.1191,
+      "step": 179
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 0.37841703582661185,
+      "learning_rate": 0.00054,
+      "loss": 7.0921,
+      "step": 180
+    },
+    {
+      "epoch": 0.00181,
+      "grad_norm": 0.3275750999054276,
+      "learning_rate": 0.000543,
+      "loss": 7.0801,
+      "step": 181
+    },
+    {
+      "epoch": 0.00182,
+      "grad_norm": 0.3469517461565544,
+      "learning_rate": 0.000546,
+      "loss": 7.0774,
+      "step": 182
+    },
+    {
+      "epoch": 0.00183,
+      "grad_norm": 0.3965623823212328,
+      "learning_rate": 0.000549,
+      "loss": 7.0674,
+      "step": 183
+    },
+    {
+      "epoch": 0.00184,
+      "grad_norm": 0.47244712960577356,
+      "learning_rate": 0.000552,
+      "loss": 7.0582,
+      "step": 184
+    },
+    {
+      "epoch": 0.00185,
+      "grad_norm": 0.7068086356604425,
+      "learning_rate": 0.000555,
+      "loss": 7.0369,
+      "step": 185
+    },
+    {
+      "epoch": 0.00186,
+      "grad_norm": 0.9665650200874053,
+      "learning_rate": 0.000558,
+      "loss": 7.0604,
+      "step": 186
+    },
+    {
+      "epoch": 0.00187,
+      "grad_norm": 1.1379887499575514,
+      "learning_rate": 0.000561,
+      "loss": 7.0366,
+      "step": 187
+    },
+    {
+      "epoch": 0.00188,
+      "grad_norm": 0.5005933831438132,
+      "learning_rate": 0.000564,
+      "loss": 7.0008,
+      "step": 188
+    },
+    {
+      "epoch": 0.00189,
+      "grad_norm": 0.4490325126563288,
+      "learning_rate": 0.000567,
+      "loss": 6.997,
+      "step": 189
+    },
+    {
+      "epoch": 0.0019,
+      "grad_norm": 0.6949112483193859,
+      "learning_rate": 0.00057,
+      "loss": 6.9846,
+      "step": 190
+    },
+    {
+      "epoch": 0.00191,
+      "grad_norm": 0.4887612962658467,
+      "learning_rate": 0.000573,
+      "loss": 6.9724,
+      "step": 191
+    },
+    {
+      "epoch": 0.00192,
+      "grad_norm": 0.5374763602633008,
+      "learning_rate": 0.000576,
+      "loss": 6.9655,
+      "step": 192
+    },
+    {
+      "epoch": 0.00193,
+      "grad_norm": 0.4491815623326969,
+      "learning_rate": 0.000579,
+      "loss": 6.9637,
+      "step": 193
+    },
+    {
+      "epoch": 0.00194,
+      "grad_norm": 0.4044031823800156,
+      "learning_rate": 0.000582,
+      "loss": 6.9565,
+      "step": 194
+    },
+    {
+      "epoch": 0.00195,
+      "grad_norm": 0.5115147380417242,
+      "learning_rate": 0.000585,
+      "loss": 6.9386,
+      "step": 195
+    },
+    {
+      "epoch": 0.00196,
+      "grad_norm": 0.45947827433809557,
+      "learning_rate": 0.000588,
+      "loss": 6.9258,
+      "step": 196
+    },
+    {
+      "epoch": 0.00197,
+      "grad_norm": 0.5289316721591154,
+      "learning_rate": 0.000591,
+      "loss": 6.9226,
+      "step": 197
+    },
+    {
+      "epoch": 0.00198,
+      "grad_norm": 0.4416511613975406,
+      "learning_rate": 0.000594,
+      "loss": 6.9132,
+      "step": 198
+    },
+    {
+      "epoch": 0.00199,
+      "grad_norm": 0.36314866008934127,
+      "learning_rate": 0.0005970000000000001,
+      "loss": 6.8916,
+      "step": 199
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 0.4299454881914127,
+      "learning_rate": 0.0006000000000000001,
+      "loss": 6.8932,
+      "step": 200
+    },
+    {
+      "epoch": 0.00201,
+      "grad_norm": 0.2786890859011363,
+      "learning_rate": 0.000603,
+      "loss": 6.8645,
+      "step": 201
+    },
+    {
+      "epoch": 0.00202,
+      "grad_norm": 0.4553990879307791,
+      "learning_rate": 0.0006060000000000001,
+      "loss": 6.855,
+      "step": 202
+    },
+    {
+      "epoch": 0.00203,
+      "grad_norm": 0.49491513980041124,
+      "learning_rate": 0.0006090000000000001,
+      "loss": 6.8592,
+      "step": 203
+    },
+    {
+      "epoch": 0.00204,
+      "grad_norm": 0.5750090076580947,
+      "learning_rate": 0.000612,
+      "loss": 6.8457,
+      "step": 204
+    },
+    {
+      "epoch": 0.00205,
+      "grad_norm": 0.6904749130436038,
+      "learning_rate": 0.000615,
+      "loss": 6.8381,
+      "step": 205
+    },
+    {
+      "epoch": 0.00206,
+      "grad_norm": 0.7582947936777445,
+      "learning_rate": 0.000618,
+      "loss": 6.83,
+      "step": 206
+    },
+    {
+      "epoch": 0.00207,
+      "grad_norm": 0.728748472942146,
+      "learning_rate": 0.000621,
+      "loss": 6.8214,
+      "step": 207
+    },
+    {
+      "epoch": 0.00208,
+      "grad_norm": 0.5163586812963157,
+      "learning_rate": 0.000624,
+      "loss": 6.8116,
+      "step": 208
+    },
+    {
+      "epoch": 0.00209,
+      "grad_norm": 0.5726761174567752,
+      "learning_rate": 0.000627,
+      "loss": 6.7933,
+      "step": 209
+    },
+    {
+      "epoch": 0.0021,
+      "grad_norm": 0.6890311463252623,
+      "learning_rate": 0.00063,
+      "loss": 6.7854,
+      "step": 210
+    },
+    {
+      "epoch": 0.00211,
+      "grad_norm": 0.9174002778722206,
+      "learning_rate": 0.000633,
+      "loss": 6.7849,
+      "step": 211
+    },
+    {
+      "epoch": 0.00212,
+      "grad_norm": 0.8086617968740898,
+      "learning_rate": 0.000636,
+      "loss": 6.7808,
+      "step": 212
+    },
+    {
+      "epoch": 0.00213,
+      "grad_norm": 0.6685717599500662,
+      "learning_rate": 0.000639,
+      "loss": 6.7542,
+      "step": 213
+    },
+    {
+      "epoch": 0.00214,
+      "grad_norm": 0.511917016650173,
+      "learning_rate": 0.000642,
+      "loss": 6.7483,
+      "step": 214
+    },
+    {
+      "epoch": 0.00215,
+      "grad_norm": 0.5132261185164837,
+      "learning_rate": 0.000645,
+      "loss": 6.7465,
+      "step": 215
+    },
+    {
+      "epoch": 0.00216,
+      "grad_norm": 0.3896647006337605,
+      "learning_rate": 0.000648,
+      "loss": 6.7354,
+      "step": 216
+    },
+    {
+      "epoch": 0.00217,
+      "grad_norm": 0.53153444147609,
+      "learning_rate": 0.000651,
+      "loss": 6.7114,
+      "step": 217
+    },
+    {
+      "epoch": 0.00218,
+      "grad_norm": 0.4560253950102483,
+      "learning_rate": 0.000654,
+      "loss": 6.7136,
+      "step": 218
+    },
+    {
+      "epoch": 0.00219,
+      "grad_norm": 0.38246603109839156,
+      "learning_rate": 0.000657,
+      "loss": 6.6847,
+      "step": 219
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 0.502249830770979,
+      "learning_rate": 0.00066,
+      "loss": 6.7061,
+      "step": 220
+    },
+    {
+      "epoch": 0.00221,
+      "grad_norm": 0.555840042257826,
+      "learning_rate": 0.0006630000000000001,
+      "loss": 6.6817,
+      "step": 221
+    },
+    {
+      "epoch": 0.00222,
+      "grad_norm": 0.7008290795132841,
+      "learning_rate": 0.000666,
+      "loss": 6.6751,
+      "step": 222
+    },
+    {
+      "epoch": 0.00223,
+      "grad_norm": 0.9665649898158697,
+      "learning_rate": 0.000669,
+      "loss": 6.6759,
+      "step": 223
+    },
+    {
+      "epoch": 0.00224,
+      "grad_norm": 1.0460190685952617,
+      "learning_rate": 0.0006720000000000001,
+      "loss": 6.6821,
+      "step": 224
+    },
+    {
+      "epoch": 0.00225,
+      "grad_norm": 0.9709238336565439,
+      "learning_rate": 0.000675,
+      "loss": 6.6643,
+      "step": 225
+    },
+    {
+      "epoch": 0.00226,
+      "grad_norm": 0.9675609159629996,
+      "learning_rate": 0.000678,
+      "loss": 6.6602,
+      "step": 226
+    },
+    {
+      "epoch": 0.00227,
+      "grad_norm": 0.6069059963900193,
+      "learning_rate": 0.0006810000000000001,
+      "loss": 6.6251,
+      "step": 227
+    },
+    {
+      "epoch": 0.00228,
+      "grad_norm": 0.6661980684886607,
+      "learning_rate": 0.000684,
+      "loss": 6.6314,
+      "step": 228
+    },
+    {
+      "epoch": 0.00229,
+      "grad_norm": 0.5755819115869941,
+      "learning_rate": 0.000687,
+      "loss": 6.6231,
+      "step": 229
+    },
+    {
+      "epoch": 0.0023,
+      "grad_norm": 0.48612508126201437,
+      "learning_rate": 0.0006900000000000001,
+      "loss": 6.6015,
+      "step": 230
+    },
+    {
+      "epoch": 0.00231,
+      "grad_norm": 0.42902206098374773,
+      "learning_rate": 0.000693,
+      "loss": 6.5844,
+      "step": 231
+    },
+    {
+      "epoch": 0.00232,
+      "grad_norm": 0.4617696239896233,
+      "learning_rate": 0.000696,
+      "loss": 6.5964,
+      "step": 232
+    },
+    {
+      "epoch": 0.00233,
+      "grad_norm": 0.397560103207551,
+      "learning_rate": 0.0006990000000000001,
+      "loss": 6.5819,
+      "step": 233
+    },
+    {
+      "epoch": 0.00234,
+      "grad_norm": 0.39436477469071923,
+      "learning_rate": 0.000702,
+      "loss": 6.5732,
+      "step": 234
+    },
+    {
+      "epoch": 0.00235,
+      "grad_norm": 0.37818254545129,
+      "learning_rate": 0.000705,
+      "loss": 6.5584,
+      "step": 235
+    },
+    {
+      "epoch": 0.00236,
+      "grad_norm": 0.39793300295732814,
+      "learning_rate": 0.000708,
+      "loss": 6.539,
+      "step": 236
+    },
+    {
+      "epoch": 0.00237,
+      "grad_norm": 0.32880148477167265,
+      "learning_rate": 0.0007109999999999999,
+      "loss": 6.5486,
+      "step": 237
+    },
+    {
+      "epoch": 0.00238,
+      "grad_norm": 0.33186649759843,
+      "learning_rate": 0.000714,
+      "loss": 6.5374,
+      "step": 238
+    },
+    {
+      "epoch": 0.00239,
+      "grad_norm": 0.3861082150171924,
+      "learning_rate": 0.000717,
+      "loss": 6.5195,
+      "step": 239
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.3818382545466509,
+      "learning_rate": 0.0007199999999999999,
+      "loss": 6.5368,
+      "step": 240
+    },
+    {
+      "epoch": 0.00241,
+      "grad_norm": 0.26517003068907236,
+      "learning_rate": 0.000723,
+      "loss": 6.5167,
+      "step": 241
+    },
+    {
+      "epoch": 0.00242,
+      "grad_norm": 0.30652105574179844,
+      "learning_rate": 0.000726,
+      "loss": 6.4934,
+      "step": 242
+    },
+    {
+      "epoch": 0.00243,
+      "grad_norm": 0.3382942246099826,
+      "learning_rate": 0.000729,
+      "loss": 6.4799,
+      "step": 243
+    },
+    {
+      "epoch": 0.00244,
+      "grad_norm": 0.4164388540502762,
+      "learning_rate": 0.000732,
+      "loss": 6.4843,
+      "step": 244
+    },
+    {
+      "epoch": 0.00245,
+      "grad_norm": 0.4035007765909141,
+      "learning_rate": 0.000735,
+      "loss": 6.4741,
+      "step": 245
+    },
+    {
+      "epoch": 0.00246,
+      "grad_norm": 0.4484502106991885,
+      "learning_rate": 0.000738,
+      "loss": 6.458,
+      "step": 246
+    },
+    {
+      "epoch": 0.00247,
+      "grad_norm": 0.6057401118193197,
+      "learning_rate": 0.000741,
+      "loss": 6.4543,
+      "step": 247
+    },
+    {
+      "epoch": 0.00248,
+      "grad_norm": 1.1775332556723501,
+      "learning_rate": 0.000744,
+      "loss": 6.4781,
+      "step": 248
+    },
+    {
+      "epoch": 0.00249,
+      "grad_norm": 1.0888595785598245,
+      "learning_rate": 0.000747,
+      "loss": 6.4635,
+      "step": 249
+    },
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.536872031808477,
+      "learning_rate": 0.00075,
+      "loss": 6.4479,
+      "step": 250
+    },
+    {
+      "epoch": 0.00251,
+      "grad_norm": 0.7926645181932281,
+      "learning_rate": 0.000753,
+      "loss": 6.4169,
+      "step": 251
+    },
+    {
+      "epoch": 0.00252,
+      "grad_norm": 0.6863348407685264,
+      "learning_rate": 0.000756,
+      "loss": 6.4273,
+      "step": 252
+    },
+    {
+      "epoch": 0.00253,
+      "grad_norm": 0.7123800606299509,
+      "learning_rate": 0.000759,
+      "loss": 6.4195,
+      "step": 253
+    },
+    {
+      "epoch": 0.00254,
+      "grad_norm": 0.839405849029746,
+      "learning_rate": 0.000762,
+      "loss": 6.4177,
+      "step": 254
+    },
+    {
+      "epoch": 0.00255,
+      "grad_norm": 0.76501143998226,
+      "learning_rate": 0.0007650000000000001,
+      "loss": 6.4159,
+      "step": 255
+    },
+    {
+      "epoch": 0.00256,
+      "grad_norm": 0.597608858095952,
+      "learning_rate": 0.000768,
+      "loss": 6.3889,
+      "step": 256
+    },
+    {
+      "epoch": 0.00257,
+      "grad_norm": 0.6526176951631347,
+      "learning_rate": 0.000771,
+      "loss": 6.3981,
+      "step": 257
+    },
+    {
+      "epoch": 0.00258,
+      "grad_norm": 0.651228257980475,
+      "learning_rate": 0.0007740000000000001,
+      "loss": 6.3725,
+      "step": 258
+    },
+    {
+      "epoch": 0.00259,
+      "grad_norm": 0.5603901273931662,
+      "learning_rate": 0.000777,
+      "loss": 6.3719,
+      "step": 259
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 0.41845020316479425,
+      "learning_rate": 0.0007800000000000001,
+      "loss": 6.3536,
+      "step": 260
+    },
+    {
+      "epoch": 0.00261,
+      "grad_norm": 0.5144884019867095,
+      "learning_rate": 0.0007830000000000001,
+      "loss": 6.3665,
+      "step": 261
+    },
+    {
+      "epoch": 0.00262,
+      "grad_norm": 0.5548811083770797,
+      "learning_rate": 0.000786,
+      "loss": 6.3412,
+      "step": 262
+    },
+    {
+      "epoch": 0.00263,
+      "grad_norm": 0.702068573310266,
+      "learning_rate": 0.0007890000000000001,
+      "loss": 6.353,
+      "step": 263
+    },
+    {
+      "epoch": 0.00264,
+      "grad_norm": 0.9481897048028406,
+      "learning_rate": 0.0007920000000000001,
+      "loss": 6.3404,
+      "step": 264
+    },
+    {
+      "epoch": 0.00265,
+      "grad_norm": 1.2297805755386195,
+      "learning_rate": 0.000795,
+      "loss": 6.3478,
+      "step": 265
+    },
+    {
+      "epoch": 0.00266,
+      "grad_norm": 0.5034998722006886,
+      "learning_rate": 0.0007980000000000001,
+      "loss": 6.3233,
+      "step": 266
+    },
+    {
+      "epoch": 0.00267,
+      "grad_norm": 0.8457797300321087,
+      "learning_rate": 0.0008010000000000001,
+      "loss": 6.3268,
+      "step": 267
+    },
+    {
+      "epoch": 0.00268,
+      "grad_norm": 0.7624901894608749,
+      "learning_rate": 0.000804,
+      "loss": 6.3226,
+      "step": 268
+    },
+    {
+      "epoch": 0.00269,
+      "grad_norm": 0.6803898428780553,
+      "learning_rate": 0.0008070000000000001,
+      "loss": 6.3045,
+      "step": 269
+    },
+    {
+      "epoch": 0.0027,
+      "grad_norm": 0.5891673657315365,
+      "learning_rate": 0.0008100000000000001,
+      "loss": 6.3,
+      "step": 270
+    },
+    {
+      "epoch": 0.00271,
+      "grad_norm": 0.6321969571669588,
+      "learning_rate": 0.000813,
+      "loss": 6.3081,
+      "step": 271
+    },
+    {
+      "epoch": 0.00272,
+      "grad_norm": 0.6080230974854919,
+      "learning_rate": 0.0008160000000000001,
+      "loss": 6.2911,
+      "step": 272
+    },
+    {
+      "epoch": 0.00273,
+      "grad_norm": 0.577176950863229,
+      "learning_rate": 0.0008190000000000001,
+      "loss": 6.2786,
+      "step": 273
+    },
+    {
+      "epoch": 0.00274,
+      "grad_norm": 0.46970800663022055,
+      "learning_rate": 0.000822,
+      "loss": 6.2573,
+      "step": 274
+    },
+    {
+      "epoch": 0.00275,
+      "grad_norm": 0.5095773122618286,
+      "learning_rate": 0.0008250000000000001,
+      "loss": 6.2676,
+      "step": 275
+    },
+    {
+      "epoch": 0.00276,
+      "grad_norm": 0.421367493059458,
+      "learning_rate": 0.0008280000000000001,
+      "loss": 6.2547,
+      "step": 276
+    },
+    {
+      "epoch": 0.00277,
+      "grad_norm": 0.4229723742956301,
+      "learning_rate": 0.0008310000000000001,
+      "loss": 6.2503,
+      "step": 277
+    },
+    {
+      "epoch": 0.00278,
+      "grad_norm": 0.4631977178825306,
+      "learning_rate": 0.0008340000000000001,
+      "loss": 6.2346,
+      "step": 278
+    },
+    {
+      "epoch": 0.00279,
+      "grad_norm": 0.41870110969580987,
+      "learning_rate": 0.0008370000000000001,
+      "loss": 6.2332,
+      "step": 279
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.4083314739680453,
+      "learning_rate": 0.0008400000000000001,
+      "loss": 6.2161,
+      "step": 280
+    },
+    {
+      "epoch": 0.00281,
+      "grad_norm": 0.42451645247510567,
+      "learning_rate": 0.0008430000000000001,
+      "loss": 6.2058,
+      "step": 281
+    },
+    {
+      "epoch": 0.00282,
+      "grad_norm": 0.4811013283391871,
+      "learning_rate": 0.000846,
+      "loss": 6.206,
+      "step": 282
+    },
+    {
+      "epoch": 0.00283,
+      "grad_norm": 0.6798083705841664,
+      "learning_rate": 0.0008489999999999999,
+      "loss": 6.2015,
+      "step": 283
+    },
+    {
+      "epoch": 0.00284,
+      "grad_norm": 1.0382201143248402,
+      "learning_rate": 0.0008519999999999999,
+      "loss": 6.2055,
+      "step": 284
+    },
+    {
+      "epoch": 0.00285,
+      "grad_norm": 1.115942818342409,
+      "learning_rate": 0.000855,
+      "loss": 6.2129,
+      "step": 285
+    },
+    {
+      "epoch": 0.00286,
+      "grad_norm": 0.8889955339821247,
+      "learning_rate": 0.0008579999999999999,
+      "loss": 6.187,
+      "step": 286
+    },
+    {
+      "epoch": 0.00287,
+      "grad_norm": 1.2422801265585652,
+      "learning_rate": 0.000861,
+      "loss": 6.209,
+      "step": 287
+    },
+    {
+      "epoch": 0.00288,
+      "grad_norm": 0.8315932342234975,
+      "learning_rate": 0.000864,
+      "loss": 6.174,
+      "step": 288
+    },
+    {
+      "epoch": 0.00289,
+      "grad_norm": 1.2914759013339998,
+      "learning_rate": 0.0008669999999999999,
+      "loss": 6.2078,
+      "step": 289
+    },
+    {
+      "epoch": 0.0029,
+      "grad_norm": 0.8376507056381004,
+      "learning_rate": 0.00087,
+      "loss": 6.1757,
+      "step": 290
+    },
+    {
+      "epoch": 0.00291,
+      "grad_norm": 0.8412780961911104,
+      "learning_rate": 0.000873,
+      "loss": 6.1658,
+      "step": 291
+    },
+    {
+      "epoch": 0.00292,
+      "grad_norm": 1.047021583757866,
+      "learning_rate": 0.0008759999999999999,
+      "loss": 6.1758,
+      "step": 292
+    },
+    {
+      "epoch": 0.00293,
+      "grad_norm": 0.8919470282886952,
+      "learning_rate": 0.000879,
+      "loss": 6.151,
+      "step": 293
+    },
+    {
+      "epoch": 0.00294,
+      "grad_norm": 0.665529628519212,
+      "learning_rate": 0.000882,
+      "loss": 6.159,
+      "step": 294
+    },
+    {
+      "epoch": 0.00295,
+      "grad_norm": 0.5169660787787601,
+      "learning_rate": 0.0008849999999999999,
+      "loss": 6.1239,
+      "step": 295
+    },
+    {
+      "epoch": 0.00296,
+      "grad_norm": 0.5611538425989948,
+      "learning_rate": 0.000888,
+      "loss": 6.1363,
+      "step": 296
+    },
+    {
+      "epoch": 0.00297,
+      "grad_norm": 0.46398604023920087,
+      "learning_rate": 0.000891,
+      "loss": 6.1045,
+      "step": 297
+    },
+    {
+      "epoch": 0.00298,
+      "grad_norm": 0.4361556326298739,
+      "learning_rate": 0.0008939999999999999,
+      "loss": 6.1198,
+      "step": 298
+    },
+    {
+      "epoch": 0.00299,
+      "grad_norm": 0.4319584905904094,
+      "learning_rate": 0.000897,
+      "loss": 6.0941,
+      "step": 299
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 0.4255386299160817,
+      "learning_rate": 0.0009,
+      "loss": 6.0936,
+      "step": 300
+    },
+    {
+      "epoch": 0.00301,
+      "grad_norm": 0.3316584659066082,
+      "learning_rate": 0.0009029999999999999,
+      "loss": 6.0857,
+      "step": 301
+    },
+    {
+      "epoch": 0.00302,
+      "grad_norm": 0.37299869635167304,
+      "learning_rate": 0.000906,
+      "loss": 6.0685,
+      "step": 302
+    },
+    {
+      "epoch": 0.00303,
+      "grad_norm": 0.40148217950038195,
+      "learning_rate": 0.000909,
+      "loss": 6.0805,
+      "step": 303
+    },
+    {
+      "epoch": 0.00304,
+      "grad_norm": 0.420191340163935,
+      "learning_rate": 0.000912,
+      "loss": 6.0758,
+      "step": 304
+    },
+    {
+      "epoch": 0.00305,
+      "grad_norm": 0.45307668264044143,
+      "learning_rate": 0.000915,
+      "loss": 6.0736,
+      "step": 305
+    },
+    {
+      "epoch": 0.00306,
+      "grad_norm": 0.6122731249830943,
+      "learning_rate": 0.000918,
+      "loss": 6.0651,
+      "step": 306
+    },
+    {
+      "epoch": 0.00307,
+      "grad_norm": 0.851197326038436,
+      "learning_rate": 0.000921,
+      "loss": 6.0633,
+      "step": 307
+    },
+    {
+      "epoch": 0.00308,
+      "grad_norm": 1.1284591769922636,
+      "learning_rate": 0.000924,
+      "loss": 6.0582,
+      "step": 308
+    },
+    {
+      "epoch": 0.00309,
+      "grad_norm": 0.9596545216263644,
+      "learning_rate": 0.000927,
+      "loss": 6.0709,
+      "step": 309
+    },
+    {
+      "epoch": 0.0031,
+      "grad_norm": 1.039007932956353,
+      "learning_rate": 0.00093,
+      "loss": 6.0624,
+      "step": 310
+    },
+    {
+      "epoch": 0.00311,
+      "grad_norm": 0.9855401820369791,
+      "learning_rate": 0.000933,
+      "loss": 6.0524,
+      "step": 311
+    },
+    {
+      "epoch": 0.00312,
+      "grad_norm": 1.0163701418335827,
+      "learning_rate": 0.000936,
+      "loss": 6.041,
+      "step": 312
+    },
+    {
+      "epoch": 0.00313,
+      "grad_norm": 1.0223663613867633,
+      "learning_rate": 0.0009390000000000001,
+      "loss": 6.0491,
+      "step": 313
+    },
+    {
+      "epoch": 0.00314,
+      "grad_norm": 1.0997292958340223,
+      "learning_rate": 0.000942,
+      "loss": 6.0641,
+      "step": 314
+    },
+    {
+      "epoch": 0.00315,
+      "grad_norm": 0.932276773939602,
+      "learning_rate": 0.000945,
+      "loss": 6.0354,
+      "step": 315
+    },
+    {
+      "epoch": 0.00316,
+      "grad_norm": 0.8624268848533463,
+      "learning_rate": 0.0009480000000000001,
+      "loss": 6.0096,
+      "step": 316
+    },
+    {
+      "epoch": 0.00317,
+      "grad_norm": 0.6867359398602113,
+      "learning_rate": 0.000951,
+      "loss": 6.0237,
+      "step": 317
+    },
+    {
+      "epoch": 0.00318,
+      "grad_norm": 0.5777711812516898,
+      "learning_rate": 0.000954,
+      "loss": 6.014,
+      "step": 318
+    },
+    {
+      "epoch": 0.00319,
+      "grad_norm": 0.5907201170091796,
+      "learning_rate": 0.0009570000000000001,
+      "loss": 6.0042,
+      "step": 319
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.5929910498481646,
+      "learning_rate": 0.00096,
+      "loss": 6.0021,
+      "step": 320
+    },
+    {
+      "epoch": 0.00321,
+      "grad_norm": 0.6559636432249029,
+      "learning_rate": 0.000963,
+      "loss": 5.9891,
+      "step": 321
+    },
+    {
+      "epoch": 0.00322,
+      "grad_norm": 0.5844133161497509,
+      "learning_rate": 0.0009660000000000001,
+      "loss": 5.9766,
+      "step": 322
+    },
+    {
+      "epoch": 0.00323,
+      "grad_norm": 0.5466606066369618,
+      "learning_rate": 0.000969,
+      "loss": 5.9736,
+      "step": 323
+    },
+    {
+      "epoch": 0.00324,
+      "grad_norm": 0.563270781105711,
+      "learning_rate": 0.0009720000000000001,
+      "loss": 5.9778,
+      "step": 324
+    },
+    {
+      "epoch": 0.00325,
+      "grad_norm": 0.5312990845923178,
+      "learning_rate": 0.0009750000000000001,
+      "loss": 5.9405,
+      "step": 325
+    },
+    {
+      "epoch": 0.00326,
+      "grad_norm": 0.5118566622058196,
+      "learning_rate": 0.0009780000000000001,
+      "loss": 5.9566,
+      "step": 326
+    },
+    {
+      "epoch": 0.00327,
+      "grad_norm": 0.5259315695578027,
+      "learning_rate": 0.000981,
+      "loss": 5.9568,
+      "step": 327
+    },
+    {
+      "epoch": 0.00328,
+      "grad_norm": 0.5410551164101933,
+      "learning_rate": 0.000984,
+      "loss": 5.9324,
+      "step": 328
+    },
+    {
+      "epoch": 0.00329,
+      "grad_norm": 0.48301414107035934,
+      "learning_rate": 0.000987,
+      "loss": 5.931,
+      "step": 329
+    },
+    {
+      "epoch": 0.0033,
+      "grad_norm": 0.5975532498257996,
+      "learning_rate": 0.00099,
+      "loss": 5.9265,
+      "step": 330
+    },
+    {
+      "epoch": 0.00331,
+      "grad_norm": 0.9746373555768076,
+      "learning_rate": 0.0009930000000000002,
+      "loss": 5.9381,
+      "step": 331
+    },
+    {
+      "epoch": 0.00332,
+      "grad_norm": 1.1994973147641799,
+      "learning_rate": 0.0009960000000000001,
+      "loss": 5.9385,
+      "step": 332
+    },
+    {
+      "epoch": 0.00333,
+      "grad_norm": 0.5144992648158865,
+      "learning_rate": 0.000999,
+      "loss": 5.8989,
+      "step": 333
+    },
+    {
+      "epoch": 0.00334,
+      "grad_norm": 0.9073672400240862,
+      "learning_rate": 0.001002,
+      "loss": 5.9262,
+      "step": 334
+    },
+    {
+      "epoch": 0.00335,
+      "grad_norm": 0.7561451103694465,
+      "learning_rate": 0.001005,
+      "loss": 5.9186,
+      "step": 335
+    },
+    {
+      "epoch": 0.00336,
+      "grad_norm": 0.7872757919528415,
+      "learning_rate": 0.001008,
+      "loss": 5.9134,
+      "step": 336
+    },
+    {
+      "epoch": 0.00337,
+      "grad_norm": 0.7536020827923614,
+      "learning_rate": 0.0010110000000000002,
+      "loss": 5.8884,
+      "step": 337
+    },
+    {
+      "epoch": 0.00338,
+      "grad_norm": 1.0523353883962923,
+      "learning_rate": 0.0010140000000000001,
+      "loss": 5.9132,
+      "step": 338
+    },
+    {
+      "epoch": 0.00339,
+      "grad_norm": 1.2857238182949966,
+      "learning_rate": 0.0010170000000000001,
+      "loss": 5.9334,
+      "step": 339
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 0.7361708327689132,
+      "learning_rate": 0.00102,
+      "loss": 5.885,
+      "step": 340
+    },
+    {
+      "epoch": 0.00341,
+      "grad_norm": 0.6901997441262301,
+      "learning_rate": 0.001023,
+      "loss": 5.869,
+      "step": 341
+    },
+    {
+      "epoch": 0.00342,
+      "grad_norm": 0.6381033550571615,
+      "learning_rate": 0.001026,
+      "loss": 5.8702,
+      "step": 342
+    },
+    {
+      "epoch": 0.00343,
+      "grad_norm": 0.6071718978500397,
+      "learning_rate": 0.0010290000000000002,
+      "loss": 5.8743,
+      "step": 343
+    },
+    {
+      "epoch": 0.00344,
+      "grad_norm": 0.5857796625429044,
+      "learning_rate": 0.001032,
+      "loss": 5.861,
+      "step": 344
+    },
+    {
+      "epoch": 0.00345,
+      "grad_norm": 0.626640702848716,
+      "learning_rate": 0.001035,
+      "loss": 5.8537,
+      "step": 345
+    },
+    {
+      "epoch": 0.00346,
+      "grad_norm": 0.6755670022907736,
+      "learning_rate": 0.0010379999999999999,
+      "loss": 5.8603,
+      "step": 346
+    },
+    {
+      "epoch": 0.00347,
+      "grad_norm": 0.9144508249400731,
+      "learning_rate": 0.001041,
+      "loss": 5.8338,
+      "step": 347
+    },
+    {
+      "epoch": 0.00348,
+      "grad_norm": 1.2125876856754099,
+      "learning_rate": 0.001044,
+      "loss": 5.8634,
+      "step": 348
+    },
+    {
+      "epoch": 0.00349,
+      "grad_norm": 0.6928695941460523,
+      "learning_rate": 0.001047,
+      "loss": 5.8236,
+      "step": 349
+    },
+    {
+      "epoch": 0.0035,
+      "grad_norm": 0.7654262923967496,
+      "learning_rate": 0.00105,
+      "loss": 5.8502,
+      "step": 350
+    },
+    {
+      "epoch": 0.00351,
+      "grad_norm": 0.8300223804260752,
+      "learning_rate": 0.001053,
+      "loss": 5.8507,
+      "step": 351
+    },
+    {
+      "epoch": 0.00352,
+      "grad_norm": 1.1393832643973667,
+      "learning_rate": 0.0010559999999999999,
+      "loss": 5.841,
+      "step": 352
+    },
+    {
+      "epoch": 0.00353,
+      "grad_norm": 0.7670875434573843,
+      "learning_rate": 0.001059,
+      "loss": 5.8302,
+      "step": 353
+    },
+    {
+      "epoch": 0.00354,
+      "grad_norm": 0.8617169684849714,
+      "learning_rate": 0.001062,
+      "loss": 5.8072,
+      "step": 354
+    },
+    {
+      "epoch": 0.00355,
+      "grad_norm": 0.8787230305505044,
+      "learning_rate": 0.001065,
+      "loss": 5.8221,
+      "step": 355
+    },
+    {
+      "epoch": 0.00356,
+      "grad_norm": 0.9037602020080988,
+      "learning_rate": 0.001068,
+      "loss": 5.8164,
+      "step": 356
+    },
+    {
+      "epoch": 0.00357,
+      "grad_norm": 0.785887699185926,
+      "learning_rate": 0.001071,
+      "loss": 5.8055,
+      "step": 357
+    },
+    {
+      "epoch": 0.00358,
+      "grad_norm": 0.6152742029666318,
+      "learning_rate": 0.001074,
+      "loss": 5.7887,
+      "step": 358
+    },
+    {
+      "epoch": 0.00359,
+      "grad_norm": 0.5372063086433791,
+      "learning_rate": 0.001077,
+      "loss": 5.78,
+      "step": 359
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 0.5078257678271803,
+      "learning_rate": 0.00108,
+      "loss": 5.7825,
+      "step": 360
+    },
+    {
+      "epoch": 0.00361,
+      "grad_norm": 0.4885651334266738,
+      "learning_rate": 0.001083,
+      "loss": 5.7748,
+      "step": 361
+    },
+    {
+      "epoch": 0.00362,
+      "grad_norm": 0.5495429650143561,
+      "learning_rate": 0.001086,
+      "loss": 5.7596,
+      "step": 362
+    },
+    {
+      "epoch": 0.00363,
+      "grad_norm": 0.5626950540152672,
+      "learning_rate": 0.001089,
+      "loss": 5.7515,
+      "step": 363
+    },
+    {
+      "epoch": 0.00364,
+      "grad_norm": 0.6199658617744055,
+      "learning_rate": 0.001092,
+      "loss": 5.766,
+      "step": 364
+    },
+    {
+      "epoch": 0.00365,
+      "grad_norm": 0.7392438146286566,
+      "learning_rate": 0.001095,
+      "loss": 5.7655,
+      "step": 365
+    },
+    {
+      "epoch": 0.00366,
+      "grad_norm": 0.9864875055616179,
+      "learning_rate": 0.001098,
+      "loss": 5.7524,
+      "step": 366
+    },
+    {
+      "epoch": 0.00367,
+      "grad_norm": 1.1449768044417052,
+      "learning_rate": 0.001101,
+      "loss": 5.7648,
+      "step": 367
+    },
+    {
+      "epoch": 0.00368,
+      "grad_norm": 0.9013400729864322,
+      "learning_rate": 0.001104,
+      "loss": 5.755,
+      "step": 368
+    },
+    {
+      "epoch": 0.00369,
+      "grad_norm": 0.9093377711089596,
+      "learning_rate": 0.001107,
+      "loss": 5.7659,
+      "step": 369
+    },
+    {
+      "epoch": 0.0037,
+      "grad_norm": 0.575923493278037,
+      "learning_rate": 0.00111,
+      "loss": 5.7328,
+      "step": 370
+    },
+    {
+      "epoch": 0.00371,
+      "grad_norm": 0.6737016310188764,
+      "learning_rate": 0.001113,
+      "loss": 5.7102,
+      "step": 371
+    },
+    {
+      "epoch": 0.00372,
+      "grad_norm": 0.4833347808689347,
+      "learning_rate": 0.001116,
+      "loss": 5.7236,
+      "step": 372
+    },
+    {
+      "epoch": 0.00373,
+      "grad_norm": 0.6361357392920576,
+      "learning_rate": 0.001119,
+      "loss": 5.7181,
+      "step": 373
+    },
+    {
+      "epoch": 0.00374,
+      "grad_norm": 0.6286357986456886,
+      "learning_rate": 0.001122,
+      "loss": 5.7192,
+      "step": 374
+    },
+    {
+      "epoch": 0.00375,
+      "grad_norm": 0.7140127832546589,
+      "learning_rate": 0.0011250000000000001,
+      "loss": 5.7248,
+      "step": 375
+    },
+    {
+      "epoch": 0.00376,
+      "grad_norm": 0.728891228424708,
+      "learning_rate": 0.001128,
+      "loss": 5.7207,
+      "step": 376
+    },
+    {
+      "epoch": 0.00377,
+      "grad_norm": 0.7251122752592066,
+      "learning_rate": 0.001131,
+      "loss": 5.7141,
+      "step": 377
+    },
+    {
+      "epoch": 0.00378,
+      "grad_norm": 0.8109517942362439,
+      "learning_rate": 0.001134,
+      "loss": 5.7121,
+      "step": 378
+    },
+    {
+      "epoch": 0.00379,
+      "grad_norm": 0.7548765882892476,
+      "learning_rate": 0.001137,
+      "loss": 5.6981,
+      "step": 379
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 0.5982490555873449,
+      "learning_rate": 0.00114,
+      "loss": 5.7128,
+      "step": 380
+    },
+    {
+      "epoch": 0.00381,
+      "grad_norm": 0.5479723067602768,
+      "learning_rate": 0.0011430000000000001,
+      "loss": 5.6793,
+      "step": 381
+    },
+    {
+      "epoch": 0.00382,
+      "grad_norm": 0.5400365110175976,
+      "learning_rate": 0.001146,
+      "loss": 5.6631,
+      "step": 382
+    },
+    {
+      "epoch": 0.00383,
+      "grad_norm": 0.4406698702316126,
+      "learning_rate": 0.001149,
+      "loss": 5.673,
+      "step": 383
+    },
+    {
+      "epoch": 0.00384,
+      "grad_norm": 0.5495584711003424,
+      "learning_rate": 0.001152,
+      "loss": 5.6782,
+      "step": 384
+    },
+    {
+      "epoch": 0.00385,
+      "grad_norm": 0.7047837665038742,
+      "learning_rate": 0.001155,
+      "loss": 5.6686,
+      "step": 385
+    },
+    {
+      "epoch": 0.00386,
+      "grad_norm": 1.0039450355838517,
+      "learning_rate": 0.001158,
+      "loss": 5.6846,
+      "step": 386
+    },
+    {
+      "epoch": 0.00387,
+      "grad_norm": 1.2552299335364856,
+      "learning_rate": 0.0011610000000000001,
+      "loss": 5.6713,
+      "step": 387
+    },
+    {
+      "epoch": 0.00388,
+      "grad_norm": 1.0630057286422998,
+      "learning_rate": 0.001164,
+      "loss": 5.6615,
+      "step": 388
+    },
+    {
+      "epoch": 0.00389,
+      "grad_norm": 1.4085777330550793,
+      "learning_rate": 0.001167,
+      "loss": 5.6963,
+      "step": 389
+    },
+    {
+      "epoch": 0.0039,
+      "grad_norm": 0.7893066659624004,
+      "learning_rate": 0.00117,
+      "loss": 5.651,
+      "step": 390
+    },
+    {
+      "epoch": 0.00391,
+      "grad_norm": 0.8891270576556106,
+      "learning_rate": 0.001173,
+      "loss": 5.666,
+      "step": 391
+    },
+    {
+      "epoch": 0.00392,
+      "grad_norm": 1.1058013192110903,
+      "learning_rate": 0.001176,
+      "loss": 5.6796,
+      "step": 392
+    },
+    {
+      "epoch": 0.00393,
+      "grad_norm": 1.170614508410806,
+      "learning_rate": 0.0011790000000000001,
+      "loss": 5.6646,
+      "step": 393
+    },
+    {
+      "epoch": 0.00394,
+      "grad_norm": 0.8391276502601887,
+      "learning_rate": 0.001182,
+      "loss": 5.6402,
+      "step": 394
+    },
+    {
+      "epoch": 0.00395,
+      "grad_norm": 0.9435882620236007,
+      "learning_rate": 0.001185,
+      "loss": 5.6277,
+      "step": 395
+    },
+    {
+      "epoch": 0.00396,
+      "grad_norm": 0.7925001626557522,
+      "learning_rate": 0.001188,
+      "loss": 5.6404,
+      "step": 396
+    },
+    {
+      "epoch": 0.00397,
+      "grad_norm": 0.8633162203152536,
+      "learning_rate": 0.001191,
+      "loss": 5.6366,
+      "step": 397
+    },
+    {
+      "epoch": 0.00398,
+      "grad_norm": 0.9359127674730449,
+      "learning_rate": 0.0011940000000000002,
+      "loss": 5.6437,
+      "step": 398
+    },
+    {
+      "epoch": 0.00399,
+      "grad_norm": 1.0926478209626875,
+      "learning_rate": 0.0011970000000000001,
+      "loss": 5.6494,
+      "step": 399
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8943926064407558,
+      "learning_rate": 0.0012000000000000001,
+      "loss": 5.6306,
+      "step": 400
+    },
+    {
+      "epoch": 0.00401,
+      "grad_norm": 1.211025202532141,
+      "learning_rate": 0.001203,
+      "loss": 5.6241,
+      "step": 401
+    },
+    {
+      "epoch": 0.00402,
+      "grad_norm": 0.8585006093020132,
+      "learning_rate": 0.001206,
+      "loss": 5.6196,
+      "step": 402
+    },
+    {
+      "epoch": 0.00403,
+      "grad_norm": 0.8708424012246267,
+      "learning_rate": 0.001209,
+      "loss": 5.618,
+      "step": 403
+    },
+    {
+      "epoch": 0.00404,
+      "grad_norm": 0.6771235232466043,
+      "learning_rate": 0.0012120000000000002,
+      "loss": 5.6057,
+      "step": 404
+    },
+    {
+      "epoch": 0.00405,
+      "grad_norm": 0.5089480196948696,
+      "learning_rate": 0.0012150000000000002,
+      "loss": 5.5986,
+      "step": 405
+    },
+    {
+      "epoch": 0.00406,
+      "grad_norm": 0.5324784457955185,
+      "learning_rate": 0.0012180000000000001,
+      "loss": 5.583,
+      "step": 406
+    },
+    {
+      "epoch": 0.00407,
+      "grad_norm": 0.4806328504890235,
+      "learning_rate": 0.0012209999999999999,
+      "loss": 5.575,
+      "step": 407
+    },
+    {
+      "epoch": 0.00408,
+      "grad_norm": 0.5340674298116082,
+      "learning_rate": 0.001224,
+      "loss": 5.5941,
+      "step": 408
+    },
+    {
+      "epoch": 0.00409,
+      "grad_norm": 0.6817510563164704,
+      "learning_rate": 0.001227,
+      "loss": 5.5739,
+      "step": 409
+    },
+    {
+      "epoch": 0.0041,
+      "grad_norm": 0.8230982603577015,
+      "learning_rate": 0.00123,
+      "loss": 5.5739,
+      "step": 410
+    },
+    {
+      "epoch": 0.00411,
+      "grad_norm": 0.7982833366881755,
+      "learning_rate": 0.001233,
+      "loss": 5.5886,
+      "step": 411
+    },
+    {
+      "epoch": 0.00412,
+      "grad_norm": 0.7882120301866252,
+      "learning_rate": 0.001236,
+      "loss": 5.5767,
+      "step": 412
+    },
+    {
+      "epoch": 0.00413,
+      "grad_norm": 0.9078160072473371,
+      "learning_rate": 0.0012389999999999999,
+      "loss": 5.5798,
+      "step": 413
+    },
+    {
+      "epoch": 0.00414,
+      "grad_norm": 0.8046291024914881,
+      "learning_rate": 0.001242,
+      "loss": 5.5582,
+      "step": 414
+    },
+    {
+      "epoch": 0.00415,
+      "grad_norm": 0.9449024826812693,
+      "learning_rate": 0.001245,
+      "loss": 5.5633,
+      "step": 415
+    },
+    {
+      "epoch": 0.00416,
+      "grad_norm": 0.9578358959807691,
+      "learning_rate": 0.001248,
+      "loss": 5.5489,
+      "step": 416
+    },
+    {
+      "epoch": 0.00417,
+      "grad_norm": 0.7364680005190741,
+      "learning_rate": 0.001251,
+      "loss": 5.575,
+      "step": 417
+    },
+    {
+      "epoch": 0.00418,
+      "grad_norm": 0.5600093653837771,
+      "learning_rate": 0.001254,
+      "loss": 5.5419,
+      "step": 418
+    },
+    {
+      "epoch": 0.00419,
+      "grad_norm": 0.7369458002937045,
+      "learning_rate": 0.0012569999999999999,
+      "loss": 5.535,
+      "step": 419
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 0.7566412883958042,
+      "learning_rate": 0.00126,
+      "loss": 5.5478,
+      "step": 420
+    },
+    {
+      "epoch": 0.00421,
+      "grad_norm": 0.9341471688658377,
+      "learning_rate": 0.001263,
+      "loss": 5.5468,
+      "step": 421
+    },
+    {
+      "epoch": 0.00422,
+      "grad_norm": 0.9387048270351058,
+      "learning_rate": 0.001266,
+      "loss": 5.5395,
+      "step": 422
+    },
+    {
+      "epoch": 0.00423,
+      "grad_norm": 0.738543672170714,
+      "learning_rate": 0.001269,
+      "loss": 5.5309,
+      "step": 423
+    },
+    {
+      "epoch": 0.00424,
+      "grad_norm": 0.879163854006119,
+      "learning_rate": 0.001272,
+      "loss": 5.5379,
+      "step": 424
+    },
+    {
+      "epoch": 0.00425,
+      "grad_norm": 0.878245832078137,
+      "learning_rate": 0.001275,
+      "loss": 5.5393,
+      "step": 425
+    },
+    {
+      "epoch": 0.00426,
+      "grad_norm": 0.8393572375675296,
+      "learning_rate": 0.001278,
+      "loss": 5.5388,
+      "step": 426
+    },
+    {
+      "epoch": 0.00427,
+      "grad_norm": 0.8175993205302655,
+      "learning_rate": 0.001281,
+      "loss": 5.5188,
+      "step": 427
+    },
+    {
+      "epoch": 0.00428,
+      "grad_norm": 0.8492227718152501,
+      "learning_rate": 0.001284,
+      "loss": 5.5011,
+      "step": 428
+    },
+    {
+      "epoch": 0.00429,
+      "grad_norm": 0.8455500448937461,
+      "learning_rate": 0.001287,
+      "loss": 5.5167,
+      "step": 429
+    },
+    {
+      "epoch": 0.0043,
+      "grad_norm": 0.9588196540360735,
+      "learning_rate": 0.00129,
+      "loss": 5.5126,
+      "step": 430
+    },
+    {
+      "epoch": 0.00431,
+      "grad_norm": 1.0358439149859766,
+      "learning_rate": 0.001293,
+      "loss": 5.5121,
+      "step": 431
+    },
+    {
+      "epoch": 0.00432,
+      "grad_norm": 0.853137595287236,
+      "learning_rate": 0.001296,
+      "loss": 5.5152,
+      "step": 432
+    },
+    {
+      "epoch": 0.00433,
+      "grad_norm": 0.9144896540159448,
+      "learning_rate": 0.001299,
+      "loss": 5.5075,
+      "step": 433
+    },
+    {
+      "epoch": 0.00434,
+      "grad_norm": 1.0340397416077374,
+      "learning_rate": 0.001302,
+      "loss": 5.5131,
+      "step": 434
+    },
+    {
+      "epoch": 0.00435,
+      "grad_norm": 1.1136200661191735,
+      "learning_rate": 0.001305,
+      "loss": 5.5153,
+      "step": 435
+    },
+    {
+      "epoch": 0.00436,
+      "grad_norm": 0.7998503424321469,
+      "learning_rate": 0.001308,
+      "loss": 5.4814,
+      "step": 436
+    },
+    {
+      "epoch": 0.00437,
+      "grad_norm": 0.8862208467810537,
+      "learning_rate": 0.001311,
+      "loss": 5.5052,
+      "step": 437
+    },
+    {
+      "epoch": 0.00438,
+      "grad_norm": 0.85557749799579,
+      "learning_rate": 0.001314,
+      "loss": 5.4855,
+      "step": 438
+    },
+    {
+      "epoch": 0.00439,
+      "grad_norm": 0.6596001138977952,
+      "learning_rate": 0.001317,
+      "loss": 5.5056,
+      "step": 439
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.5461926920380444,
+      "learning_rate": 0.00132,
+      "loss": 5.4734,
+      "step": 440
+    },
+    {
+      "epoch": 0.00441,
+      "grad_norm": 0.5325344576484976,
+      "learning_rate": 0.001323,
+      "loss": 5.4692,
+      "step": 441
+    },
+    {
+      "epoch": 0.00442,
+      "grad_norm": 0.46029396349038315,
+      "learning_rate": 0.0013260000000000001,
+      "loss": 5.4603,
+      "step": 442
+    },
+    {
+      "epoch": 0.00443,
+      "grad_norm": 0.5200620875251907,
+      "learning_rate": 0.001329,
+      "loss": 5.4641,
+      "step": 443
+    },
+    {
+      "epoch": 0.00444,
+      "grad_norm": 0.511034817927936,
+      "learning_rate": 0.001332,
+      "loss": 5.4632,
+      "step": 444
+    },
+    {
+      "epoch": 0.00445,
+      "grad_norm": 0.61375364791033,
+      "learning_rate": 0.001335,
+      "loss": 5.4483,
+      "step": 445
+    },
+    {
+      "epoch": 0.00446,
+      "grad_norm": 0.7540282970336214,
+      "learning_rate": 0.001338,
+      "loss": 5.4549,
+      "step": 446
+    },
+    {
+      "epoch": 0.00447,
+      "grad_norm": 0.7743861790351634,
+      "learning_rate": 0.001341,
+      "loss": 5.456,
+      "step": 447
+    },
+    {
+      "epoch": 0.00448,
+      "grad_norm": 0.6949785247448689,
+      "learning_rate": 0.0013440000000000001,
+      "loss": 5.4375,
+      "step": 448
+    },
+    {
+      "epoch": 0.00449,
+      "grad_norm": 0.8972954522362333,
+      "learning_rate": 0.001347,
+      "loss": 5.4453,
+      "step": 449
+    },
+    {
+      "epoch": 0.0045,
+      "grad_norm": 1.0136292885280909,
+      "learning_rate": 0.00135,
+      "loss": 5.4524,
+      "step": 450
+    },
+    {
+      "epoch": 0.00451,
+      "grad_norm": 0.7959348815359711,
+      "learning_rate": 0.001353,
+      "loss": 5.4372,
+      "step": 451
+    },
+    {
+      "epoch": 0.00452,
+      "grad_norm": 0.750530581913797,
+      "learning_rate": 0.001356,
+      "loss": 5.4212,
+      "step": 452
+    },
+    {
+      "epoch": 0.00453,
+      "grad_norm": 0.718332283553841,
+      "learning_rate": 0.001359,
+      "loss": 5.4094,
+      "step": 453
+    },
+    {
+      "epoch": 0.00454,
+      "grad_norm": 0.8243339574967999,
+      "learning_rate": 0.0013620000000000001,
+      "loss": 5.4327,
+      "step": 454
+    },
+    {
+      "epoch": 0.00455,
+      "grad_norm": 0.8060545663764288,
+      "learning_rate": 0.0013650000000000001,
+      "loss": 5.4278,
+      "step": 455
+    },
+    {
+      "epoch": 0.00456,
+      "grad_norm": 0.9387057405661987,
+      "learning_rate": 0.001368,
+      "loss": 5.4287,
+      "step": 456
+    },
+    {
+      "epoch": 0.00457,
+      "grad_norm": 1.110172512819111,
+      "learning_rate": 0.001371,
+      "loss": 5.4304,
+      "step": 457
+    },
+    {
+      "epoch": 0.00458,
+      "grad_norm": 0.7485802071411273,
+      "learning_rate": 0.001374,
+      "loss": 5.4279,
+      "step": 458
+    },
+    {
+      "epoch": 0.00459,
+      "grad_norm": 0.846395295484429,
+      "learning_rate": 0.0013770000000000002,
+      "loss": 5.4177,
+      "step": 459
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 1.2095188964594632,
+      "learning_rate": 0.0013800000000000002,
+      "loss": 5.4166,
+      "step": 460
+    },
+    {
+      "epoch": 0.00461,
+      "grad_norm": 1.1548058188436976,
+      "learning_rate": 0.0013830000000000001,
+      "loss": 5.417,
+      "step": 461
+    },
+    {
+      "epoch": 0.00462,
+      "grad_norm": 0.9626057997692408,
+      "learning_rate": 0.001386,
+      "loss": 5.4177,
+      "step": 462
+    },
+    {
+      "epoch": 0.00463,
+      "grad_norm": 1.1365427244526745,
+      "learning_rate": 0.001389,
+      "loss": 5.4083,
+      "step": 463
+    },
+    {
+      "epoch": 0.00464,
+      "grad_norm": 0.7154214826672701,
+      "learning_rate": 0.001392,
+      "loss": 5.4147,
+      "step": 464
+    },
+    {
+      "epoch": 0.00465,
+      "grad_norm": 0.5933778225768791,
+      "learning_rate": 0.0013950000000000002,
+      "loss": 5.3806,
+      "step": 465
+    },
+    {
+      "epoch": 0.00466,
+      "grad_norm": 0.6213055581786315,
+      "learning_rate": 0.0013980000000000002,
+      "loss": 5.3973,
+      "step": 466
+    },
+    {
+      "epoch": 0.00467,
+      "grad_norm": 0.5608640811659587,
+      "learning_rate": 0.0014010000000000001,
+      "loss": 5.3871,
+      "step": 467
+    },
+    {
+      "epoch": 0.00468,
+      "grad_norm": 0.4459725756410885,
+      "learning_rate": 0.001404,
+      "loss": 5.3713,
+      "step": 468
+    },
+    {
+      "epoch": 0.00469,
+      "grad_norm": 0.46857319789964524,
+      "learning_rate": 0.001407,
+      "loss": 5.3733,
+      "step": 469
+    },
+    {
+      "epoch": 0.0047,
+      "grad_norm": 0.4864537455422831,
+      "learning_rate": 0.00141,
+      "loss": 5.3823,
+      "step": 470
+    },
+    {
+      "epoch": 0.00471,
+      "grad_norm": 0.5233417273033707,
+      "learning_rate": 0.001413,
+      "loss": 5.3595,
+      "step": 471
+    },
+    {
+      "epoch": 0.00472,
+      "grad_norm": 0.7276814872840428,
+      "learning_rate": 0.001416,
+      "loss": 5.376,
+      "step": 472
+    },
+    {
+      "epoch": 0.00473,
+      "grad_norm": 0.9313958457119089,
+      "learning_rate": 0.001419,
+      "loss": 5.3908,
+      "step": 473
+    },
+    {
+      "epoch": 0.00474,
+      "grad_norm": 0.9969581851520253,
+      "learning_rate": 0.0014219999999999999,
+      "loss": 5.3782,
+      "step": 474
+    },
+    {
+      "epoch": 0.00475,
+      "grad_norm": 0.7172709684261298,
+      "learning_rate": 0.001425,
+      "loss": 5.3626,
+      "step": 475
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 100000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 25,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.88307303563264e+16,
+  "train_batch_size": 512,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-475/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,674 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import json
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
@dataclass
class zero_model_state:
    """Subset of one ``*_model_states.pt`` file that is needed to rebuild the weights.

    The annotations are informational only (``dataclass`` does not enforce them). They
    are real types rather than ``dict()`` call expressions, which merely evaluated to
    throw-away empty dict instances at class-creation time.
    """
    # buffer name -> tensor
    buffers: dict
    # iterated by the callers as a sequence of {param name: shape} mappings, one per param group
    param_shapes: list
    # list of two-element [name, name] pairs describing shared parameters
    shared_params: list
    # NOTE(review): value is taken as-is from the checkpoint and may be None - confirm the real type
    ds_version: int
    # frozen param name -> shape; may be None when the checkpoint has no such entry
    frozen_param_shapes: dict
    # frozen param name -> tensor fragment; may be None when the checkpoint has no such entry
    frozen_param_fragments: dict
# Verbosity switch: any truthy value makes the reconstruction helpers print diagnostics.
debug = 0
# Checkpoint shards are mapped onto the CPU when they are loaded.
device = torch.device("cpu")
def atoi(text):
    """Convert ``text`` to ``int`` when it consists solely of decimal digits.

    Any other string, including the empty string, is returned unchanged.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because the latter also
    accepts characters such as superscript digits, which ``int()`` cannot parse
    and which would therefore raise ``ValueError``.
    """
    return int(text) if text.isdecimal() else text
def natural_keys(text):
    """
    Sort key producing "human" ordering, for use as ``alist.sort(key=natural_keys)``.

    ``text`` is split into alternating non-digit and digit chunks and every chunk is
    passed through ``atoi``, so runs of digits compare as numbers rather than as text.
    """
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the rank-0 model states file for the given ZeRO stage.

    Args:
        checkpoint_dir: directory expected to contain the model states file.
        zero_stage: ZeRO stage of the checkpoint; stages <= 2 and stage 3 are supported.

    Returns:
        The path of the existing model states file.

    Raises:
        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the file is missing.
        ValueError: if ``zero_stage`` is not a supported stage.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch `file` stays unbound and the check below dies with UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural order.

    Args:
        checkpoint_dir: directory to search; it is taken literally, never as a pattern.
        glob_pattern: glob pattern applied to the file names inside ``checkpoint_dir``.

    Returns:
        The matching paths sorted with ``natural_keys``.

    Raises:
        FileNotFoundError: if nothing matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    # Escape the directory part so that glob metacharacters in the path (e.g. "run[1]")
    # are matched literally instead of being interpreted as part of the pattern.
    search_pattern = os.path.join(glob.escape(checkpoint_dir), glob_pattern)
    ckpt_files = sorted(glob.glob(search_pattern), key=natural_keys)

    if len(ckpt_files) == 0:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files
def get_optim_files(checkpoint_dir):
    """Collect the ``*_optim_states.pt`` files of ``checkpoint_dir`` via ``get_checkpoint_files``."""
    optim_glob = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_glob)
def get_model_state_files(checkpoint_dir):
    """Collect the ``*_model_states.pt`` files of ``checkpoint_dir`` via ``get_checkpoint_files``."""
    model_states_glob = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_states_glob)
def parse_model_states(files):
    """Load every model states file and keep only what weight reconstruction needs.

    Args:
        files: paths of the ``*_model_states.pt`` files to read.

    Returns:
        A list holding one ``zero_model_state`` per input file, in input order.

    Raises:
        ValueError: if a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint.
    """
    zero_model_states = []
    for file in files:
        # NOTE(security): these checkpoints are pickled with DeepSpeed data structures, so a full
        # (non weights-only) unpickle is required - only load checkpoints from a trusted source.
        # weights_only is spelled out because newer PyTorch releases no longer default to False.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional; both entries are None when the checkpoint lacks them
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
def parse_optim_states(files, ds_checkpoint_dir):
    """Load the optimizer state shards and extract the fp32 master-weight partitions.

    Args:
        files: paths of the ``*_optim_states.pt`` files, one per data-parallel rank.
        ds_checkpoint_dir: checkpoint directory; only used in the error message.

    Returns:
        A ``(zero_stage, world_size, fp32_flat_groups)`` tuple. ``fp32_flat_groups`` has one
        entry per file: for stages <= 2 the stored list of per-group partitions, for stage 3
        a single tensor made by concatenating that rank's groups.

    Raises:
        ValueError: if the first file is not a zero checkpoint, if the number of files does
            not match the recorded partition count, or if the zero stage is unknown.
    """
    total_files = len(files)
    state_dicts = []
    for f in files:
        # NOTE(security): these checkpoints are pickled with DeepSpeed data structures, so a full
        # (non weights-only) unpickle is required - only load checkpoints from a trusted source.
        # weights_only is spelled out because newer PyTorch releases no longer default to False.
        state_dict = torch.load(f, map_location=device, weights_only=False)
        # immediately discard the 2 potentially huge optimizer states, as we only care for the fp32
        # master weights; also handles the case where they were already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Rebuild an fp32 state_dict from the shards of a DeepSpeed ZeRO checkpoint.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific merge routine

    Returns:
        The result of the stage-specific merge routine (``None`` when the detected stage is
        neither <= 2 nor 3).
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    # optimizer shards: zero stage, data-parallel world size and the fp32 partitions
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    # model state shards: buffers, shapes, shared and frozen parameter info
    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    merge_args = (world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(*merge_args)
    if zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(*merge_args)
    return None
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy the frozen parameters recorded by rank 0 into ``state_dict`` (in place).

    Does nothing when rank 0 has no frozen parameter shapes. Each stored fragment is
    inserted unchanged under its parameter name; a summary line is printed at the end.
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return

    frozen_param_shapes = rank0_state.frozen_param_shapes
    frozen_param_fragments = rank0_state.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name in frozen_param_shapes:
        shape = frozen_param_shapes[name]
        unpartitioned_numel = shape.numel()

        total_params += 1
        total_numel += unpartitioned_numel

        # the fragment is stored whole, so it is taken over as-is
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild the trainable parameters from the per-rank fp32 partitions (in place).

    For every param group the partitions of all ranks are concatenated into one flat
    vector, which is then cut into the individual parameters using the shapes recorded
    by rank 0. Each parameter is stored in ``state_dict`` as a view into that vector.

    Raises:
        ValueError: if, after alignment, the consumed element count of a group does not
            match the number of elements available in its flat vector.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(num_param_groups)
    ]
    avail_numel = sum(flat_vector.numel() for flat_vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()

        for name, shape in shapes.items():
            # shapes either expose numel() or are plain sequences of dimensions
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

            flat_param = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel)
            state_dict[name] = flat_param.view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned  offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full fp32 state_dict for a ZeRO stage <= 2 checkpoint.

    Entries are added in this order: rank-0 buffers, frozen parameters (unless
    ``exclude_frozen_parameters`` is truthy), trainable parameters, and finally the
    shared-parameter aliases whose source entry is present.

    Returns:
        An ``OrderedDict`` mapping names to tensors.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in rank0_state.shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        state_dict_split = split_torch_state_dict_into_shards(state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, output_path)
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

checkpoint-49875/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "JW17/SmolLM-14m-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "flash_attn": true,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "is_llama_config": true,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "vocab_size": 50280
+}

checkpoint-49875/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.3"
+}

checkpoint-49875/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step49875