In [1]:
model_name_or_path = "TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T" #@param {type:"string"}
model_name = model_name_or_path.split("/")[-1]

save_mistral_dir = "/content/tiny_mistral" #@param {type:"string"}

mixtral_num_experts = 8 #@param {type:"integer"}
save_mixtral_dir = "/content/tiny_mixtral_x" #@param {type:"string"}


In [None]:
!pip install transformers --upgrade
!pip install torch safetensors

In [None]:
!git clone https://huggingface.co./{model_name_or_path}

In [3]:
import json
import torch

# load config.json
with open(f"{model_name}/config.json") as f:
 config = json.load(f)

print(config)

mistral_config = {
 "architectures": [
 "MistralForCausalLM"
 ],
 "attention_dropout": 0.0,
 "bos_token_id": 1,
 "eos_token_id": 2,
 "hidden_act": "silu",
 "hidden_size": 4096,
 "initializer_range": 0.02,
 "intermediate_size": 14336,
 "max_position_embeddings": 32768,
 "model_type": "mistral",
 "num_attention_heads": 32,
 "num_hidden_layers": 32,
 "num_key_value_heads": 8,
 "rms_norm_eps": 1e-05,
 "rope_theta": 1000000.0,
 "sliding_window": None,
 "tie_word_embeddings": False,
 # "torch_dtype": "bfloat16",
 "transformers_version": "4.36.0",
 "use_cache": True,
 "vocab_size": 32000
}
mistral_config["architectures"] = ["MistralForCausalLM"]
mistral_config["model_type"] = "mistral"
mistral_config["bos_token_id"] = config["bos_token_id"]
mistral_config["eos_token_id"] = config["eos_token_id"]
mistral_config["hidden_act"] = config["hidden_act"]
mistral_config["hidden_size"] = config["hidden_size"]
mistral_config["initializer_range"] = config["initializer_range"]
mistral_config["intermediate_size"] = config["intermediate_size"]
mistral_config["max_position_embeddings"] = config["max_position_embeddings"]
mistral_config["num_attention_heads"] = config["num_attention_heads"]
mistral_config["num_hidden_layers"] = config["num_hidden_layers"]
mistral_config["num_key_value_heads"] = config["num_key_value_heads"]
mistral_config["rms_norm_eps"] = config["rms_norm_eps"]
mistral_config["rope_theta"] = 1000000.0
mistral_config["sliding_window"] = None
mistral_config["tie_word_embeddings"] = config["tie_word_embeddings"]
mistral_config["torch_dtype"] = config["torch_dtype"]
mistral_config["transformers_version"] = "4.36.0"
mistral_config["use_cache"] = config["use_cache"]
mistral_config["vocab_size"] = config["vocab_size"]

# save tokenizer and model
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.save_pretrained(save_mistral_dir)

from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
if model.dtype == torch.float32:
 model.half()
 model.to(torch.bfloat16)
 mistral_config["torch_dtype"] = "bfloat16"

model.save_pretrained(save_mistral_dir)

# save convert mistral config
with open(f"{save_mistral_dir}/config.json", "w") as f:
 json.dump(mistral_config, f, indent=2)


{'_name_or_path': 'meta-llama/Llama-2-7b-hf', 'architectures': ['LlamaForCausalLM'], 'bos_token_id': 1, 'eos_token_id': 2, 'hidden_act': 'silu', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 5632, 'max_position_embeddings': 2048, 'model_type': 'llama', 'num_attention_heads': 32, 'num_hidden_layers': 22, 'num_key_value_heads': 4, 'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_scaling': None, 'tie_word_embeddings': False, 'torch_dtype': 'float32', 'transformers_version': '4.31.0.dev0', 'use_cache': True, 'vocab_size': 32000}


In [5]:
#
# モデルの出力テスト
#
from transformers import AutoModelForCausalLM, MistralForCausalLM, MixtralForCausalLM
def test_gen(model_name_or_path):

 device = "cpu" # ここを変えてね

 model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

 print("check model load ")
 print(model.config)
 print(model)

 print("check model generate text")
 messages = [
 {"role": "user", "content": "What is your favourite condiment?"},
 {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
 {"role": "user", "content": "Do you have mayonnaise recipes?"}
 ]

 encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

 model_inputs = encodeds.to(device)
 model.to(device)

 generated_ids = model.generate(model_inputs, max_new_tokens=128, do_sample=True)
 decoded = tokenizer.batch_decode(generated_ids)
 print(decoded[0])
 print("------------------------")
 return model, tokenizer

_ , _ = test_gen(save_mistral_dir)


check model load 
MistralConfig {
 "_name_or_path": "/content/tiny_mistral",
 "architectures": [
 "MistralForCausalLM"
 ],
 "attention_dropout": 0.0,
 "bos_token_id": 1,
 "eos_token_id": 2,
 "hidden_act": "silu",
 "hidden_size": 2048,
 "initializer_range": 0.02,
 "intermediate_size": 5632,
 "max_position_embeddings": 2048,
 "model_type": "mistral",
 "num_attention_heads": 32,
 "num_hidden_layers": 22,
 "num_key_value_heads": 4,
 "rms_norm_eps": 1e-05,
 "rope_theta": 1000000.0,
 "sliding_window": null,
 "tie_word_embeddings": false,
 "torch_dtype": "bfloat16",
 "transformers_version": "4.36.1",
 "use_cache": true,
 "vocab_size": 32000
}

MistralForCausalLM(
 (model): MistralModel(
 (embed_tokens): Embedding(32000, 2048)
 (layers): ModuleList(
 (0-21): 22 x MistralDecoderLayer(
 (self_attn): MistralAttention(
 (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
 (k_proj): Linear(in_features=2048, out_features=256, bias=False)
 (v_proj): Linear(in_features=2048, out_features

In [6]:
#
# mixtral config setting
#

mixtral_config = {
 "architectures": [
 "MixtralForCausalLM"
 ],
 "attention_dropout": 0.0,
 "bos_token_id": 1,
 "eos_token_id": 2,
 "hidden_act": "silu",
 "hidden_size": 4096,
 "initializer_range": 0.02,
 "intermediate_size": 14336,
 "max_position_embeddings": 32768,
 "model_type": "mixtral",
 "num_attention_heads": 32,
 "num_experts_per_tok": 2,
 "num_hidden_layers": 32,
 "num_key_value_heads": 8,
 "num_local_experts": 8,
 "output_router_logits": False,
 "rms_norm_eps": 1e-05,
 "rope_theta": 1000000.0,
 "router_aux_loss_coef": 0.02,
 "sliding_window": None,
 "tie_word_embeddings": False,
 "torch_dtype": "bfloat16",
 "transformers_version": "4.36.0.dev0",
 "use_cache": True,
 "vocab_size": 32000
}

mixtral_config["architectures"] = ["MixtralForCausalLM"]
mixtral_config["model_type"] = "mixtral"
mixtral_config["num_experts_per_tok"] = 2
mixtral_config["num_local_experts"] = mixtral_num_experts

mixtral_config["bos_token_id"] = mistral_config["bos_token_id"]
mixtral_config["eos_token_id"] = mistral_config["eos_token_id"]
mixtral_config["hidden_act"] = mistral_config["hidden_act"]
mixtral_config["hidden_size"] = mistral_config["hidden_size"]
mixtral_config["initializer_range"] = mistral_config["initializer_range"]
mixtral_config["intermediate_size"] = mistral_config["intermediate_size"]
mixtral_config["max_position_embeddings"] = mistral_config["max_position_embeddings"]
mixtral_config["num_attention_heads"] = mistral_config["num_attention_heads"]
mixtral_config["num_hidden_layers"] = mistral_config["num_hidden_layers"]
mixtral_config["num_key_value_heads"] = mistral_config["num_key_value_heads"]
mixtral_config["rms_norm_eps"] = mistral_config["rms_norm_eps"]
mixtral_config["rope_theta"] = mistral_config["rope_theta"]
mixtral_config["sliding_window"] = mistral_config["sliding_window"]
mixtral_config["tie_word_embeddings"] = mistral_config["tie_word_embeddings"]
mixtral_config["torch_dtype"] = mistral_config["torch_dtype"]
mixtral_config["transformers_version"] = "4.36.0.dev0"
mixtral_config["use_cache"] = mistral_config["use_cache"]
mixtral_config["vocab_size"] = mistral_config["vocab_size"]

print(json.dumps(mixtral_config,indent=2))

# configをsave
!mkdir -p {save_mixtral_dir}
with open(f"{save_mixtral_dir}/config.json", "w") as f:
 json.dump(mixtral_config, f, indent=2)


{
 "architectures": [
 "MixtralForCausalLM"
 ],
 "attention_dropout": 0.0,
 "bos_token_id": 1,
 "eos_token_id": 2,
 "hidden_act": "silu",
 "hidden_size": 2048,
 "initializer_range": 0.02,
 "intermediate_size": 5632,
 "max_position_embeddings": 2048,
 "model_type": "mixtral",
 "num_attention_heads": 32,
 "num_experts_per_tok": 2,
 "num_hidden_layers": 22,
 "num_key_value_heads": 4,
 "num_local_experts": 8,
 "output_router_logits": false,
 "rms_norm_eps": 1e-05,
 "rope_theta": 1000000.0,
 "router_aux_loss_coef": 0.02,
 "sliding_window": null,
 "tie_word_embeddings": false,
 "torch_dtype": "bfloat16",
 "transformers_version": "4.36.0.dev0",
 "use_cache": true,
 "vocab_size": 32000
}


In [7]:
# copy other model files

# save tokenizer
if tokenizer is None:
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

tokenizer.save_pretrained(save_mixtral_dir)

!cp {save_mistral_dir}/generation_config.json {save_mixtral_dir}/generation_config.json


In [8]:
# convert mixtral clone
import torch
from safetensors import safe_open
from safetensors.torch import save_file
import re

def convert_weight_name(mistral_key, mixtral_expert_num):
 if "mlp.gate_proj" in mistral_key:
 return mistral_key.replace(".mlp.gate_proj.", f".block_sparse_moe.experts.{mixtral_expert_num}.w1.")
 elif "mlp.down_proj" in mistral_key:
 return mistral_key.replace(".mlp.down_proj.", f".block_sparse_moe.experts.{mixtral_expert_num}.w2.")
 elif "mlp.up_proj" in mistral_key:
 return mistral_key.replace(".mlp.up_proj." , f".block_sparse_moe.experts.{mixtral_expert_num}.w3.")
 else:
 return mistral_key

def is_experts_key(mistral_key):
 return ".mlp." in mistral_key

def get_layer(mistral_key):
 layer = re.match(r'model[.]layers[.]\d+[.]', mistral_key)
 if layer is not None:
 return int(re.findall(r'\d+', layer[0])[0])
 return None

def get_weight_byte_size(weight):

 if isinstance(weight, torch.Tensor):
 weight_byte_size = weight.nelement() * weight.element_size()
 else:
 weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())

 return weight_byte_size

# mistralのweight取得
mistral_weights = safe_open(save_mistral_dir + "/model.safetensors", framework="pt")
# print(mistral_weights.keys())

first_weights = {}

gate_shape = mistral_weights.get_tensor("model.layers.0.mlp.up_proj.weight").shape
gate_tensor = torch.full((mixtral_num_experts, gate_shape[1]), 0.5)

common_layer_weights = {}

print("mixtral_num_experts", mixtral_num_experts, "gate_shape[1]", gate_shape[1], "gate_tensor", gate_tensor)

# max layer
max_layer_no = 0
for key in mistral_weights.keys():
 layer_no = get_layer(key)
 if layer_no is None:
 first_weights[key] = mistral_weights.get_tensor(key)
 else:
 max_layer_no = max(max_layer_no, layer_no)

mixtral_weight_map = {
 "metadata": {
 "total_size": 0
 },
 "weight_map": {
 }
}

total_size = 0

!rm {save_mixtral_dir + "/*.safetensors"}

for i in range(max_layer_no + 1):
 weight_file_no = i + 1
 layer_weights = {}

 # first weight
 if weight_file_no == 1:
 for key in first_weights.keys():
 mixtral_key = convert_weight_name(key, 0)
 layer_weights[mixtral_key] = first_weights[mixtral_key]
 total_size += get_weight_byte_size(layer_weights[mixtral_key])
 print("first", mixtral_key, layer_weights[mixtral_key].shape)


 for key in mistral_weights.keys():

 lk = re.match(re.compile(f"model[.]layers[.]{i}[.]"), key)
 if lk is not None:
 mistral_layer_key = key
 if not is_experts_key(mistral_layer_key):
 mixtral_key = convert_weight_name(mistral_layer_key, 0)
 layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key)
 total_size += get_weight_byte_size(layer_weights[mixtral_key])
 print("layer", i , mixtral_key, layer_weights[mixtral_key].shape)
 else:
 print("gen experts")
 for expert_no in range(mixtral_num_experts):
 mixtral_key = convert_weight_name(mistral_layer_key, expert_no)
 layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key).clone()
 total_size += get_weight_byte_size(layer_weights[mixtral_key])
 print("layer", i , "expert", expert_no, mixtral_key, layer_weights[mixtral_key].shape)

 # gate
 mixtral_key = f"model.layers.{i}.block_sparse_moe.gate.weight"
 layer_weights[mixtral_key] = gate_tensor.clone()
 total_size += get_weight_byte_size(layer_weights[mixtral_key])
 print("layer", i , "gate", mixtral_key, layer_weights[mixtral_key].shape)

 #フォーマットで0埋め
 tensor_weight_file_name = f"model.layers.{weight_file_no:05d}-of-{max_layer_no + 1:05d}.safetensors"

 # save safetensor
 save_file(layer_weights, save_mixtral_dir + "/" + tensor_weight_file_name, metadata={"format":"pt"})
 print("Save layer weighs", i, tensor_weight_file_name)

 for key in layer_weights.keys():
 mixtral_weight_map["weight_map"][key] = tensor_weight_file_name

 print(i, tensor_weight_file_name)

# set total size
mixtral_weight_map["metadata"]["total_size"] = total_size

# save model.safetensors.index.json
with open(save_mixtral_dir + "/model.safetensors.index.json", "w") as f:
 json.dump(mixtral_weight_map, f, indent=2)

print(mixtral_weight_map)


mixtral_num_experts 8 gate_shape[1] 2048 gate_tensor tensor([[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],
 [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],
 [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],
 ...,
 [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],
 [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],
 [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]])
first lm_head.weight torch.Size([32000, 2048])
first model.embed_tokens.weight torch.Size([32000, 2048])
first model.norm.weight torch.Size([2048])
layer 0 model.layers.0.input_layernorm.weight torch.Size([2048])
gen experts
layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])
layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])
layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])
layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 563

In [9]:
# check model
mx_model, mx_tok = test_gen(save_mixtral_dir)

Loading checkpoint shards: 0%| | 0/22 [00:00, ?it/s]

check model load 
MixtralConfig {
 "_name_or_path": "/content/tiny_mixtral_x",
 "architectures": [
 "MixtralForCausalLM"
 ],
 "attention_dropout": 0.0,
 "bos_token_id": 1,
 "eos_token_id": 2,
 "hidden_act": "silu",
 "hidden_size": 2048,
 "initializer_range": 0.02,
 "intermediate_size": 5632,
 "max_position_embeddings": 2048,
 "model_type": "mixtral",
 "num_attention_heads": 32,
 "num_experts_per_tok": 2,
 "num_hidden_layers": 22,
 "num_key_value_heads": 4,
 "num_local_experts": 8,
 "output_router_logits": false,
 "rms_norm_eps": 1e-05,
 "rope_theta": 1000000.0,
 "router_aux_loss_coef": 0.02,
 "sliding_window": null,
 "tie_word_embeddings": false,
 "torch_dtype": "bfloat16",
 "transformers_version": "4.36.1",
 "use_cache": true,
 "vocab_size": 32000
}

MixtralForCausalLM(
 (model): MixtralModel(
 (embed_tokens): Embedding(32000, 2048)
 (layers): ModuleList(
 (0-21): 22 x MixtralDecoderLayer(
 (self_attn): MixtralAttention(
 (q_proj): Linear(in_features=2048, out_features=2048, bias=Fals

In [None]:
# from google.colab import userdata
# !huggingface-cli login --token {userdata.get('HUGGINGFACE_ACCESS_TOKEN')}

In [None]:
# huggingface_repo = "TinyMixtral-x8-Clonebase-7b"
# mx_model.push_to_hub(huggingface_repo, private=True)
# mx_tok.push_to_hub(huggingface_repo, private=True)