|
import os |
|
import torch |
|
from transformers import MllamaForConditionalGeneration, MllamaProcessor, AutoModelForCausalLM |
|
|
|
|
|
|
|
# --- Model locations -------------------------------------------------------
# Donor multimodal checkpoint: Llama 3.2 90B Vision Instruct (vision + text).
multimodal_model_path = "models/meta-llama-Llama-3.2-90B-Vision-Instruct"

# Source of the replacement language weights (a Llama 3.1 70B text-only model).
text_model_path = "models/path_to_Llama3.1_70B"

# Output directory for the merged checkpoint.
save_path = "models/merged_model"

# Load both models on CPU in bfloat16: the merge below is pure tensor copying,
# so no accelerator is needed, and keeping everything on CPU avoids
# device-memory pressure for these very large checkpoints.
multimodal_model = MllamaForConditionalGeneration.from_pretrained(multimodal_model_path, device_map="cpu", torch_dtype=torch.bfloat16)

multimodal_processor = MllamaProcessor.from_pretrained(multimodal_model_path)

text_model = AutoModelForCausalLM.from_pretrained(text_model_path, device_map="cpu", torch_dtype=torch.bfloat16)
|
|
|
# state_dict() returns detached views that share storage with the model's
# parameters, so the in-place copy_() calls later in this script write
# directly into the models' weights.
state_dict_multimodal = multimodal_model.state_dict()

state_dict_text = text_model.state_dict()

# Decoder depth of each model. The script assumes the multimodal text stack is
# deeper because it interleaves extra cross-attention layers among the
# self-attention layers (verified by the layer-count assertion below).
num_decoder_layers_text = text_model.config.num_hidden_layers

num_decoder_layers_vision = multimodal_model.config.text_config.num_hidden_layers
|
|
|
|
|
# Indices (within the multimodal language decoder) of the extra
# cross-attention layers. These layers have no counterpart in the text-only
# model and must be preserved untouched by the merge.
#
# The original loop re-tested `".layers." in key` in a ternary even though the
# enclosing condition already guaranteed it (the `else None` arm was dead
# code); a set comprehension expresses the same filter directly.
inserted_layers = {
    int(key.split(".layers.")[1].split(".")[0])
    for key in state_dict_multimodal
    if "language_model" in key and "cross_attn" in key and ".layers." in key
}
|
|
|
|
|
|
|
|
|
assert len(inserted_layers) == num_decoder_layers_vision-num_decoder_layers_text, "# of added layers do not match" |
|
|
|
|
|
# Map each self-attention layer index of the multimodal decoder to the index
# of the corresponding layer in the text-only decoder. Slots occupied by
# cross-attention layers are skipped, so the remaining layers line up
# one-to-one in order.
layer_map = {}
mm_idx = 0
for text_idx in range(num_decoder_layers_text):
    # Step over any cross-attention slots before assigning this mapping.
    while mm_idx in inserted_layers:
        mm_idx += 1
    layer_map[mm_idx] = text_idx
    mm_idx += 1
|
|
|
# --- Transplant the text-only weights into the multimodal language tower ---
# Walk every tensor of the multimodal checkpoint and overwrite the language
# tensors in place with the matching tensors from the text-only model.
# Tensors that exist only in the multimodal model (vision tower,
# cross-attention layers, extra image tokens in the embedding table) are left
# untouched. Validation failures raise (KeyError/RuntimeError) rather than
# assert, so they still fire under `python -O`.
for key_multimodal in state_dict_multimodal:

    # Vision-tower weights have no text counterpart.
    if "language_model" not in key_multimodal:
        continue
    # Cross-attention weights are unique to the multimodal model; keep them.
    if "cross_attn" in key_multimodal:
        continue

    # Names match the text-only model once the "language_model." prefix goes.
    key_text = key_multimodal.replace("language_model.", "")

    if "embed_tokens.weight" in key_multimodal:
        if key_text not in state_dict_text:
            raise KeyError(f"Key not found: {key_text}")
        # The multimodal vocabulary appends extra (image-related) tokens after
        # the shared text vocabulary; overwrite only the shared prefix rows so
        # those extra token embeddings survive the merge.
        extra_tokens = state_dict_multimodal[key_multimodal].shape[0] - state_dict_text[key_text].shape[0]
        state_dict_multimodal[key_multimodal][:state_dict_text[key_text].shape[0], :].copy_(state_dict_text[key_text])
        print(f"Replaced {key_multimodal} with {key_text} (preserving last {extra_tokens} tokens)")
        continue

    # Non-layer tensors (output head, final norm) map one-to-one by name.
    if "lm_head" in key_multimodal or "model.norm.weight" in key_multimodal:
        if key_text not in state_dict_text:
            raise KeyError(f"Key not found: {key_text}")
        state_dict_multimodal[key_multimodal].copy_(state_dict_text[key_text])
        print(f"Replaced {key_multimodal} with {key_text}")
        continue

    # Everything remaining must be a decoder-layer tensor.
    if ".layers." not in key_multimodal:
        raise RuntimeError(f"Unknown non-decoder key encountered: {key_multimodal}")
    layer_num_multimodal = int(key_multimodal.split(".layers.")[1].split(".")[0])

    # Defensive: skip any tensor living in a cross-attention slot that the
    # "cross_attn" substring filter above did not catch.
    if layer_num_multimodal in inserted_layers:
        continue

    if layer_num_multimodal not in layer_map:
        raise KeyError(f"Layer not found in layer_map: {layer_num_multimodal}")
    layer_num_text = layer_map[layer_num_multimodal]
    # Re-index the key to the text model's (denser) layer numbering.
    key_text = key_text.replace(f".layers.{layer_num_multimodal}.", f".layers.{layer_num_text}.")
    if key_text not in state_dict_text:
        raise KeyError(f"Key not found: {key_text}")
    state_dict_multimodal[key_multimodal].copy_(state_dict_text[key_text])
    print(f"Replaced {key_multimodal} with {key_text}")
|
|
|
print("Merged model successfully. Saving...")

# Re-apply the (in-place modified) state dict. The copy_() calls above already
# wrote through to the model's storage, but load_state_dict additionally
# validates that every key still matches the model before saving.
multimodal_model.load_state_dict(state_dict_multimodal)

os.makedirs(save_path, exist_ok=True)

# Save as sharded safetensors (8 GiB shards) together with the processor
# config so the merged checkpoint can be re-loaded with from_pretrained().
multimodal_model.save_pretrained(save_path, safe_serialization=True, max_shard_size="8192MB")

multimodal_processor.save_pretrained(save_path)

print(f"Model saved to {save_path}")