LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct

์ž‘์—…ํ•ด์ฃผ์‹  maywell/EXAONE-3.0-7.8B-Instruct-Llamafied์„ ์ฐธ๊ณ ํ•ด์„œ ๋ณ€๊ฒฝํ–ˆ์Šต๋‹ˆ๋‹ค. GPU ์ž์›์ด ์—†์œผ์‹œ๋ฉด ์‚ฌ์šฉํ•˜์‹œ๋ฉด ๋ฉ๋‹ˆ๋‹ค.

์˜ฌ๋ผ๊ฐ„ ๋ชจ๋ธ์€ 8K ์ปจํ…์ŠคํŠธ๊นŒ์ง€ ์ง€์›ํ•˜๋„๋ก ์„ค์ •์„ ๋ณ€๊ฒฝํ•˜์˜€์Šต๋‹ˆ๋‹ค. (์„ฑ๋Šฅ ๋ฏธํ™•์ธ)

import torch
import gc

from transformers import LlamaConfig, LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

def unload_model(model):
    """Drop the local reference to ``model`` and run memory clean-up.

    Runs a full garbage-collection pass and, when CUDA is available, empties
    the CUDA caching allocator.

    Note that ``del model`` only unbinds this function's own parameter name.
    Any reference the caller still holds keeps the object alive, so callers
    must release their own reference as well for the model to be reclaimed.

    Args:
        model: The object whose local reference should be discarded.
    """
    del model
    gc.collect()

    # Nothing more to do on machines without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def create_llama_config(exaone_config):
    """Translate an EXAONE config object into an equivalent ``LlamaConfig``.

    Most fields are copied under the same name. Two are renamed on the way:
    ``num_layers`` becomes ``num_hidden_layers`` and ``layer_norm_epsilon``
    becomes ``rms_norm_eps``. ``attention_bias`` is always set to ``False``.

    Args:
        exaone_config: Config of the source EXAONE model; the attributes read
            below must be present on it.

    Returns:
        A ``LlamaConfig`` populated from ``exaone_config``.
    """
    src = exaone_config
    llama_kwargs = {
        # Model dimensions
        "vocab_size": src.vocab_size,
        "hidden_size": src.hidden_size,
        "intermediate_size": src.intermediate_size,
        "num_hidden_layers": src.num_layers,
        "num_attention_heads": src.num_attention_heads,
        "max_position_embeddings": src.max_position_embeddings,
        # Normalisation / attention details
        "rms_norm_eps": src.layer_norm_epsilon,
        "num_key_value_heads": src.num_key_value_heads,
        "rope_theta": src.rope_theta,
        # Special token ids
        "bos_token_id": src.bos_token_id,
        "eos_token_id": src.eos_token_id,
        "pad_token_id": src.pad_token_id,
        "attention_bias": False,
    }
    return LlamaConfig(**llama_kwargs)

def copy_embedding_weights(llama_model, exaone_model):
    """Install the EXAONE token-embedding matrix into the Llama model.

    Reads ``exaone_model.transformer.wte.weight``, casts it to float16 and
    assigns the result as the data of ``llama_model.model.embed_tokens.weight``.

    Args:
        llama_model: Destination model (modified in place).
        exaone_model: Source model providing the embedding weights.
    """
    source_embedding = exaone_model.transformer.wte.weight.data
    llama_model.model.embed_tokens.weight.data = source_embedding.to(torch.float16)

def copy_layer_weights(llama_layer, exaone_layer):
    """Copy one decoder layer's weights from EXAONE naming to Llama naming.

    For every (Llama sub-module, EXAONE sub-module) pair listed below, the
    EXAONE ``weight`` tensor is cast to float16 and assigned as the data of
    the corresponding Llama ``weight``.

    Args:
        llama_layer: Destination Llama decoder layer (modified in place).
        exaone_layer: Source EXAONE decoder layer.
    """
    # (Llama sub-module path, EXAONE sub-module path), in copy order.
    weight_map = (
        # Self-attention
        ("self_attn.q_proj", "attn.attention.q_proj"),
        ("self_attn.k_proj", "attn.attention.k_proj"),
        ("self_attn.v_proj", "attn.attention.v_proj"),
        ("self_attn.o_proj", "attn.attention.out_proj"),
        # MLP
        ("mlp.gate_proj", "mlp.c_fc_0"),
        ("mlp.up_proj", "mlp.c_fc_1"),
        ("mlp.down_proj", "mlp.c_proj"),
        # Layer norms
        ("input_layernorm", "ln_1"),
        ("post_attention_layernorm", "ln_2"),
    )

    for llama_path, exaone_path in weight_map:
        # Resolve and convert the source first, then look up the target.
        source_module = exaone_layer
        for attr in exaone_path.split("."):
            source_module = getattr(source_module, attr)
        converted = source_module.weight.data.to(torch.float16)

        target_module = llama_layer
        for attr in llama_path.split("."):
            target_module = getattr(target_module, attr)
        target_module.weight.data = converted

def copy_final_weights(llama_model, exaone_model):
    """Copy the final norm and LM-head weights from EXAONE to Llama.

    ``exaone_model.transformer.ln_f.weight`` goes to
    ``llama_model.model.norm.weight`` and ``exaone_model.lm_head.weight`` goes
    to ``llama_model.lm_head.weight``; both are cast to float16.

    Args:
        llama_model: Destination model (modified in place).
        exaone_model: Source model.
    """
    half = torch.float16

    final_norm = exaone_model.transformer.ln_f.weight.data.to(half)
    llama_model.model.norm.weight.data = final_norm

    output_head = exaone_model.lm_head.weight.data.to(half)
    llama_model.lm_head.weight.data = output_head

def port_exaone_to_llama(exaone_model_path, llama_model_path):
    """Convert an EXAONE checkpoint to Llama format and save it to disk.

    Loads the EXAONE model (float16, on CPU) and its tokenizer, builds a
    ``LlamaForCausalLM`` from the translated config, copies the embeddings,
    every decoder layer, the final norm and the LM head across, then writes
    the model (safetensors, 1GB shards) and the tokenizer to
    ``llama_model_path``.

    Args:
        exaone_model_path: Identifier or path handed to ``from_pretrained``
            for both the EXAONE model and its tokenizer.
        llama_model_path: Directory the converted model and tokenizer are
            saved to.

    Note:
        The source is loaded with ``trust_remote_code=True``, which runs the
        modelling code shipped with that checkpoint. Only point this at a
        repository you trust.
    """
    print("Loading EXAONE model and tokenizer...")
    exaone_model = AutoModelForCausalLM.from_pretrained(
        exaone_model_path,
        torch_dtype=torch.float16,
        device_map="cpu",
        trust_remote_code=True,
    )
    exaone_tokenizer = AutoTokenizer.from_pretrained(exaone_model_path, trust_remote_code=True)
    exaone_config = exaone_model.config

    print("Creating Llama configuration...")
    llama_config = create_llama_config(exaone_config)

    print("Initializing Llama model...")
    llama_model = LlamaForCausalLM(llama_config)
    llama_model.to(torch.float16)
    llama_model.to('cpu')

    print("Copying weights...")
    with torch.no_grad():
        copy_embedding_weights(llama_model, exaone_model)

        for i in tqdm(range(exaone_config.num_layers), desc="Copying layers"):
            copy_layer_weights(llama_model.model.layers[i], exaone_model.transformer.h[i])
            if i % 10 == 0:  # Garbage collection every 10 layers
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        copy_final_weights(llama_model, exaone_model)

    print("Unloading EXAONE model to free memory...")
    unload_model(exaone_model)
    # unload_model() can only drop its own parameter binding. While this frame
    # still names the model nothing is released, so unbind it here and collect
    # again. NOTE(review): if the source weights were already float16, the
    # Llama parameters presumably share their storage (``Tensor.to`` returns
    # the same tensor when no conversion is needed), so this reclaims the
    # EXAONE module objects rather than the weight data itself.
    del exaone_model
    gc.collect()

    print(f"Saving ported Llama model and tokenizer to {llama_model_path}")
    llama_model.save_pretrained(llama_model_path, safe_serialization=True, max_shard_size="1GB")
    exaone_tokenizer.save_pretrained(llama_model_path)

    print("Unloading Llama model...")
    unload_model(llama_model)
    # Same reasoning as above: release this frame's reference explicitly.
    del llama_model
    gc.collect()

    print(f"EXAONE model successfully ported to Llama format and saved at {llama_model_path}")

if __name__ == "__main__":
    # Script entry point: convert the published EXAONE instruct checkpoint
    # and write the Llama-format result to a local directory.
    exaone_model_path = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
    llama_model_path = "./exa_llamafied"
    port_exaone_to_llama(
        exaone_model_path=exaone_model_path,
        llama_model_path=llama_model_path,
    )

๋ชจ๋ธ์„ ๊ณต๊ฐœํ•ด์ฃผ์‹  LG AI Research๋ถ„๋“ค๊ป˜ ๊ฐ์‚ฌ์˜ ๋ง์”€ ๋“œ๋ฆฝ๋‹ˆ๋‹ค.

Downloads last month
4
Safetensors
Model size
7.82B params
Tensor type
F32
·
Inference API
Unable to determine this model's library. Check the docs.

Model tree for CarrotAI/EXAONE-3.0-7.8B-Instruct-Llamafied-8k

Quantizations
1 model