From this point on I'll add a bit of commentary of my own as we go.
Based on 8-bit training
{ "_name_or_path": "saltlux/Ko-Llama3-Luxia-8B", "architectures": [ "LlamaForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 128000, "eos_token_id": 128001, "hidden_act": "silu", "hidden_size": 4096, "initializer_range": 0.02, "intermediate_size": 14336, "max_position_embeddings": 8192, "model_type": "llama", "num_attention_heads": 32, "num_hidden_layers": 32, "num_key_value_heads": 8, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 500000.0, "tie_word_embeddings": false, "torch_dtype": "float16", "transformers_version": "4.39.1", "use_cache": false, "vocab_size": 145792 }
batch_size = 16
num_epochs = 1
micro_batch = 1
gradient_accumulation_steps = batch_size
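With micro_batch = 1 and gradient_accumulation_steps = 16, each optimizer step still covers an effective batch of 1 × 16 = 16 examples; accumulation only reduces how many samples have to fit in memory per forward pass.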
Hyperparameters for the training procedure
cutoff_len = model.config.hidden_size  # 4096 for this model
lr_scheduler = 'cosine'
warmup_ratio = 0.06
# warmup_steps = 100
learning_rate = 2e-4
optimizer = 'adamw_torch'
weight_decay = 0.01
max_grad_norm = 0.8  # if the model seems to overfit or train in a strange direction, try lowering this value
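To make the mapping concrete, this is roughly how these values, together with the batch settings above, would be handed to transformers.TrainingArguments. The extra arguments (fp16, logging_steps, save_strategy) are assumptions of mine, not part of the original setup.

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./custom_LLM",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=micro_batch,
    gradient_accumulation_steps=gradient_accumulation_steps,  # effective batch of 16
    learning_rate=learning_rate,          # 2e-4
    lr_scheduler_type=lr_scheduler,       # 'cosine'
    warmup_ratio=warmup_ratio,            # 0.06
    optim=optimizer,                      # 'adamw_torch'
    weight_decay=weight_decay,            # 0.01
    max_grad_norm=max_grad_norm,          # clip gradients at 0.8
    fp16=True,                            # assumption: mixed precision to match the fp16 base weights
    logging_steps=10,
    save_strategy="epoch",
)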
LoRA config
lora_r = 16
lora_alpha = 16
lora_dropout = 0.05
lora_target_modules = ["gate_proj", "down_proj", "up_proj"]
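A minimal sketch of how these values plug into peft, assuming the 8-bit model from the earlier sketch; bias="none" and task_type="CAUSAL_LM" are the usual choices for this kind of run rather than something stated above. Note that the target modules cover only the Llama MLP projections, so the attention projections (q/k/v/o) get no adapters.

from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,  # MLP projections only
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # sanity check: only LoRA adapter weights should be trainable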
Options for the input values produced by the tokenizer
train_on_inputs = False
add_eos_token = True
val_data = None
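Roughly, these flags are applied at tokenization time as sketched below (my reconstruction of the common pattern, not the original function): with train_on_inputs = False the prompt tokens are masked out of the loss with -100, add_eos_token = True appends an EOS token when there is room, and val_data = None simply means no validation split is tokenized.

def tokenize(prompt, response):
    # Tokenize prompt + response together, truncated to cutoff_len (4096 here).
    ids = tokenizer(prompt + response, truncation=True, max_length=cutoff_len)["input_ids"]
    if add_eos_token and len(ids) < cutoff_len and ids[-1] != tokenizer.eos_token_id:
        ids.append(tokenizer.eos_token_id)

    labels = list(ids)
    if not train_on_inputs:
        # Mask the prompt part so the loss is computed only on the response tokens.
        prompt_len = len(tokenizer(prompt, truncation=True, max_length=cutoff_len)["input_ids"])
        labels = [-100] * min(prompt_len, len(labels)) + labels[prompt_len:]

    return {"input_ids": ids, "attention_mask": [1] * len(ids), "labels": labels}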
Others
resume_from_checkpoint = False  # !! To continue training from a previous run, set this to a checkpoint path such as './custom_LLM/checkpoint-[xxx]'!
output_dir = './custom_LLM'
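Finally, a sketch of how these two values are typically consumed; Trainer, DataCollatorForSeq2Seq, and the train_data variable here are my assumptions, not shown above.

from transformers import Trainer, DataCollatorForSeq2Seq

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,  # None, so no evaluation loop runs
    data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True, return_tensors="pt"),
)

# resume_from_checkpoint = False starts a fresh run; a path like './custom_LLM/checkpoint-[xxx]'
# makes the Trainer pick up model/optimizer/scheduler state from that checkpoint.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
model.save_pretrained(output_dir)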