Ko-Luxia-8B-it-v0.3

์–ด๋Š์ •๋„ ์ด์ œ๋Š” ๋ง์„ ์กฐ๊ธˆ ์•Œ์•„ ๋“ฃ๋Š”๋‹ค.

The settings below are based on 8-bit training.

{ "_name_or_path": "saltlux/Ko-Llama3-Luxia-8B", "architectures": [ "LlamaForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 128000, "eos_token_id": 128001, "hidden_act": "silu", "hidden_size": 4096, "initializer_range": 0.02, "intermediate_size": 14336, "max_position_embeddings": 8192, "model_type": "llama", "num_attention_heads": 32, "num_hidden_layers": 32, "num_key_value_heads": 8, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 500000.0, "tie_word_embeddings": false, "torch_dtype": "float16", "transformers_version": "4.39.1", "use_cache": false, "vocab_size": 145792 }

    batch_size = 16
    num_epochs = 1
    micro_batch = 1
    gradient_accumulation_steps = batch_size
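
In other words, each optimizer step accumulates gradients over 16 micro-batches of size 1; a quick sketch of the relationship:

```python
batch_size = 16   # target effective batch size per optimizer step
micro_batch = 1   # per-device batch size that actually fits in memory
gradient_accumulation_steps = batch_size // micro_batch   # 16 // 1 = 16

# Effective batch size = micro_batch * gradient_accumulation_steps = 1 * 16 = 16
```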

ํ›ˆ๋ จ ๋ฐฉ๋ฒ•์— ๋Œ€ํ•œ ํ•˜์ดํผ ํŒŒ๋ผ๋ฏธํ„ฐ

    cutoff_len = model.config.hidden_size
    lr_scheduler = 'cosine'
    warmup_ratio = 0.06
    # warmup_steps = 100
    learning_rate = 2e-4
    optimizer = 'adamw_torch'
    weight_decay = 0.01
    max_grad_norm = 0.8  # if the model seems to be overfitting or training in a strange direction, try lowering this value
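
A sketch of how these values could map onto transformers' TrainingArguments; the fp16 flag and the logging/saving options are assumptions not stated in this card (output_dir and the batch settings come from the other sections).

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./custom_LLM",
    num_train_epochs=1,
    per_device_train_batch_size=1,    # micro_batch
    gradient_accumulation_steps=16,   # batch_size
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.06,
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=0.8,      # lower this if training looks unstable or overfit
    fp16=True,              # assumption: matches the float16 base weights
    logging_steps=10,       # assumption
    save_strategy="steps",  # assumption
)
```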

LoRA config

    lora_r = 16
    lora_alpha = 16
    lora_dropout = 0.05
    lora_target_modules = ["gate_proj", "down_proj", "up_proj"]
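
A sketch of the corresponding peft LoraConfig; note that only the MLP projections (gate/down/up) are adapted, not the attention projections. The bias and task_type values are the usual defaults, assumed here.

```python
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["gate_proj", "down_proj", "up_proj"],  # MLP projections only
    bias="none",            # assumption: common default
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```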

Options for the inputs produced by the tokenizer

    train_on_inputs = False
    add_eos_token = True
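
A hypothetical tokenize helper showing what these two flags usually mean in practice (an alpaca-lora-style setup is assumed; the actual tokenization code is not part of this card). cutoff_len comes from the hyperparameter section above.

```python
cutoff_len = 4096  # model.config.hidden_size, used here as the max sequence length

def tokenize(prompt: str, response: str) -> dict:
    """Hypothetical helper: tokenize prompt+response, append EOS, mask prompt tokens."""
    result = tokenizer(
        prompt + response,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
    )
    # add_eos_token = True: make sure the sequence ends with EOS if there is room.
    if result["input_ids"][-1] != tokenizer.eos_token_id and len(result["input_ids"]) < cutoff_len:
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    labels = list(result["input_ids"])
    # train_on_inputs = False: compute loss only on the response tokens,
    # so the prompt portion of the labels is masked with -100.
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=cutoff_len)["input_ids"])
    labels[:prompt_len] = [-100] * prompt_len
    result["labels"] = labels
    return result
```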

    val_data = None

Others

    resume_from_checkpoint = False  # !! To resume training from a checkpoint, set this to a path such as './custom_LLM/checkpoint-[xxx]' !!
    output_dir = './custom_LLM'
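
For completeness, a sketch of how these settings could feed into a Trainer run; the Trainer and data-collator choices are assumptions, not something stated in this card.

```python
from transformers import Trainer, DataCollatorForSeq2Seq

resume_from_checkpoint = False   # set to a path such as './custom_LLM/checkpoint-[xxx]' to continue a run
output_dir = "./custom_LLM"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # assumption: a dataset already mapped through tokenize() above
    data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True, label_pad_token_id=-100),
)
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
trainer.save_model(output_dir)
```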