File size: 447 Bytes
9cd4306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Flat mapping of training hyperparameters. The teacher_name_or_path and
# temperature keys suggest a knowledge-distillation run (teacher -> smaller
# student) -- NOTE(review): inferred from key names only; confirm against the
# script that wrote or reads this file.

# Adam moment coefficients.
adam_beta1: 0.9
adam_beta2: 0.999
# NOTE(review): if global_batch_size = per_device_train_batch_size
# * gradient_accumulation_steps * num_devices, then 64 = 8 * 1 * 8, i.e. an
# 8-device run -- the device count is not recorded here; confirm.
global_batch_size: 64
gradient_accumulation_steps: 1
learning_rate: 0.0001
# The value below carries a Python-specific YAML tag: on load it calls
# transformers.trainer_utils.SchedulerType with the single argument "linear".
# PyYAML's safe loader (yaml.safe_load) rejects python/object/apply tags, so
# this file can only be parsed by a loader that permits Python object
# construction, with `transformers` importable. Only do that for trusted files.
lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
- linear
# NOTE(review): max_steps and num_train_epochs (below) are both set; which one
# actually bounds training depends on the consuming script -- confirm.
max_steps: 200000
mixed_precision: bf16
# NOTE(review): presumably the student checkpoint (Hub-style "org/name" id) --
# confirm.
model_name_or_path: sanchit-gandhi/Mistral-1.5B-v0.1
num_train_epochs: 3.0
per_device_train_batch_size: 8
# NOTE(review): presumably the teacher checkpoint used for distillation --
# confirm.
teacher_name_or_path: mistralai/Mistral-7B-v0.1
# NOTE(review): presumably the softmax temperature applied to logits in the
# distillation loss -- confirm against the training script.
temperature: 2.0
warmup_steps: 500
weight_decay: 0.0