# sanchit-gandhi's picture
# Saving train state of step 5000
# 9cd4306 verified
# raw
# history blame
# 447 Bytes
# Hyperparameter snapshot: a flat mapping of scalar settings.
# NOTE(review): the meaning of each key is inferred from its name only;
# confirm against the script that produced this file.
adam_beta1: 0.9
adam_beta2: 0.999
# NOTE(review): presumably per_device_train_batch_size (8) x device count x
# gradient_accumulation_steps (1); 64 would then imply 8 devices - confirm.
global_batch_size: 64
gradient_accumulation_steps: 1
learning_rate: 0.0001
# Stored as a plain string rather than a Python-specific object tag
# (transformers.trainer_utils.SchedulerType applied to [linear]) so the file
# can be read by safe loaders, which reject language-specific tags.
# NOTE(review): assumes the consumer accepts the scheduler name as a string.
lr_scheduler_type: linear
# NOTE(review): both max_steps and num_train_epochs are set; which of the two
# actually bounds training is decided by the consumer - confirm.
max_steps: 200000
mixed_precision: bf16
model_name_or_path: sanchit-gandhi/Mistral-1.5B-v0.1
num_train_epochs: 3.0
per_device_train_batch_size: 8
teacher_name_or_path: mistralai/Mistral-7B-v0.1
temperature: 2.0
warmup_steps: 500
weight_decay: 0.0