daresearch committed
Commit: b17f3a7 (verified) · Parent: 7948b59

Update app.py

Files changed (1): app.py  +9 -9

app.py CHANGED
@@ -109,7 +109,7 @@ model = FastLanguageModel.get_peft_model(
     target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                     "gate_proj", "up_proj", "down_proj"],
     lora_alpha=16,
-    lora_dropout=0, # Supports any, but = 0 is optimized
+    lora_dropout=0.05, # Supports any, but = 0 is optimized
     bias="none", # Supports any, but = "none" is optimized
     use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
     random_state=3407,
@@ -124,18 +124,18 @@ trainer = SFTTrainer(
     eval_dataset=valid_dataset,
     dataset_text_field="text",
     max_seq_length=max_seq_length,
-    dataset_num_proc=4, # Increase parallelism
+    dataset_num_proc=8, # Increase parallelism
     packing=True, # Enable sequence packing
     args=TrainingArguments(
-        per_device_train_batch_size=4, # Lower batch size to prevent memory issues
-        gradient_accumulation_steps=4, # Maintain effective batch size
+        per_device_train_batch_size=32, # Lower batch size to prevent memory issues
+        gradient_accumulation_steps=1, # Maintain effective batch size
         warmup_steps=5,
-        max_steps=702, # Train in smaller chunks
-        #num_train_epochs=1, # Test with fewer epochs
+        max_steps=-1, # Train in smaller chunks
+        num_train_epochs=3, # Test with fewer epochs
         learning_rate=2e-4,
-        fp16=False, # Disable mixed precision temporarily
-        bf16=False,
-        logging_steps=25, # Log less frequently
+        fp16=not is_bfloat16_supported(),
+        bf16=is_bfloat16_supported(),
+        logging_steps=10, # Log less frequently
         evaluation_strategy="steps",
         eval_steps=50, # Evaluate less frequently
         max_grad_norm=1.0, # Add gradient clipping
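
For context, after this commit the relevant section of app.py would read roughly as sketched below. This is a hedged reconstruction, not the author's full script: the base model name, the LoRA rank r, the max_seq_length value, output_dir, and the placeholder datasets are assumptions standing in for values defined elsewhere in app.py. The sketch's comments describe the new settings; several inline comments in the diff above were left over from the previous values (for example, "Lower batch size to prevent memory issues" now sits next to per_device_train_batch_size=32, and "Log less frequently" next to logging_steps=10). In effect the commit raises LoRA dropout to 0.05, doubles the dataset preprocessing workers from 4 to 8, replaces the fixed 702-step run at batch size 4 with 4 accumulation steps by 3 full epochs at batch size 32 with no accumulation, re-enables mixed precision by choosing bf16 or fp16 at runtime, and logs every 10 steps instead of every 25.

# Sketch of the post-commit setup; assumed values are marked as such in the comments.
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset

max_seq_length = 2048  # assumption; app.py defines its own value

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",  # assumption; app.py loads its own base model
    max_seq_length=max_seq_length,
    dtype=None,          # auto-detect
    load_in_4bit=True,   # assumption; common Unsloth default
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # assumption; the LoRA rank is set above the diffed lines
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.05,   # raised from 0 in this commit (0 is the Unsloth-optimized fast path)
    bias="none",         # "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # "unsloth" variant for very long context
    random_state=3407,
)

# Placeholders; app.py builds the real train/validation datasets earlier in the script.
train_dataset = Dataset.from_dict({"text": ["example training text"]})
valid_dataset = Dataset.from_dict({"text": ["example validation text"]})

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=8,            # preprocessing workers raised from 4
    packing=True,                  # sequence packing
    args=TrainingArguments(
        output_dir="outputs",      # assumption; not shown in the diff
        per_device_train_batch_size=32,   # raised from 4; effective batch size is now 32
        gradient_accumulation_steps=1,    # accumulation dropped from 4
        warmup_steps=5,
        max_steps=-1,              # no fixed step cap (was 702); epochs control run length
        num_train_epochs=3,        # was commented out at 1
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),  # fall back to fp16 where bf16 is unavailable
        bf16=is_bfloat16_supported(),      # prefer bf16 on supported GPUs
        logging_steps=10,          # was 25
        evaluation_strategy="steps",
        eval_steps=50,             # evaluate every 50 steps
        max_grad_norm=1.0,         # gradient clipping
    ),
)

trainer.train()

Note on the batching change: the effective batch size goes from 16 (4 per device x 4 accumulation steps) to 32 (32 x 1), and with max_steps=-1 the run length is governed entirely by num_train_epochs=3.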