daresearch committed
Commit: b17f3a7 (verified) · Parent: 7948b59

Update app.py

Files changed (1): app.py  +9 -9

app.py CHANGED
@@ -109,7 +109,7 @@ model = FastLanguageModel.get_peft_model(
     target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                     "gate_proj", "up_proj", "down_proj"],
     lora_alpha=16,
-    lora_dropout=0, # Supports any, but = 0 is optimized
+    lora_dropout=0.05, # Supports any, but = 0 is optimized
     bias="none", # Supports any, but = "none" is optimized
     use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
     random_state=3407,
@@ -124,18 +124,18 @@ trainer = SFTTrainer(
     eval_dataset=valid_dataset,
     dataset_text_field="text",
     max_seq_length=max_seq_length,
-    dataset_num_proc=4, # Increase parallelism
+    dataset_num_proc=8, # Increase parallelism
     packing=True, # Enable sequence packing
     args=TrainingArguments(
-        per_device_train_batch_size=4, # Lower batch size to prevent memory issues
-        gradient_accumulation_steps=4, # Maintain effective batch size
+        per_device_train_batch_size=32, # Lower batch size to prevent memory issues
+        gradient_accumulation_steps=1, # Maintain effective batch size
         warmup_steps=5,
-        max_steps=702, # Train in smaller chunks
-        #num_train_epochs=1, # Test with fewer epochs
+        max_steps=-1, # Train in smaller chunks
+        num_train_epochs=3, # Test with fewer epochs
         learning_rate=2e-4,
-        fp16=False, # Disable mixed precision temporarily
-        bf16=False,
-        logging_steps=25, # Log less frequently
+        fp16=not is_bfloat16_supported(),
+        bf16=is_bfloat16_supported(),
+        logging_steps=10, # Log less frequently
         evaluation_strategy="steps",
         eval_steps=50, # Evaluate less frequently
         max_grad_norm=1.0, # Add gradient clipping
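
For context, after this commit the relevant section of app.py would read roughly as sketched below. This is a hedged reconstruction, not the author's full script: the base model name, the LoRA rank r, the max_seq_length value, output_dir, and the placeholder datasets are assumptions standing in for values defined elsewhere in app.py. The sketch's comments describe the new settings; several inline comments in the diff above were left over from the previous values (for example, "Lower batch size to prevent memory issues" now sits next to per_device_train_batch_size=32, and "Log less frequently" next to logging_steps=10). In effect the commit raises LoRA dropout to 0.05, doubles the dataset preprocessing workers from 4 to 8, replaces the fixed 702-step run at batch size 4 with 4 accumulation steps by 3 full epochs at batch size 32 with no accumulation, re-enables mixed precision by choosing bf16 or fp16 at runtime, and logs every 10 steps instead of every 25.

# Sketch of the post-commit setup; assumed values are marked as such in the comments.
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset

max_seq_length = 2048  # assumption; app.py defines its own value

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",  # assumption; app.py loads its own base model
    max_seq_length=max_seq_length,
    dtype=None,          # auto-detect
    load_in_4bit=True,   # assumption; common Unsloth default
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # assumption; the LoRA rank is set above the diffed lines
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.05,   # raised from 0 in this commit (0 is the Unsloth-optimized fast path)
    bias="none",         # "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # "unsloth" variant for very long context
    random_state=3407,
)

# Placeholders; app.py builds the real train/validation datasets earlier in the script.
train_dataset = Dataset.from_dict({"text": ["example training text"]})
valid_dataset = Dataset.from_dict({"text": ["example validation text"]})

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=8,            # preprocessing workers raised from 4
    packing=True,                  # sequence packing
    args=TrainingArguments(
        output_dir="outputs",      # assumption; not shown in the diff
        per_device_train_batch_size=32,   # raised from 4; effective batch size is now 32
        gradient_accumulation_steps=1,    # accumulation dropped from 4
        warmup_steps=5,
        max_steps=-1,              # no fixed step cap (was 702); epochs control run length
        num_train_epochs=3,        # was commented out at 1
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),  # fall back to fp16 where bf16 is unavailable
        bf16=is_bfloat16_supported(),      # prefer bf16 on supported GPUs
        logging_steps=10,          # was 25
        evaluation_strategy="steps",
        eval_steps=50,             # evaluate every 50 steps
        max_grad_norm=1.0,         # gradient clipping
    ),
)

trainer.train()

Note on the batching change: the effective batch size goes from 16 (4 per device x 4 accumulation steps) to 32 (32 x 1), and with max_steps=-1 the run length is governed entirely by num_train_epochs=3.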