CUDA assertion error when trying to train
I'm getting a long list of CUDA assertion errors when trying to train this model. Here is an example of one of the errors:
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [197,0,0], thread: [81,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
Any idea what could be causing this error?
I'm using the Hugging Face Accelerate API to launch the training process.
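For context, I launch it with something along these lines (train.py is just a placeholder name for the script below):

accelerate launch train.py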
Here is the training script:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import TrainerCallback
import torch
import transformers
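# Load the tokenizer for the model and add a dedicated [PAD] token for padding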
model = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Load dataset from the Hugging Face datasets library
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
# Tokenize the texts
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Load the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
# Load the model
model = AutoModelForCausalLM.from_pretrained(model)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
    bf16=True,
    report_to='none'
)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)
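# Run training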
trainer.train()