"""Fine-tune DistilBERT for binary sentiment classification and publish it.

Loads the ``louiecerv/sentiment_analysis`` dataset, tokenizes its ``text``
column, trains a two-label sequence-classification head for three epochs on
the ``train`` split, and pushes the resulting model and tokenizer to the
Hugging Face Hub.
"""

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load the dataset
dataset = load_dataset("louiecerv/sentiment_analysis")

# Load tokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Used with ``dataset.map(..., batched=True)``, so ``examples["text"]`` is
    a batch of texts rather than a single string.

    Args:
        examples: Mapping-like batch that exposes a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch, padded with the ``"max_length"``
        strategy and truncated.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Prepare dataset for training
train_dataset = tokenized_datasets["train"]

# Load model with a two-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

# Hub repository that receives both the model and the tokenizer.
hub_model_id = "louiecerv/sentiment_analysis_model"

# Training arguments: no evaluation during training, checkpoint every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id=hub_model_id,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train and save model
trainer.train()
trainer.push_to_hub()

# The Trainer was not given the tokenizer, so the upload above does not
# include tokenizer files. Push them explicitly so the Hub repository can be
# loaded on its own (e.g. via AutoTokenizer.from_pretrained).
tokenizer.push_to_hub(hub_model_id)