In [None]:
# Run this first to install all the packages needed in this colab. This can take several minutes.
%%capture
!pip install huggingface_hub accelerate transformers datasets

# Train a simple sentiment classifier on Hugging Face
This colab walks you through the simplest example of fine-tuning an LLM on Hugging Face. This is part of www.huggingfacetutorial.com.

We start with a 33M parameter pretrained LLM, [MiniLM](https://huggingface.co/microsoft/MiniLM-L12-H384-uncased). We fine-tune the pretrained model with the [IMDB dataset](https://huggingface.co/datasets/imdb), which consists of review comments from a movie review website, each tied to a positive or negative sentiment.

The fine-tuned model is a sentiment classifier. You input any text, and it will output the percentage positive and percentage negative.

Be warned: the model this colab creates is not very good at its job. But it's a start. This colab has been stripped down to the bare bones of what is needed to fine-tune a model on Hugging Face using the Transformers library.

You can run this on a regular free Python 3 Google Compute Engine backend. You do not need a GPU or TPU. For those new to colabs and AI modeling: training models usually requires lots of compute power. www.huggingfacetutorial.com tries to teach you the skills and concepts without having to pay for expensive GPU time.

In [None]:
#@title [Prework] Login with Hugging Face so that you can upload your trained model to Hugging Face Hub.
# Opens the Hugging Face login widget in the cell output. Logging in is needed so that
# `trainer.push_to_hub()` in the last step can upload the trained model.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co./front/assets/huggingface_logo-noborder.sv…

In [None]:
#@title 1) Load datasets
# `load_dataset` fetches the dataset named in the form field below (downloading it when needed).
from datasets import load_dataset
dataset_name = 'imdb' #@param {type:'string'}
# The result is indexed by split name in the next step (e.g. 'train', 'test').
full_datasets = load_dataset(dataset_name)

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
#@title 2) Split dataset into training and eval sets
train_set_name = 'train' #@param {type:'string'}
eval_set_name = 'test' #@param {type:'string'}
train_set_sample_size = 1000 #@param {type:'integer'}
eval_set_sample_size = 1000 #@param {type:'integer'}


def _take_shuffled_sample(split_name, sample_size):
    """Return the first `sample_size` rows of the named split after a seeded shuffle.

    The shuffle uses a fixed seed (42), so the same rows are picked on every run.
    """
    shuffled_split = full_datasets[split_name].shuffle(seed=42)
    return shuffled_split.select(range(sample_size))


# Small subsets keep training and evaluation short.
small_train_dataset = _take_shuffled_sample(train_set_name, train_set_sample_size)
small_eval_dataset = _take_shuffled_sample(eval_set_name, eval_set_sample_size)

In [None]:
#@title 3) Get model and tokenizer
pretrained_model = 'microsoft/MiniLM-L12-H384-uncased' #@param {type:'string'}
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Give the two output classes readable names so the fine-tuned model reports
# NEGATIVE / POSITIVE instead of the default LABEL_0 / LABEL_1.
# The mapping follows the IMDB dataset card (0 = negative review, 1 = positive review);
# adjust it if you change the dataset.
id2label = {0: 'NEGATIVE', 1: 'POSITIVE'}
label2id = {label: idx for idx, label in id2label.items()}

# The classification head on top of the pretrained encoder is newly initialized,
# which is why the library warns that the model still has to be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
#@title 4) Tokenize datasets
def tokenize(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every review is truncated or padded to exactly 512 tokens so that all
    rows end up with the same length.
    """
    review_texts = examples['text']
    return tokenizer(
        review_texts,
        truncation=True,
        padding='max_length',
        max_length=512,
    )


# Run the tokenizer batch-wise over the training sample first, then the eval sample.
tokenized_train_dataset, tokenized_eval_dataset = (
    sampled_split.map(tokenize, batched=True)
    for sampled_split in (small_train_dataset, small_eval_dataset)
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
#@title 5) Prepare Trainer
import inspect
from transformers import TrainingArguments
output_dir='sentiment-guesser' #@param {type:'string'}

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy` and
# later dropped the old name, which then raises a TypeError. Because the install
# cell does not pin a version, pick whichever keyword the installed release accepts.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy')

# Train for a single epoch and run an evaluation pass at the end of each epoch.
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=1,
                                  **{_eval_strategy_kwarg: 'epoch'})

In [None]:
#@title 6) Create Trainer
import inspect
from transformers import Trainer

# Newer transformers releases deprecate the `tokenizer` argument of Trainer in favour of
# `processing_class`. Because the install cell does not pin a version, pass the tokenizer
# under whichever keyword the installed release accepts.
_tokenizer_kwarg = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer')

# Bundle the model, the training settings and the tokenized train/eval samples.
# Passing the tokenizer lets the Trainer keep it together with the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    **{_tokenizer_kwarg: tokenizer})

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
#@title 7) Train and save to Hugging Face
# Fine-tune the model using the settings in `training_args` (one epoch, with an
# evaluation pass at the end of the epoch).
trainer.train()
# Upload the result to the Hugging Face Hub; this relies on the login from the prework cell.
trainer.push_to_hub()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 