In [1]:
%%capture

!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install huggingface_hub

In [2]:
%%capture

import torch
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import pipeline
from datasets import load_dataset
import nltk
from transformers import Trainer
from transformers import TrainingArguments
nltk.download('punkt')
##others
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["WANDB_DISABLED"] = "true"
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data_path= '/content/drive/MyDrive/deep-learning/capstone_data.csv'

In [5]:
data= pd.read_csv(data_path)

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,clean_review
0,0,1,recently shown cable tv movie opens disclaimer...
1,1,1,i surprised film i touched lives paulie touche...
2,2,-1,now im one watch movies got poor reviews say h...
3,3,1,this film came twelve years years ago revelati...
4,4,1,when orphanage manager goes vacation father ta...


In [7]:
##checking for missing values

data.isna().sum()

Unnamed: 0      0
sentiment       0
clean_review    0
dtype: int64

In [8]:
##dropping the unneccessary column

data= data.drop(labels= "Unnamed: 0", axis=1)

In [9]:
data.head()

Unnamed: 0,sentiment,clean_review
0,1,recently shown cable tv movie opens disclaimer...
1,1,i surprised film i touched lives paulie touche...
2,-1,now im one watch movies got poor reviews say h...
3,1,this film came twelve years years ago revelati...
4,1,when orphanage manager goes vacation father ta...


In [10]:
data= data.rename(columns={'sentiment': 'label'})

## Data Splitting

In [11]:
train, eval = train_test_split(data, test_size= 0.2, random_state= 50)

In [12]:
train.shape

(20000, 2)

In [13]:
eval.shape

(5000, 2)

## Loading Data With Load Dataset

In [14]:
train.to_csv("/content/train_set.csv")
eval.to_csv("/content/eval_set.csv")

In [15]:
dataset= load_dataset("csv", data_files={"train_set":"train_set.csv", "eval_set":"eval_set.csv" }, encoding= "ISO-8859-1")

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train_set split: 0 examples [00:00, ? examples/s]

Generating eval_set split: 0 examples [00:00, ? examples/s]

In [16]:
dataset

DatasetDict({
    train_set: Dataset({
        features: ['Unnamed: 0', 'label', 'clean_review'],
        num_rows: 20000
    })
    eval_set: Dataset({
        features: ['Unnamed: 0', 'label', 'clean_review'],
        num_rows: 5000
    })
})

In [17]:
##getting my tokenizer

tokenizer= AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")

Downloading (…)lve/main/config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [18]:
## our labels are-1, and 1 and we will like to transform them into 0,1, respectively

def transform_labels(input):
  label= input["label"]
  num =0

  if label== -1:
    num= 0  ##for negative sentiment
  elif label== 1:
    num =1 ##for positive sentiment
  return {"labels": num}

def tokenize(example):
  return tokenizer(example["clean_review"], padding= True, truncation=True,  max_length=512, return_tensors= "pt")

In [19]:
dataset= dataset.map(tokenize, batched= True)
remove_columns=  ['Unnamed: 0', 'label', 'clean_review']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [20]:
dataset

DatasetDict({
    train_set: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    eval_set: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [39]:
###defining my compute metric for the modelling

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  accuracy = accuracy_score(labels, preds)
  return {"accuracy": accuracy}

In [58]:
##determining my batch size for the training

batch_size= 64

In [59]:
##instantiating my training metrics.

logging_step= len(dataset['train_set'])// batch_size
training_args = TrainingArguments(
    output_dir= "Capstone_TinyBert",
    num_train_epochs=5,
    load_best_model_at_end=True,
    learning_rate = 2e-5,
    evaluation_strategy="epoch",
    per_device_train_batch_size= batch_size,
    per_device_eval_batch_size= batch_size,
    save_strategy="epoch",
    push_to_hub=True,
    logging_steps = logging_step,
    weight_decay=0.01  # Adding weight decay to handle overfitting
)

In [60]:

model= AutoModelForSequenceClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D", num_labels= 2)

In [61]:
##getting our training and evaluation set

train_dataset= dataset['train_set'].shuffle(seed=10)
eval_dataset= dataset['eval_set'].shuffle(seed=10)

In [62]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co./front/assets/huggingface_logo-noborder.sv…

In [63]:
trainer= Trainer(
    model= model,
      args= training_args,
      train_dataset= train_dataset,
      eval_dataset= eval_dataset,
      tokenizer= tokenizer,
      compute_metrics=compute_metrics
)

In [64]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4628,0.361662,0.852
2,0.3369,0.321795,0.8644
3,0.2949,0.314254,0.8744
4,0.2699,0.31921,0.8718
5,0.2481,0.31721,0.8772


TrainOutput(global_step=1565, training_loss=0.32232705960258506, metrics={'train_runtime': 1236.7702, 'train_samples_per_second': 80.856, 'train_steps_per_second': 1.265, 'total_flos': 1433899622400000.0, 'train_loss': 0.32232705960258506, 'epoch': 5.0})

In [65]:
trainer.push_to_hub()

'https://huggingface.co./gArthur98/Capstone_TinyBert/tree/main/'