# -*- coding: utf-8 -*-
"""RoBERTa sentiment analysis

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/10L1VfVMZLa62qTFdUIOURELW194TjJ4e
"""

# Install required libraries
!pip install datasets transformers huggingface_hub -q

# Import key libraries and packages
import numpy as np
import os
import pandas as pd
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from google.colab import files
from google.colab import drive

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

# Mount Google Drive
drive.mount('/content/drive')

# Load the datasets
train_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/hugging.csv").dropna(axis=0)
test_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/Testhugging.csv").fillna("")

# Inspect the data
train_df.head()
test_df.head()

# Check for missing values
train_df.isnull().sum()
test_df.isnull().sum()

"""Fine-tuning the RoBERTa model"""

# Split the training data into train and evaluation subsets
train_df, eval_df = train_test_split(
    train_df, test_size=0.2, random_state=42, stratify=train_df['label']
)
print(f"new dataframe shapes: train is {train_df.shape}, eval is {eval_df.shape}")

# Save the split subsets
train_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv", index=False)
eval_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv", index=False)

# Reload the subsets as a Hugging Face dataset
dataset = load_dataset(
    'csv',
    data_files={
        'train': '/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv',
        'eval': '/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv',
    },
    encoding="ISO-8859-1",
)

# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Define helper functions

## Function to map the original labels (-1, 0, 1) to class ids (0, 1, 2)
def transform_labels(label):
    label = label['label']
    num = 0
    if label == -1:    # Negative
        num = 0
    elif label == 0:   # Neutral
        num = 1
    elif label == 1:   # Positive
        num = 2
    return {'labels': num}

## Function to tokenize the tweets
def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length', truncation=True, max_length=256)

# Tokenize the tweets
dataset = dataset.map(tokenize_data, batched=True)

# Transform the labels and drop the columns the model does not need
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

# Define the training arguments
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_model",
    num_train_epochs=4,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Load the pretrained model with a 3-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment", num_labels=3
)

# Define the evaluation metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Instantiate the training and evaluation sets
train_dataset = dataset["train"].shuffle(seed=24)
eval_dataset = dataset["eval"].shuffle(seed=24)

# Data collator: converts examples to padded PyTorch tensors at batch time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
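# Optional sanity check (not part of the original notebook): inspect one processed
# training example to confirm that tokenization and label mapping produced the
# expected fields before training starts.
sample = dataset["train"][0]
print(sample.keys())                                # expect 'input_ids', 'attention_mask', 'labels'
print(len(sample["input_ids"]), sample["labels"])   # 256 tokens (max_length padding), class id in {0, 1, 2}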
# Instantiate the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Reinstantiate the trainer for evaluation, this time with the tokenizer and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the final evaluation
trainer.evaluate()

# Log in to the Hugging Face Hub
notebook_login()

# Push the model and tokenizer to the Hub
model.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
tokenizer.push_to_hub("MavisAJ/Sentiment_analysis_roberta_model")
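# Illustrative follow-up (a sketch, assuming the push above succeeded and the
# "MavisAJ/Sentiment_analysis_roberta_model" repo is accessible): load the
# fine-tuned model back from the Hub and run a quick prediction with the pipeline API.
from transformers import pipeline

sentiment = pipeline(
    "sentiment-analysis",
    model="MavisAJ/Sentiment_analysis_roberta_model",
    tokenizer="MavisAJ/Sentiment_analysis_roberta_model",
)

# Without an id2label mapping in the config, outputs use the default names:
# LABEL_0 = negative, LABEL_1 = neutral, LABEL_2 = positive (per transform_labels above)
print(sentiment("Vaccines are saving lives."))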