# John Makely
# Finetune Language Modeling Based on BERTweet
# ./jigsaw-toxic-comment-classification-challenge/train.csv
# "id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate" [6 binary labels]
# 1. Extract text from csv
# 2. Tokenize text (BERTweet, RoBERTa, DistilBERT)
# 3. Build one multi-label model per architecture, with a single head covering all six labels
# 4. Train each model
# 5. Save each model
import pandas as pd
import os
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, RobertaTokenizer
import torch
from torch.utils.data import Dataset
torch.cuda.empty_cache()
# Dataset that pairs tokenizer encodings with float multi-label vectors
class MultiLabelClassifierDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        # Labels must be floats so the model uses a multi-label (BCE) loss
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)
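# For illustration, each item the Trainer receives looks roughly like:
# {'input_ids': tensor([...]), 'attention_mask': tensor([...]),
#  'labels': tensor([1., 0., 0., 0., 1., 0.])}  # one float per label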
# Set up directories
work_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
dataset_dir = work_dir + 'jigsaw-toxic-comment-classification-challenge/'
# Set up labels
classifiers = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']
# Load train.csv and shuffle
print("Loading data...")
df = pd.read_csv(dataset_dir + 'train.csv')
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle
# Keep only the first 10% of the shuffled rows for training
train_df = df[:int(len(df)*0.1)]
# Extract the six label columns into a numpy array
train_labels = train_df[classifiers].to_numpy()
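# The header mentions a train/val/test split, but only the train slice is
# consumed below. A minimal sketch of a held-out validation slice, with
# illustrative boundaries (not wired into the Trainer calls that follow):
val_df = df[int(len(df)*0.1):int(len(df)*0.2)]
val_labels = val_df[classifiers].to_numpy()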
# Setting device (fall back to CPU if no GPU is available; note the Trainer
# handles device placement itself, this is informational)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device: ", device)
# # # # # # # # # # # # #
# # # # BERTweet # # # # #
# # # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("BERT") | |
bert_dir = work_dir + 'bert/' | |
print("Tokenizing") | |
print("Model base: ", "vinai/bertweet-base") | |
tokenizer = AutoTokenizer.from_pretrained( | |
"vinai/bertweet-base", model_max_length=128) | |
print("Creating train encodings...") | |
train_encodings = tokenizer( | |
train_df['comment_text'].tolist(), truncation=True, padding=True) | |
print("Training model to be stored in " + bert_dir)
# Create dataset
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
# Load model
print("Loading model for training...")
# problem_type selects BCEWithLogitsLoss, the correct loss for multi-label
model = AutoModelForSequenceClassification.from_pretrained(
    'vinai/bertweet-base', num_labels=6,
    problem_type="multi_label_classification")
# Create Trainer
print("Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
# Train
print("Training...")
trainer.train()
# Save model
print("Saving model to " + bert_dir + '_bert_model')
trainer.save_model(bert_dir + '_bert_model')
# # # # # # # # # # # #
# # # # RoBERTa # # # #
# # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("RoBERTa")
roberta_dir = work_dir + 'roberta/'
print("Tokenizing")
print("Model base: ", 'roberta-base')
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base', model_max_length=128)
print("Creating train encodings...")
train_encodings = tokenizer(
    train_df['comment_text'].tolist(), truncation=True, padding=True)
# Create dataset
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
# Load model
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base', num_labels=6,
    problem_type="multi_label_classification")
# Create Trainer
print("Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
# Train
print("Training...")
trainer.train()
# Save model
print("Saving model to " + roberta_dir + '_roberta_model')
trainer.save_model(roberta_dir + '_roberta_model')
# # # # # # # # # # # # #
# # # DistilBERT # # # # #
# # # # # # # # # # # # #
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("DISTILBERT") | |
distilbert_dir = work_dir + 'distilbert/' | |
print("Tokenizing") | |
print("Model base: ", 'distilbert-base-cased') | |
tokenizer = AutoTokenizer.from_pretrained( | |
'distilbert-base-cased', model_max_length=128) | |
print("Creating train encodings...") | |
train_encodings = tokenizer( | |
train_df['comment_text'].tolist(), truncation=True, padding=True) | |
print("Training model to be stored in" + distilbert_dir) | |
# Create dataset | |
print("Creating dataset") | |
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels) | |
# Load model | |
print("Loading model for training...") | |
model = AutoModelForSequenceClassification.from_pretrained( | |
'distilbert-base-cased', num_labels=6) | |
# Create Trainer | |
print("Creating trainer...") | |
trainer = Trainer( | |
model=model, | |
args=training_args, | |
train_dataset=train_dataset | |
) | |
# Train | |
print("Training...") | |
trainer.train() | |
# Save model | |
print("Saving model to " + distilbert_dir + '_distilbert_model') | |
trainer.save_model(distilbert_dir + '_distilbert_model') | |
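# A minimal inference sketch (not part of the original training run): reload
# one of the saved checkpoints and turn its logits into per-label decisions.
# For a multi-label head each logit passes through a sigmoid independently;
# the 0.5 threshold here is an illustrative choice, not a tuned value.
def predict_toxicity(model_dir, texts, tokenizer, threshold=0.5):
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    enc = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    with torch.no_grad():
        logits = model(**enc).logits
    probs = torch.sigmoid(logits)  # independent probability per label
    return [{c: bool(p > threshold) for c, p in zip(classifiers, row)}
            for row in probs]

# Example usage (reuses the DistilBERT tokenizer still in scope):
# predict_toxicity(distilbert_dir + '_distilbert_model',
#                  ["you are wonderful"], tokenizer)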