thesis-deployment / model.py
Avril Lalaine
Add flask app with Dockerfile
0ad9aa8
import pandas as pd
import torch
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer,TrainingArguments
# Training data: the augmented set is the active one. To train without
# augmentation, load "train_set.csv" here instead.
# Paths are written relative to the working directory without a Windows-only
# r".\name.csv" prefix, so the same script resolves the files on Linux too.
df = pd.read_csv("cleaned_combined_aug_set.csv")
value_counts = df['label'].value_counts()
print(value_counts)

test_df = pd.read_csv("test_set.csv")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use the GPU when one is present, otherwise stay on the CPU instead of
# raising on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Independent variable: the article text.
X = list(df['article'])
X_test = list(test_df['article'])

# Dependent variable: the class label.
y = list(df['label'])
y_test = list(test_df['label'])

# Every article is truncated or padded to exactly this many tokens.
max_length = 512
train_encodings = tokenizer(
    X,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt',
)
test_encodings = tokenizer(
    X_test,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt',
)
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values are indexable per sample, and
    ``labels`` is an indexable, sized collection aligned with it.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick row ``idx`` out of every encoding field, then attach the label
        # as a tensor under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# Wrap the tokenized splits so they can be indexed one sample at a time.
torch_train_dataset = CustomDataset(train_encodings, y)
torch_test_dataset = CustomDataset(test_encodings, y_test)

# Fine-tuning configuration: checkpoints go under ./results, evaluation is
# scheduled per epoch, and training runs for three epochs.
training_args = TrainingArguments(
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    output_dir='./results/fake-news-bert-aug',
)
def compute_metrics(p):
    """Compute classification metrics for one evaluation pass.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``. Predictions
            are per-class scores; the predicted class is the argmax over
            axis 1. If predictions arrive as a tuple of model outputs, the
            first element is assumed to hold the class scores.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` computed by
        scikit-learn with its default (binary) averaging.
    """
    pred, labels = p
    # Models configured to return extra outputs hand back a tuple; the class
    # scores are expected to come first.
    if isinstance(pred, tuple):
        pred = pred[0]
    pred = np.argmax(pred, axis=1)
    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "precision": precision_score(y_true=labels, y_pred=pred),
        "recall": recall_score(y_true=labels, y_pred=pred),
        "f1": f1_score(y_true=labels, y_pred=pred),
    }
# Assemble the Trainer from the model, its configuration, both dataset
# splits and the metric callback, then start fine-tuning immediately.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=torch_test_dataset,
    train_dataset=torch_train_dataset,
)

# NOTE: this runs at module import time, so importing this file trains.
trainer.train()
def predict(text):
    """Run the fine-tuned model on raw article text.

    ``trainer.predict`` consumes a dataset of tokenized samples, not raw
    strings, so the text is tokenized here with the same settings used for
    the training and test encodings before it is handed to the trainer.

    Args:
        text: A single article (``str``), an iterable of articles, or an
            already-built ``torch.utils.data.Dataset``, which is forwarded
            unchanged for backward compatibility.

    Returns:
        Whatever ``trainer.predict`` returns for the prepared samples
        (presumably a prediction output whose ``predictions`` field holds the
        per-class scores; confirm against the installed transformers
        version).

    Raises:
        ValueError: If no text is supplied.
    """
    if isinstance(text, Dataset):
        return trainer.predict(text)

    articles = [text] if isinstance(text, str) else list(text)
    if not articles:
        raise ValueError("predict() requires at least one article")

    encodings = tokenizer(
        articles,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    # One feature dict per article, with no 'labels' key because there is no
    # ground truth at inference time.
    samples = [
        {key: values[i] for key, values in encodings.items()}
        for i in range(len(articles))
    ]
    return trainer.predict(samples)