from glob import glob
from tqdm import tqdm
import numpy as np
import pickle
#from sklearn.model_selection import train_test_split
import torch
import os
import ast
#from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
#from sklearn.utils import shuffle
from transformers import get_cosine_schedule_with_warmup
from torch.nn import functional as F
import random
import pandas as pd
from .datas import make_dataset, make_extract_dataset
from .utils import set_seed, accuracy_per_class, compute_metrics, model_eval, checkpoint_save, EarlyStopping, model_freeze, get_hidden
from .model import classification_model
from transformers import BigBirdTokenizer
import transformers
class NLP_classification():
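'''Paragraph section classifier built on a HuggingFace encoder.
Bundles tokenizer/config/model setup plus training, prediction, embedding
extraction, and label extraction for six section classes
(Abstract, Introduction, Main, Methods, Summary, Captions).'''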
def __init__(self, model_name=None, data_file=None, max_length=None, random_state=1000, task_type='onehot', freeze_layers=None, num_classifier=1, num_pos_emb_layer=1, gpu_num=0, sentence_piece=True, bertsum=False):
self.model_name = model_name
self.data_file = data_file
self.max_length = max_length
self.random_state = random_state
self.task_type = task_type
if model_name == 'google/bigbird-roberta-base':
self.tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
else:
self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
self.config = AutoConfig.from_pretrained(model_name, num_labels=6)
#self.pretrained_model = AutoModelForSequenceClassification.from_config(self.config)
self.pretrained_model = AutoModel.from_config(self.config)
self.freeze_layers=freeze_layers
self.num_classifier=num_classifier
self.num_pos_emb_layer=num_pos_emb_layer
self.gpu_num=gpu_num
self.sentence_piece=sentence_piece
self.bertsum=bertsum
if self.max_length is None:
self.padding='longest'
else:
self.padding='max_length'
def training(self, epochs=50, batch_size=4, lr=1e-5, dropout=0.1, data_cut=None, early_stop_count=10,
wandb_log=False, wandb_project=None, wandb_group=None, wandb_name=None, wandb_memo=None):
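'''Fine-tune the classifier. Saves a checkpoint whenever the validation F1
improves, optionally logs every epoch to Weights & Biases, and stops early
after early_stop_count epochs without improvement.'''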
#os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format(int(self.gpu_num))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cuda:{0}'.format(int(self.gpu_num)))
#torch.cuda.set_device(device)
set_seed(self.random_state)
torch.set_num_threads(10)
if wandb_log is True:
import wandb
wandb.init(project=wandb_project, reinit=True, group=wandb_group, notes=wandb_memo)
wandb.run.name = wandb_name
wandb.run.save()
parameters = wandb.config
parameters.lr = lr
parameters.batch_size = batch_size
parameters.dropout = dropout
parameters.train_num = data_cut
parameters.max_length = self.max_length
parameters.model_name = self.model_name
parameters.task_type = self.task_type
'''data loading'''
train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length, padding=self.padding, random_state=self.random_state, data_cut=data_cut, sentence_piece=self.sentence_piece)
'''loader making'''
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
''' model load '''
model=classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier, num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
model=model_freeze(model, self.freeze_layers)
model.to(device)
''' running setting '''
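# Note: classification_model computes its own loss (outputs[0]); the BCEWithLogitsLoss
# defined below is never called. The scheduler applies 100 linear warmup steps followed
# by cosine decay over the full number of training steps.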
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, eps=1e-8)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=(len(train_loader)*epochs))
early_stopping = EarlyStopping(patience = early_stop_count, verbose = True)
''' running '''
best_epoch = None
best_val_f1 = None
for epoch in range(epochs):
model.train()
loss_all = 0
step = 0
for data in tqdm(train_loader):
input_ids=data['input_ids'].to(device, dtype=torch.long)
mask = data['attention_mask'].to(device, dtype=torch.long)
token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
if self.task_type=='onehot':
targets=data['label_onehot'].to(device, dtype=torch.float)
elif self.task_type=='scalar':
targets=data['label'].to(device, dtype=torch.long)
else:
raise ValueError("task_type must be 'onehot' or 'scalar', got {0}".format(self.task_type))
position = data['position']
inputs = {'input_ids': input_ids, 'attention_mask': mask, 'token_type_ids': token_type_ids,
'labels': targets, 'position': position}
if self.sentence_piece:
inputs['sentence_batch'] = data['sentence_batch'].to(device, dtype=torch.long)
outputs = model(inputs)
output = outputs[1]
loss = outputs[0]
optimizer.zero_grad()
#loss=loss_fn(output, targets)
loss_all += loss.item()
loss.backward()
optimizer.step()
scheduler.step()
#print(optimizer.param_groups[0]['lr'])
train_loss = loss_all/len(train_loader)
val_loss, val_acc, val_precision, val_recall, val_f1 = model_eval(model, device, val_loader, task_type=self.task_type, sentence_piece=self.sentence_piece)
if wandb_log is True:
wandb.log({'train_loss':train_loss, 'val_loss':val_loss, 'val_acc':val_acc,
'val_precision':val_precision, 'val_recall':val_recall, 'val_f1':val_f1})
if best_val_f1 is None or val_f1 >= best_val_f1:
best_epoch = epoch+1
best_val_f1 = val_f1
checkpoint_save(model, val_f1, wandb_name=wandb_name)
print('Epoch: {:03d}, Train Loss: {:.7f}, Val Loss: {:.7f}, Val Acc: {:.7f}, Val Precision: {:.7f}, Val Recall: {:.7f}, Val F1: {:.7f} '.format(epoch+1, train_loss, val_loss, val_acc, val_precision, val_recall, val_f1))
early_stopping(val_f1)
if early_stopping.early_stop:
print("Early stopping")
break
if wandb_log is True:
wandb.finish()
def prediction(self, selected_model=None, batch_size=8):
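'''Run a saved checkpoint over the train and validation splits and return a
DataFrame with columns 'text', 'label', and 'predict'.'''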
#os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format(int(self.gpu_num))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
set_seed(self.random_state)
torch.set_num_threads(10)
task_type=self.task_type
'''data loading'''
train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length, padding=self.padding, random_state=self.random_state, data_cut=None, sentence_piece=self.sentence_piece)
'''loader making'''
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
''' model load '''
model=classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier, num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
model.load_state_dict(torch.load(selected_model, map_location=device))
model.to(device)
''' prediction '''
print('start trainset prediction')
train_results = model_eval(model, device, train_loader, task_type=self.task_type, return_values=True, sentence_piece=self.sentence_piece)
print('start evalset prediction')
eval_results = model_eval(model, device, val_loader, task_type=self.task_type, return_values=True, sentence_piece=self.sentence_piece)
print('train result: acc:{0} | precision:{1} | recall:{2} | f1:{3}'.format(train_results[1], train_results[2], train_results[3], train_results[4]))
print('eval result: acc:{0} | precision:{1} | recall:{2} | f1:{3}'.format(eval_results[1], eval_results[2], eval_results[3], eval_results[4]))
total_text = train_results[7] + eval_results[7]
total_out = train_results[6] + eval_results[6]
total_target = train_results[5] + eval_results[5]
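# for one-hot targets, reduce outputs and labels to class indices before building the dataframe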
if self.task_type == 'onehot':
total_out = [i.argmax() for i in total_out]
total_target = [i.argmax() for i in total_target]
total_data = {'text':total_text, 'label':total_target, 'predict':total_out}
total_df = pd.DataFrame(total_data)
''' result return '''
return total_df
def get_embedding(self, selected_model=None, batch_size=8, return_hidden=True, return_hidden_pretrained=False):
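'''Collect hidden representations and matching targets for the train and
validation splits combined; the return_hidden / return_hidden_pretrained
flags are forwarded to classification_model.'''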
#os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format(int(self.gpu_num))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cuda:{0}'.format(int(self.gpu_num)))
#torch.cuda.set_device(device)
set_seed(self.random_state)
torch.set_num_threads(10)
task_type=self.task_type
'''data loading'''
train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length, padding=self.padding, random_state=self.random_state, data_cut=None, sentence_piece=self.sentence_piece)
'''loader making'''
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
''' model load '''
model=classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier, num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
model.return_hidden = return_hidden
model.return_hidden_pretrained = return_hidden_pretrained
if selected_model is not None:
model.load_state_dict(torch.load(selected_model, map_location=device))
model.to(device)
''' get hidden '''
print('start make hidden states (trainset)')
train_hiddens, train_targets = get_hidden(model, device, train_loader, task_type=self.task_type, sentence_piece=self.sentence_piece)
print('start make hidden states (eval set)')
eval_hiddens, eval_targets = get_hidden(model, device, val_loader, task_type=self.task_type, sentence_piece=self.sentence_piece)
total_hiddens = np.array(train_hiddens + eval_hiddens)
total_targets = np.array(train_targets + eval_targets)
return total_hiddens, total_targets
def label_extraction(self, paragraphs, positions, selected_model=None, batch_size=16):
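'''Predict a section label (Abstract, Introduction, Main, Methods, Summary,
Captions) for a single paragraph or a list of paragraphs with matching
position value(s); returns a string for a single input, otherwise a list.'''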
label_dict = {'Abstract':0, 'Introduction':1, 'Main':2, 'Methods':3, 'Summary':4, 'Captions':5}
#os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format(int(self.gpu_num))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
set_seed(self.random_state)
torch.set_num_threads(10)
''' data to list '''
is_list = True
if not isinstance(paragraphs, list):
paragraphs = [paragraphs]
is_list = False
if not isinstance(positions, list):
positions = [positions]
is_list = False
'''data encoding'''
dataset = make_extract_dataset(paragraphs, positions, tokenizer=self.tokenizer, max_length=self.max_length)
'''loader making'''
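# shuffle=False keeps predictions aligned with the order of the input paragraphs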
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
''' model load '''
model=classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier, num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
model.load_state_dict(torch.load(selected_model, map_location=device))
model.to(device)
''' prediction '''
model.eval()
predicts = []
with torch.no_grad():
for batch in tqdm(data_loader):
inputs = {}
inputs['input_ids'] = batch['input_ids'].to(device)
inputs['attention_mask'] = batch['attention_mask'].to(device)
inputs['token_type_ids'] = batch['token_type_ids'].to(device)
inputs['position'] = batch['position']
outputs = model(inputs)
logits = outputs[1]
logits = logits.detach().cpu().numpy()
logits = logits.argmax(axis=1).flatten()
logits = logits.tolist()
predicts.extend(logits)
id_to_label = {v: k for k, v in label_dict.items()}
predicts = [id_to_label[i] for i in predicts]
if not is_list:
predicts = predicts[0]
return predicts
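# Usage sketch (illustrative only): a minimal example of driving the class, assuming it
# is imported from the package root; the model name, CSV path, checkpoint path, and
# position value below are placeholders, not files shipped with this repository.
#
#   from src.run import NLP_classification
#   clf = NLP_classification(model_name='google/bigbird-roberta-base',
#                            data_file='paragraphs.csv', max_length=1024)
#   clf.training(epochs=50, batch_size=4, lr=1e-5, wandb_log=False)
#   df = clf.prediction(selected_model='checkpoint.pt')
#   labels = clf.label_extraction(['Example paragraph text.'], [0.0],
#                                 selected_model='checkpoint.pt')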