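"""Paragraph-level section classification built on Hugging Face Transformers.

`NLP_classification` wraps a pretrained encoder (including BigBird, which gets its own
tokenizer) and exposes four entry points:

* `training`         - fine-tune on `data_file` with cosine LR warmup, optional wandb logging,
                       checkpointing on the best validation F1 and early stopping,
* `prediction`       - re-score the train/validation splits with a saved checkpoint,
* `get_embedding`    - export hidden states for every example,
* `label_extraction` - assign one of Abstract / Introduction / Main / Methods / Summary /
                       Captions to raw paragraphs.
"""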
from glob import glob
from tqdm import tqdm
import numpy as np
import pickle
#from sklearn.model_selection import train_test_split
import torch
import os
import ast
#from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
#from sklearn.utils import shuffle
from transformers import get_cosine_schedule_with_warmup
from torch.nn import functional as F
import random
import pandas as pd
from .datas import make_dataset, make_extract_dataset
from .utils import set_seed, accuracy_per_class, compute_metrics, model_eval, checkpoint_save, EarlyStopping, model_freeze, get_hidden
from .model import classification_model
from transformers import BigBirdTokenizer
import transformers


class NLP_classification():
    def __init__(self, model_name=None, data_file=None, max_length=None, random_state=1000, task_type='onehot',
                 freeze_layers=None, num_classifier=1, num_pos_emb_layer=1, gpu_num=0, sentence_piece=True, bertsum=False):
        self.model_name = model_name
        self.data_file = data_file
        self.max_length = max_length
        self.random_state = random_state
        self.task_type = task_type
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
        if model_name == 'google/bigbird-roberta-base':
            self.tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
        self.config = AutoConfig.from_pretrained(model_name, num_labels=6)
        #self.pretrained_model = AutoModelForSequenceClassification.from_config(self.config)
        self.pretrained_model = AutoModel.from_config(self.config)
        self.freeze_layers = freeze_layers
        self.num_classifier = num_classifier
        self.num_pos_emb_layer = num_pos_emb_layer
        self.gpu_num = gpu_num
        self.sentence_piece = sentence_piece
        self.bertsum = bertsum
        # Pad to the batch's longest sequence when no max_length is given, otherwise pad to max_length.
        if self.max_length is None:
            self.padding = 'longest'
        else:
            self.padding = 'max_length'
    def training(self, epochs=50, batch_size=4, lr=1e-5, dropout=0.1, data_cut=None, early_stop_count=10,
                 wandb_log=False, wandb_project=None, wandb_group=None, wandb_name=None, wandb_memo=None):
        #os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        #device = torch.device('cuda:{0}'.format(int(self.gpu_num)))
        #torch.cuda.set_device(device)
        set_seed(self.random_state)
        torch.set_num_threads(10)
        if wandb_log is True:
            import wandb
            wandb.init(project=wandb_project, reinit=True, group=wandb_group, notes=wandb_memo)
            wandb.run.name = wandb_name
            wandb.run.save()
            parameters = wandb.config
            parameters.lr = lr
            parameters.batch_size = batch_size
            parameters.dropout = dropout
            parameters.train_num = data_cut
            parameters.max_length = self.max_length
            parameters.model_name = self.model_name
            parameters.task_type = self.task_type

        '''data loading'''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length,
                                                  padding=self.padding, random_state=self.random_state, data_cut=data_cut,
                                                  sentence_piece=self.sentence_piece)

        '''loader making'''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))

        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model = model_freeze(model, self.freeze_layers)
        model.to(device)

        ''' running setting '''
        # The model computes its own loss (outputs[0]); this criterion is kept only for reference.
        loss_fn = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, eps=1e-8)
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=(len(train_loader) * epochs))
        early_stopping = EarlyStopping(patience=early_stop_count, verbose=True)

        ''' running '''
        best_epoch = None
        best_val_f1 = None
        for epoch in range(epochs):
            model.train()
            loss_all = 0
            step = 0
            for data in tqdm(train_loader):
                input_ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                if self.task_type == 'onehot':
                    targets = data['label_onehot'].to(device, dtype=torch.float)
                elif self.task_type == 'scalar':
                    targets = data['label'].to(device, dtype=torch.long)
                position = data['position']
                inputs = {'input_ids': input_ids, 'attention_mask': mask, 'token_type_ids': token_type_ids,
                          'labels': targets, 'position': position}
                if self.sentence_piece:
                    sentence_batch = data['sentence_batch'].to(device, dtype=torch.long)
                    inputs['sentence_batch'] = sentence_batch
                outputs = model(inputs)
                output = outputs[1]
                loss = outputs[0]
                optimizer.zero_grad()
                #loss=loss_fn(output, targets)
                loss_all += loss.item()
                loss.backward()
                optimizer.step()
                scheduler.step()
                #print(optimizer.param_groups[0]['lr'])
            train_loss = loss_all / len(train_loader)
            val_loss, val_acc, val_precision, val_recall, val_f1 = model_eval(model, device, val_loader,
                                                                              task_type=self.task_type,
                                                                              sentence_piece=self.sentence_piece)
            if wandb_log is True:
                wandb.log({'train_loss': train_loss, 'val_loss': val_loss, 'val_acc': val_acc,
                           'val_precision': val_precision, 'val_recall': val_recall, 'val_f1': val_f1})
            # Keep the checkpoint with the best validation F1 seen so far.
            if best_val_f1 is None or val_f1 >= best_val_f1:
                best_epoch = epoch + 1
                best_val_f1 = val_f1
                checkpoint_save(model, val_f1, wandb_name=wandb_name)
            print('Epoch: {:03d}, Train Loss: {:.7f}, Val Loss: {:.7f}, Val Acc: {:.7f}, Val Precision: {:.7f}, Val Recall: {:.7f}, Val F1: {:.7f} '.format(
                epoch + 1, train_loss, val_loss, val_acc, val_precision, val_recall, val_f1))
            early_stopping(val_f1)
            if early_stopping.early_stop:
                print("Early stopping")
                break
        # Only close the wandb run if one was opened above.
        if wandb_log is True:
            wandb.finish()
    def prediction(self, selected_model=None, batch_size=8):
        #os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        set_seed(self.random_state)
        torch.set_num_threads(10)
        task_type = self.task_type

        '''data loading'''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length,
                                                  padding=self.padding, random_state=self.random_state, data_cut=None,
                                                  sentence_piece=self.sentence_piece)

        '''loader making'''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))

        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.load_state_dict(torch.load(selected_model))
        model.to(device)

        ''' prediction '''
        print('start trainset prediction')
        train_results = model_eval(model, device, train_loader, task_type=self.task_type, return_values=True,
                                   sentence_piece=self.sentence_piece)
        print('start evalset prediction')
        eval_results = model_eval(model, device, val_loader, task_type=self.task_type, return_values=True,
                                  sentence_piece=self.sentence_piece)
        print('train result: acc:{0} | precision:{1} | recall:{2} | f1:{3}'.format(train_results[1], train_results[2], train_results[3], train_results[4]))
        print('eval result: acc:{0} | precision:{1} | recall:{2} | f1:{3}'.format(eval_results[1], eval_results[2], eval_results[3], eval_results[4]))
        total_text = train_results[7] + eval_results[7]
        total_out = train_results[6] + eval_results[6]
        total_target = train_results[5] + eval_results[5]
        if self.task_type == 'onehot':
            total_out = [i.argmax() for i in total_out]
            total_target = [i.argmax() for i in total_target]
        total_data = {'text': total_text, 'label': total_target, 'predict': total_out}
        total_df = pd.DataFrame(total_data)

        ''' result return '''
        return total_df
    def get_embedding(self, selected_model=None, batch_size=8, return_hidden=True, return_hidden_pretrained=False):
        #os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        #device = torch.device('cuda:{0}'.format(int(self.gpu_num)))
        #torch.cuda.set_device(device)
        set_seed(self.random_state)
        torch.set_num_threads(10)
        task_type = self.task_type

        '''data loading'''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length,
                                                  padding=self.padding, random_state=self.random_state, data_cut=None,
                                                  sentence_piece=self.sentence_piece)

        '''loader making'''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))

        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.return_hidden = return_hidden
        model.return_hidden_pretrained = return_hidden_pretrained
        if selected_model is not None:
            model.load_state_dict(torch.load(selected_model))
        model.to(device)

        ''' get hidden '''
        print('start make hidden states (train set)')
        train_hiddens, train_targets = get_hidden(model, device, train_loader, task_type=self.task_type,
                                                  sentence_piece=self.sentence_piece)
        print('start make hidden states (eval set)')
        eval_hiddens, eval_targets = get_hidden(model, device, val_loader, task_type=self.task_type,
                                                sentence_piece=self.sentence_piece)
        total_hiddens = np.array(train_hiddens + eval_hiddens)
        total_targets = np.array(train_targets + eval_targets)
        return total_hiddens, total_targets
    def label_extraction(self, paragraphs, positions, selected_model=None, batch_size=16):
        label_dict = {'Abstract': 0, 'Introduction': 1, 'Main': 2, 'Methods': 3, 'Summary': 4, 'Captions': 5}
        #os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        set_seed(self.random_state)
        torch.set_num_threads(10)

        ''' data to list '''
        # Accept a single paragraph/position as well as lists; remember which form was given
        # so a single input returns a single label rather than a one-element list.
        is_list = True
        if not isinstance(paragraphs, list):
            paragraphs = [paragraphs]
            is_list = False
        if not isinstance(positions, list):
            positions = [positions]
            is_list = False

        '''data encoding'''
        dataset = make_extract_dataset(paragraphs, positions, tokenizer=self.tokenizer, max_length=self.max_length)

        '''loader making'''
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.load_state_dict(torch.load(selected_model))
        model.to(device)

        ''' prediction '''
        model.eval()
        predicts = []
        with torch.no_grad():
            for batch in tqdm(data_loader):
                inputs = {}
                inputs['input_ids'] = batch['input_ids'].to(device)
                inputs['attention_mask'] = batch['attention_mask'].to(device)
                inputs['token_type_ids'] = batch['token_type_ids'].to(device)
                inputs['position'] = batch['position']
                outputs = model(inputs)
                logits = outputs[1]
                logits = logits.detach().cpu().numpy()
                logits = logits.argmax(axis=1).flatten()
                predicts.extend(logits.tolist())
        # Map predicted class ids back to their section names.
        id_to_label = {v: k for k, v in label_dict.items()}
        predicts = [id_to_label[i] for i in predicts]
        if not is_list:
            predicts = predicts[0]
        return predicts
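

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): one way the class above might be driven end
# to end. The model name, CSV path and checkpoint path are placeholders, not
# files provided by this module; the expected CSV layout is whatever
# `make_dataset` in .datas parses from `data_file`.
if __name__ == '__main__':
    clf = NLP_classification(
        model_name='google/bigbird-roberta-base',  # any AutoModel-compatible checkpoint
        data_file='paragraphs.csv',                # placeholder path
        max_length=512,
        task_type='onehot',
    )

    # Fine-tune; the checkpoint with the best validation F1 is saved via checkpoint_save.
    clf.training(epochs=50, batch_size=4, lr=1e-5, early_stop_count=10)

    # Re-score the train/validation splits with a saved checkpoint (placeholder path)
    # and inspect the predictions as a DataFrame.
    results_df = clf.prediction(selected_model='checkpoint.pt', batch_size=8)
    print(results_df.head())

    # Label a single raw paragraph; `positions` is the paragraph-position feature
    # expected by make_extract_dataset (assumed here to be a simple index).
    section = clf.label_extraction('Example paragraph text ...', positions=0,
                                   selected_model='checkpoint.pt')
    print(section)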