"""Fine-tuning and inference wrapper (NLP_classification) for transformer-based section classification."""

from glob import glob
from tqdm import tqdm
import numpy as np
import pickle
#from sklearn.model_selection import train_test_split
import torch
import os
import ast
#from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
#from sklearn.utils import shuffle
from transformers import get_cosine_schedule_with_warmup
from torch.nn import functional as F
import random
import pandas as pd

from .datas import make_dataset, make_extract_dataset
from .utils import (set_seed, accuracy_per_class, compute_metrics, model_eval,
                    checkpoint_save, EarlyStopping, model_freeze, get_hidden)
from .model import classification_model
from transformers import BigBirdTokenizer
import transformers


class NLP_classification():
    def __init__(self, model_name=None, data_file=None, max_length=None, random_state=1000,
                 task_type='onehot', freeze_layers=None, num_classifier=1, num_pos_emb_layer=1,
                 gpu_num=0, sentence_piece=True, bertsum=False):
        self.model_name = model_name
        self.data_file = data_file
        self.max_length = max_length
        self.random_state = random_state
        self.task_type = task_type
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
        if model_name == 'google/bigbird-roberta-base':
            # BigBird ships its own sentencepiece tokenizer
            self.tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
        self.config = AutoConfig.from_pretrained(model_name, num_labels=6)
        #self.pretrained_model = AutoModelForSequenceClassification.from_config(self.config)
        self.pretrained_model = AutoModel.from_config(self.config)
        self.freeze_layers = freeze_layers
        self.num_classifier = num_classifier
        self.num_pos_emb_layer = num_pos_emb_layer
        self.gpu_num = gpu_num
        self.sentence_piece = sentence_piece
        self.bertsum = bertsum

        if self.max_length is None:
            self.padding = 'longest'
        else:
            self.padding = 'max_length'

    def training(self, epochs=50, batch_size=4, lr=1e-5, dropout=0.1, data_cut=None,
                 early_stop_count=10, wandb_log=False, wandb_project=None, wandb_group=None,
                 wandb_name=None, wandb_memo=None):
        #os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        #device = torch.device('cuda:{0}'.format(int(self.gpu_num)))
        #torch.cuda.set_device(device)
        set_seed(self.random_state)
        torch.set_num_threads(10)

        if wandb_log is True:
            import wandb
            wandb.init(project=wandb_project, reinit=True, group=wandb_group, notes=wandb_memo)
            wandb.run.name = wandb_name
            wandb.run.save()
            parameters = wandb.config
            parameters.lr = lr
            parameters.batch_size = batch_size
            parameters.dropout = dropout
            parameters.train_num = data_cut
            parameters.max_length = self.max_length
            parameters.model_name = self.model_name
            parameters.task_type = self.task_type

        '''data loading'''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer,
                                                  max_length=self.max_length, padding=self.padding,
                                                  random_state=self.random_state, data_cut=data_cut,
                                                  sentence_piece=self.sentence_piece)

        '''loader making'''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))

        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model = model_freeze(model, self.freeze_layers)
        model.to(device)

        ''' running setting '''
        loss_fn = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, eps=1e-8)
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100,
                                                    num_training_steps=(len(train_loader) * epochs))
        early_stopping = EarlyStopping(patience=early_stop_count, verbose=True)

        ''' running '''
        best_epoch = None
        best_val_f1 = None
        for epoch in range(epochs):
            model.train()
            loss_all = 0
            step = 0
            for data in tqdm(train_loader):
                input_ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                if self.task_type == 'onehot':
                    targets = data['label_onehot'].to(device, dtype=torch.float)
                elif self.task_type == 'scalar':
                    targets = data['label'].to(device, dtype=torch.long)
                position = data['position']
                inputs = {'input_ids': input_ids, 'attention_mask': mask, 'token_type_ids': token_type_ids,
                          'labels': targets, 'position': position}
                if self.sentence_piece:
                    sentence_batch = data['sentence_batch'].to(device, dtype=torch.long)
                    inputs = {'input_ids': input_ids, 'attention_mask': mask, 'token_type_ids': token_type_ids,
                              'labels': targets, 'sentence_batch': sentence_batch, 'position': position}

                outputs = model(inputs)
                output = outputs[1]
                loss = outputs[0]

                optimizer.zero_grad()
                #loss = loss_fn(output, targets)
                loss_all += loss.item()
                loss.backward()
                optimizer.step()
                scheduler.step()
                #print(optimizer.param_groups[0]['lr'])

            train_loss = loss_all / len(train_loader)
            val_loss, val_acc, val_precision, val_recall, val_f1 = model_eval(
                model, device, val_loader, task_type=self.task_type, sentence_piece=self.sentence_piece)

            if wandb_log is True:
                wandb.log({'train_loss': train_loss, 'val_loss': val_loss, 'val_acc': val_acc,
                           'val_precision': val_precision, 'val_recall': val_recall, 'val_f1': val_f1})

            # keep the checkpoint with the best validation F1 seen so far
            if best_val_f1 is None or val_f1 >= best_val_f1:
                best_epoch = epoch + 1
                best_val_f1 = val_f1
                checkpoint_save(model, val_f1, wandb_name=wandb_name)

            print('Epoch: {:03d}, Train Loss: {:.7f}, Val Loss: {:.7f}, Val Acc: {:.7f}, '
                  'Val Precision: {:.7f}, Val Recall: {:.7f}, Val F1: {:.7f}'.format(
                      epoch + 1, train_loss, val_loss, val_acc, val_precision, val_recall, val_f1))

            early_stopping(val_f1)
            if early_stopping.early_stop:
                print("Early stopping")
                break

        if wandb_log is True:
            # only finish a run that was actually started
            wandb.finish()

    def prediction(self, selected_model=None, batch_size=8):
        #os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        set_seed(self.random_state)
        torch.set_num_threads(10)
        task_type = self.task_type

        '''data loading'''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer,
                                                  max_length=self.max_length, padding=self.padding,
                                                  random_state=self.random_state, data_cut=None,
                                                  sentence_piece=self.sentence_piece)

        '''loader making'''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))

        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.load_state_dict(torch.load(selected_model))
        model.to(device)

        ''' prediction '''
        print('start trainset prediction')
        train_results = model_eval(model, device, train_loader, task_type=self.task_type,
                                   return_values=True, sentence_piece=self.sentence_piece)
        print('start evalset prediction')
        eval_results = model_eval(model, device, val_loader, task_type=self.task_type,
                                  return_values=True, sentence_piece=self.sentence_piece)

        print('train result: acc:{0} | precision:{1} | recall:{2} | f1:{3}'.format(
            train_results[1], train_results[2], train_results[3], train_results[4]))
        print('eval result: acc:{0} | precision:{1} | recall:{2} | f1:{3}'.format(
            eval_results[1], eval_results[2], eval_results[3], eval_results[4]))

        total_text = train_results[7] + eval_results[7]
        total_out = train_results[6] + eval_results[6]
        total_target = train_results[5] + eval_results[5]
        if self.task_type == 'onehot':
            total_out = [i.argmax() for i in total_out]
            total_target = [i.argmax() for i in total_target]
        total_data = {'text': total_text, 'label': total_target, 'predict': total_out}
        total_df = pd.DataFrame(total_data)

        ''' result return '''
        return total_df

    def get_embedding(self, selected_model=None, batch_size=8, return_hidden=True, return_hidden_pretrained=False):
        #os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        #device = torch.device('cuda:{0}'.format(int(self.gpu_num)))
        #torch.cuda.set_device(device)
        set_seed(self.random_state)
        torch.set_num_threads(10)
        task_type = self.task_type

        '''data loading'''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer,
                                                  max_length=self.max_length, padding=self.padding,
                                                  random_state=self.random_state, data_cut=None,
                                                  sentence_piece=self.sentence_piece)

        '''loader making'''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))

        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.return_hidden = return_hidden
        model.return_hidden_pretrained = return_hidden_pretrained
        if selected_model is not None:
            model.load_state_dict(torch.load(selected_model))
        model.to(device)

        ''' get hidden '''
        print('start make hidden states (trainset)')
        train_hiddens, train_targets = get_hidden(model, device, train_loader, task_type=self.task_type,
                                                  sentence_piece=self.sentence_piece)
        print('start make hidden states (evalset)')
        eval_hiddens, eval_targets = get_hidden(model, device, val_loader, task_type=self.task_type,
                                                sentence_piece=self.sentence_piece)

        total_hiddens = np.array(train_hiddens + eval_hiddens)
        total_targets = np.array(train_targets + eval_targets)

        return total_hiddens, total_targets

    def label_extraction(self, paragraphs, positions, selected_model=None, batch_size=16):
        label_dict = {'Abstract': 0, 'Introduction': 1, 'Main': 2, 'Methods': 3, 'Summary': 4, 'Captions': 5}
        #os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        set_seed(self.random_state)
        torch.set_num_threads(10)

        ''' data to list '''
        is_list = True
        if not isinstance(paragraphs, list):
            paragraphs = [paragraphs]
            is_list = False
        if not isinstance(positions, list):
            positions = [positions]
            is_list = False

        '''data encoding'''
        dataset = make_extract_dataset(paragraphs, positions, tokenizer=self.tokenizer, max_length=self.max_length)

        '''loader making'''
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.load_state_dict(torch.load(selected_model))
        model.to(device)

        ''' prediction '''
        model.eval()
        predicts = []
        with torch.no_grad():
            for batch in tqdm(data_loader):
                inputs = {}
                inputs['input_ids'] = batch['input_ids'].to(device)
                inputs['attention_mask'] = batch['attention_mask'].to(device)
                inputs['token_type_ids'] = batch['token_type_ids'].to(device)
                inputs['position'] = batch['position']

                outputs = model(inputs)
                logits = outputs[1]
                logits = logits.detach().cpu().numpy()
                logits = logits.argmax(axis=1).flatten()
                logits = logits.tolist()
                predicts.extend(logits)

        # map numeric predictions back to section-label names
        predicts = [list(label_dict.keys())[list(label_dict.values()).index(i)] for i in predicts]
        if not is_list:
            predicts = predicts[0]

        return predicts
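
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the model name, data file, and checkpoint
# path below are placeholder assumptions, not values shipped with this module.
# Because the module uses relative imports, it is normally imported from its
# parent package rather than executed directly as a script.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    clf = NLP_classification(model_name='bert-base-uncased',      # any Hugging Face encoder checkpoint
                             data_file='data/sections.csv',       # assumed CSV consumed by make_dataset
                             max_length=512,
                             task_type='onehot',
                             sentence_piece=True)

    # Fine-tune; checkpoint_save writes the best-F1 checkpoint during training.
    clf.training(epochs=50, batch_size=4, lr=1e-5, early_stop_count=10)

    # Re-score the train/validation splits with a saved checkpoint.
    result_df = clf.prediction(selected_model='checkpoint.pt', batch_size=8)
    print(result_df.head())

    # Map new paragraphs to one of the six section labels
    # (Abstract, Introduction, Main, Methods, Summary, Captions).
    labels = clf.label_extraction(['Example paragraph text ...'],
                                  [0],                             # position value(s) in whatever form make_extract_dataset expects
                                  selected_model='checkpoint.pt')
    print(labels)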