Spaces:
Sleeping
Sleeping
from nltk.tokenize import sent_tokenize | |
import pandas as pd | |
###################### | |
# Prerequisites:
# 1. pip install transformers
# 2. Define tokenizer + MAX_LEN
# 3. Construct DistillBERTClass_SL class
# 4. Construct Triage_SL class
# 5. Define predict_SL function
# 6. Load model_SL & call eval()
# 7. Predefine predict_params_SL
#################### | |
from transformers import DistilBertTokenizer | |
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') | |
import torch | |
"""### DataSet Class -- Triage_SL""" | |
from torch.utils.data import Dataset, DataLoader | |
class Triage_SL(Dataset):
    """Sentence-level dataset that tokenizes one dataframe row per item.

    Each item is the encoded form of one entry of the dataframe's
    ``sentence`` column, padded or truncated to ``max_len`` tokens.

    Args:
        dataframe: pandas DataFrame with a ``sentence`` column.
        tokenizer: Object exposing ``encode_plus`` whose result contains
            ``input_ids`` and ``attention_mask``.
        max_len: Length every encoded sentence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer  # used in __getitem__
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sentence stored at position *index*.

        Returns:
            A dict with ``'ids'`` (token ids) and ``'mask'`` (attention mask),
            both ``torch.long`` tensors. Only model inputs are returned; no
            target/label tensor is included.

        Raises:
            IndexError: If *index* is past the end of the dataset.
        """
        # IndexError (not StopIteration) is what the sequence protocol expects
        # for an out-of-range index; it also ends ``for item in dataset`` loops.
        if index >= len(self):
            raise IndexError(
                f"index {index} is out of range for dataset of size {len(self)}"
            )

        # Positional lookup: a label lookup would fail (or pick the wrong row)
        # when the dataframe index is not the default 0..n-1 range.
        sent = str(self.data.sentence.iloc[index])
        # Collapse every run of whitespace into a single space.
        sent = " ".join(sent.split())

        # Tokenize, add the special tokens, map tokens to ids, pad/truncate to
        # a fixed length and build the attention mask that separates real
        # tokens from padding.
        inputs = self.tokenizer.encode_plus(
            sent,                      # sentence to encode
            None,                      # text_pair
            add_special_tokens=True,   # add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            padding='max_length',      # replaces deprecated pad_to_max_length
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return self.len
# Read in an essay and return a sentence-level DataFrame.
def essay_to_sent_df(essay):
    """Split *essay* into sentences, one per row of a ``sentence`` column.

    The essay is cut on newlines, empty lines are dropped, and every
    remaining line is handed to ``sent_tokenize``; the sentences are kept
    in the order they appear in the text.
    """
    sentences = [
        sentence
        for para in essay.split('\n')
        if para  # skip empty lines
        for sentence in sent_tokenize(para)
    ]
    return pd.DataFrame(sentences, columns=['sentence'])
# Key settings used later on when predicting.
# MAX_LEN: number of tokens every sentence is padded or truncated to.
MAX_LEN = 512

"""### Predefine predict_params_SL"""
# Keyword arguments handed to the prediction DataLoader: one sentence per
# batch, original order kept, no worker processes.
PREDICT_BATCH_SIZE = 1
predict_params_SL = dict(
    batch_size=PREDICT_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

"""### Predict Fn -- predict_SL"""
# Module-level sigmoid used to turn logits into probabilities.
sigmoid = torch.nn.Sigmoid()
def predict_SL(model, validation_loader):
    """Run *model* over every batch and return one binary label per sentence.

    Args:
        model: Callable module invoked as ``model(ids, mask)`` whose result
            is indexable by ``"logits"``. Assumed to emit a single logit per
            sentence -- TODO confirm against the model definition.
        validation_loader: Iterable of batches, each a mapping with ``'ids'``
            and ``'mask'`` tensors.

    Returns:
        A flat list of floats (``1.0`` where sigmoid(logit) > 0.5, otherwise
        ``0.0``), in the order the sentences were yielded by the loader.
    """
    cpu_device = 'cpu'
    epoch_val_outputs = []

    model.eval()
    with torch.no_grad():
        for data in validation_loader:
            ids = data['ids'].to(cpu_device, dtype=torch.long)
            mask = data['mask'].to(cpu_device, dtype=torch.long)

            logits = model(ids, mask)["logits"]
            # Flatten rather than squeeze() + item(): item() only works on a
            # one-element tensor, so the previous form raised for any batch
            # holding more than one sentence.
            labels = (torch.sigmoid(logits) > 0.5).float().reshape(-1)
            epoch_val_outputs.extend(labels.tolist())

    return epoch_val_outputs
def predict_mainidea_sent_old(paragraph, model):
    """Label every sentence of *paragraph* with the model's prediction.

    The text is split into sentences, wrapped in a ``Triage_SL`` dataset and
    a ``DataLoader`` configured by ``predict_params_SL``, and run through
    ``predict_SL`` on the CPU. The raw label list is printed before the
    result is returned.

    Returns:
        A DataFrame with a ``label`` column (each prediction converted with
        ``str``) and a ``sentence`` column, one row per sentence.
    """
    # Sentence-level frame -> dataset -> loader.
    sentence_frame = essay_to_sent_df(paragraph)
    loader = DataLoader(
        Triage_SL(sentence_frame, tokenizer, MAX_LEN),
        **predict_params_SL,
    )

    # Inference runs on the CPU.
    model.to('cpu')

    sent_label = predict_SL(model, loader)
    print(sent_label)

    # Pair each stringified label with its sentence.
    rows = list(zip(map(str, sent_label), sentence_frame.sentence))
    return pd.DataFrame(rows, columns=['label', 'sentence'])