---
license: mit
---

How do I load the model and generate predictions?

Download the `pytorch_model.bin` file from this repository and run the script below.
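If you prefer to fetch the checkpoint programmatically, here is a minimal sketch using `huggingface_hub`; the `repo_id` below is a placeholder, not the actual repository name, so substitute the repository that hosts this model card. The rest of the example assumes the file sits in the working directory as `pytorch_model.bin`.

```python
# Optional: fetch the checkpoint with huggingface_hub instead of downloading it manually.
# NOTE: the repo_id is a placeholder assumption; replace it with this model's repository id.
from huggingface_hub import hf_hub_download

checkpoint_path = hf_hub_download(
    repo_id="<user>/<model-repo>",   # placeholder
    filename="pytorch_model.bin",
)
print(checkpoint_path)  # local path to the downloaded checkpoint
```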
```python
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LEN = 128
BATCH_SIZE = 20
text_col_name = 'sentence'
category_col = 'label_text'  # label column name; not used at inference time

# Input: a single DataFrame with one text column named 'sentence'
# (call reset_index(drop=True) first if the index is not contiguous).
test_df = pd.DataFrame({"sentence": ['a general increase in prices and fall in the purchasing value of money.']})

def scoring_data_prep(dataset):
    """Stack the ids and attention masks of every record into two tensors ready for scoring."""
    out = []
    mask = []

    for i in range(len(dataset)):
        rec = dataset[i]
        out.append(rec['ids'].reshape(-1, MAX_LEN))
        mask.append(rec['mask'].reshape(-1, MAX_LEN))

    out_stack = torch.cat(out, dim=0)
    mask_stack = torch.cat(mask, dim=0)
    out_stack = out_stack.to(device, dtype=torch.long)
    mask_stack = mask_stack.to(device, dtype=torch.long)

    return out_stack, mask_stack

class Triage(Dataset):
    """
    Subclass of torch.utils.data.Dataset. It tokenizes each sentence and returns the
    input ids and attention mask needed for inference.
    """

    def __init__(self, dataframe, tokenizer, max_len, text_col_name):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col_name = text_col_name

    def __getitem__(self, index):
        title = str(self.data[self.text_col_name][index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
        }

    def __len__(self):
        return self.len

class BERTClass(torch.nn.Module):
    """FinBERT encoder followed by a linear classification head on the [CLS] token."""

    def __init__(self, num_class):
        super(BERTClass, self).__init__()
        self.num_class = num_class
        self.l1 = BertModel.from_pretrained("ProsusAI/finbert")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, self.num_class)
        self.history = dict()

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]          # representation of the [CLS] token
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

def do_predict(model, tokenizer, test_df):
    test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
    out_stack, mask_stack = scoring_data_prep(dataset=test_set)
    n = 0
    combined_output = []
    model.eval()
    with torch.no_grad():
        while n < test_df.shape[0]:
            output = model(out_stack[n:n + BATCH_SIZE, :], mask_stack[n:n + BATCH_SIZE, :])
            n = n + BATCH_SIZE
            combined_output.append(output)
        combined_output = torch.cat(combined_output, dim=0)
        preds = torch.argsort(combined_output, dim=1, descending=True)
    preds = preds.to('cpu')
    actual_predictions = [i[0] for i in preds.tolist()]  # index of the highest-scoring class per row
    return actual_predictions

model_read = BERTClass(2)
model_read.to(device)
# The checkpoint stores the weights under the 'model_state_dict' key.
model_read.load_state_dict(torch.load('pytorch_model.bin', map_location=device)['model_state_dict'])

tokenizer_read = BertTokenizer.from_pretrained('ProsusAI/finbert')
actual_predictions_read = do_predict(model_read, tokenizer_read, test_df)

test_df['readability'] = ['readable' if i == 1 else 'not_readable' for i in actual_predictions_read]
```
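After the script runs, `test_df` carries a `readability` column with one label per input sentence. A quick way to inspect the result (the label shown for any given sentence depends on the model's prediction):

```python
# Show each sentence next to its predicted readability label.
print(test_df[['sentence', 'readability']])
```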
If you use this model, please cite the following works:

```bibtex
@InProceedings{ghosh-EtAl:2022:FNP,
  author    = {Ghosh, Sohom and Sengupta, Shovon and Naskar, Sudip and Singh, Sunny Kumar},
  title     = {FinRAD: Financial Readability Assessment Dataset - 13,000+ Definitions of Financial Terms for Measuring Readability},
  booktitle = {Proceedings of the 4th Financial Narrative Processing Workshop @LREC2022},
  month     = {June},
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  pages     = {1--9},
  url       = {http://www.lrec-conf.org/proceedings/lrec2022/workshops/FNP/pdf/2022.fnp-1.1.pdf}
}
```

```bibtex
@InProceedings{ghosh-2021-finread,
  author    = {Ghosh, Sohom and Sengupta, Shovon and Naskar, Sudip Kumar and Singh, Sunny Kumar},
  title     = {FinRead: A Transfer Learning Based Tool to Assess Readability of Definitions of Financial Terms},
  booktitle = {Proceedings of the 18th International Conference on Natural Language Processing (ICON): System Demonstrations},
  month     = {dec},
  year      = {2021},
  publisher = {NLP Association of India (NLPAI)},
  url       = {forthcoming},
  intype    = {to appear in},
  pre-print = {https://easychair.org/publications/preprint/1wvS}
}
```