File size: 5,653 Bytes
5a15b3b 0fd8ca6 efe7d95 0fd8ca6 0b585fd 0fd8ca6 ee4af77 0fd8ca6 ee4af77 0fd8ca6 ee4af77 0fd8ca6 9f31d25 0fd8ca6 17ef62c 0fd8ca6 54ef9da 0fd8ca6 9f31d25 0fd8ca6 8ccc59a 989fadb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
---
license: mit
---
How to load the model and generate predictions?
```python
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, BertModel, BertTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 128
BATCH_SIZE = 20
text_col_name = 'sentence'
category_col = 'label_text'
#Input should be one dataframe having one column with header as 'sentence' : test_df (do reset_index() if needed)
test_df = pd.DataFrame({"sentence":['a general increase in prices and fall in the purchasing value of money.']})
def scoring_data_prep(dataset):
out = []
target = []
mask = []
for i in range(len(dataset)):
rec = dataset[i]
out.append(rec['ids'].reshape(-1,MAX_LEN))
mask.append(rec['mask'].reshape(-1,MAX_LEN))
out_stack = torch.cat(out, dim = 0)
mask_stack = torch.cat(mask, dim =0 )
out_stack = out_stack.to(device, dtype = torch.long)
mask_stack = mask_stack.to(device, dtype = torch.long)
return out_stack, mask_stack
class Triage(Dataset):
"""
This is a subclass of torch packages Dataset class. It processes input to create ids, masks and targets required for model training.
"""
def __init__(self, dataframe, tokenizer, max_len, text_col_name):
self.len = len(dataframe)
self.data = dataframe
self.tokenizer = tokenizer
self.max_len = max_len
self.text_col_name = text_col_name
def __getitem__(self, index):
title = str(self.data[self.text_col_name][index])
title = " ".join(title.split())
inputs = self.tokenizer.encode_plus(
title,
None,
add_special_tokens=True,
max_length=self.max_len,
pad_to_max_length=True,
return_token_type_ids=True,
truncation=True,
)
ids = inputs["input_ids"]
mask = inputs["attention_mask"]
return {
"ids": torch.tensor(ids, dtype=torch.long),
"mask": torch.tensor(mask, dtype=torch.long),
}
def __len__(self):
return self.len
class BERTClass(torch.nn.Module):
def __init__(self, num_class):
super(BERTClass, self).__init__()
self.num_class = num_class
self.l1 = BertModel.from_pretrained("ProsusAI/finbert")
self.pre_classifier = torch.nn.Linear(768, 768)
self.dropout = torch.nn.Dropout(0.3)
self.classifier = torch.nn.Linear(768, self.num_class)
self.history = dict()
def forward(self, input_ids, attention_mask):
output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
hidden_state = output_1[0]
pooler = hidden_state[:, 0]
pooler = self.pre_classifier(pooler)
pooler = torch.nn.ReLU()(pooler)
pooler = self.dropout(pooler)
output = self.classifier(pooler)
return output
def do_predict(model, tokenizer, test_df):
test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
test_params = {'batch_size' : BATCH_SIZE, 'shuffle': False, 'num_workers':0}
test_loader = DataLoader(test_set, **test_params)
out_stack, mask_stack = scoring_data_prep(dataset = test_set)
n = 0
combined_output = []
model.eval()
with torch.no_grad():
while n < test_df.shape[0]:
output = model(out_stack[n:n+BATCH_SIZE,:],mask_stack[n:n+BATCH_SIZE,:])
n = n + BATCH_SIZE
combined_output.append(output)
combined_output = torch.cat(combined_output, dim = 0)
preds = torch.argsort(combined_output, axis = 1, descending = True)
preds = preds.to('cpu')
actual_predictions = [i[0] for i in preds.tolist()]
return actual_predictions
model_read = BERTClass(2)
model_read.to(device)
model_read.load_stat_dict(torch.load('pytorch_model.bin', map_location=device)['model_state_dict'])
tokenizer_read = BertTokenizer.from_pretrained('ProsusAI/finbert')
actual_predictions_read = do_predict(model_read, tokenizer_read, test_df)
test_df['readability'] = ['readable' if i==1 else 'not_reabale' for i in actual_predictions_read]
```
```bibtex
@InProceedings{ghosh-EtAl:2022:FNP,
author = {Ghosh, Sohom and Sengupta, Shovon and Naskar, Sudip and Singh, Sunny Kumar},
title = {FinRAD: Financial Readability Assessment Dataset - 13,000+ Definitions of Financial Terms for Measuring Readability},
booktitle = {Proceedings of the The 4th Financial Narrative Processing Workshop @LREC2022},
month = {June},
year = {2022},
address = {Marseille, France},
publisher = {European Language Resources Association},
pages = {1--9},
url = {http://www.lrec-conf.org/proceedings/lrec2022/workshops/FNP/pdf/2022.fnp-1.1.pdf}
}
```
``bibtex
@InProceedings{ghosh-2021-finread,
title = "FinRead: A Transfer Learning Based Tool to Assess Readability of Definitions of Financial Terms",
author = "Sohom Ghosh, Shovon Sengupta, Sudip Kumar Naskar, Sunny Kumar Singh",
booktitle = "Proceedings of the 18th International Conference on Natural Language Processing (ICON) :
System Demonstrations",
month = "dec",
year = "2021",
publisher = "NLP Association of India (NLPAI)",
url = "forthcoming",
intype = {to appear in},
pre-print = "https://easychair.org/publications/preprint/1wvS"
}
``` |