finrad_model / README.md

Update README.md

0b585fd over 2 years ago

5.71 kB

	---
	license: mit
	---
	How to load the model and generate predictions?

	```python
	import pandas as pd
	import torch
	import transformers
	from torch.utils.data import Dataset, DataLoader
	from transformers import RobertaModel, RobertaTokenizer, BertModel, BertTokenizer

	# Run on the GPU when one is available, otherwise fall back to the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	MAX_LEN = 128  # token length each sentence is padded/truncated to
	BATCH_SIZE = 20  # number of sentences scored per forward pass
	text_col_name = 'sentence'  # dataframe column holding the text to score
	category_col = 'label_text'  # name of the label column (Triage's category_col argument)

	# Input: a dataframe (test_df) with one column whose header is 'sentence'.
	# Call reset_index() first if its index is not 0..n-1.
	test_df = pd.DataFrame({text_col_name: ['a general increase in prices and fall in the purchasing value of money.']})

	def scoring_data_prep(dataset):
	    """Stack every record's token ids and attention mask into two tensors.

	    Each record's 'ids' and 'mask' tensors are reshaped to rows of MAX_LEN
	    columns, concatenated along dim 0, and moved to `device` as torch.long.

	    Returns:
	        Tuple (ids, masks) of the two stacked tensors.
	    """
	    # Fetch each record once by index and reuse it for both fields.
	    records = [dataset[idx] for idx in range(len(dataset))]
	    id_rows = [record['ids'].reshape(-1, MAX_LEN) for record in records]
	    mask_rows = [record['mask'].reshape(-1, MAX_LEN) for record in records]

	    ids = torch.cat(id_rows, dim=0).to(device, dtype=torch.long)
	    masks = torch.cat(mask_rows, dim=0).to(device, dtype=torch.long)
	    return ids, masks


	class Triage(Dataset):
	    """
	    Torch Dataset that tokenizes one dataframe row per item.

	    Each item is a dict with "ids" and "mask" tensors. A "targets" tensor is
	    added only when a label column is available, so the same class serves
	    both training and label-free scoring.
	    """

	    def __init__(self, dataframe, tokenizer, max_len, text_col_name, category_col=None):
	        """
	        Args:
	            dataframe: pandas dataframe holding the text (and optionally labels).
	            tokenizer: object exposing `encode_plus`.
	            max_len: length every sentence is padded/truncated to.
	            text_col_name: name of the column holding the text.
	            category_col: name of the label column, or None when scoring
	                unlabeled data.
	        """
	        self.len = len(dataframe)
	        self.data = dataframe
	        self.tokenizer = tokenizer
	        self.max_len = max_len
	        self.text_col_name = text_col_name
	        self.category_col = category_col

	    def __getitem__(self, index):
	        """Return the encoded row at position `index` as a dict of tensors."""
	        # .iloc is positional, so a dataframe whose index is not 0..n-1 still works.
	        title = str(self.data[self.text_col_name].iloc[index])
	        title = " ".join(title.split())  # collapse runs of whitespace
	        inputs = self.tokenizer.encode_plus(
	            title,
	            None,
	            add_special_tokens=True,
	            max_length=self.max_len,
	            # Replaces the deprecated pad_to_max_length=True.
	            padding="max_length",
	            return_token_type_ids=True,
	            truncation=True,
	        )
	        item = {
	            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
	            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
	        }
	        # Labels are optional: skip them when the column is not configured or absent.
	        if self.category_col is not None and self.category_col in self.data.columns:
	            item["targets"] = torch.tensor(
	                self.data[self.category_col].iloc[index], dtype=torch.long
	            )
	        return item

	    def __len__(self):
	        """Return the number of rows in the dataframe."""
	        return self.len

	class BERTClass(torch.nn.Module):
	    """FinBERT encoder followed by a two-layer classification head."""

	    def __init__(self, num_class):
	        super().__init__()
	        self.num_class = num_class
	        # Attribute names double as state-dict keys, so they are kept unchanged.
	        self.l1 = BertModel.from_pretrained("ProsusAI/finbert")
	        self.pre_classifier = torch.nn.Linear(768, 768)
	        self.dropout = torch.nn.Dropout(0.3)
	        self.classifier = torch.nn.Linear(768, self.num_class)
	        self.history = {}

	    def forward(self, input_ids, attention_mask):
	        """Return one row of `num_class` scores per input sequence."""
	        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
	        # Take the first element of the encoder output and keep position 0
	        # of every sequence.
	        first_token = encoder_out[0][:, 0]
	        hidden = torch.relu(self.pre_classifier(first_token))
	        return self.classifier(self.dropout(hidden))

	def do_predict(tokenizer, model=None):
	    """Predict a class index for every row of the module-level `test_df`.

	    Args:
	        tokenizer: tokenizer handed to Triage to encode the sentences.
	        model: classifier to score with; when omitted, the module-level
	            `model_read` is used.

	    Returns:
	        List with the highest-scoring class index for each row.
	    """
	    if model is None:
	        model = model_read
	    # scoring_data_prep places the inputs on `device`, so the model must be there too.
	    model = model.to(device)

	    # Triage's constructor takes the label column name as its fifth argument.
	    test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name, category_col)
	    if len(test_set) == 0:
	        return []  # torch.cat cannot stack an empty list

	    out_stack, mask_stack = scoring_data_prep(dataset=test_set)
	    batch_outputs = []
	    model.eval()
	    with torch.no_grad():
	        # Score BATCH_SIZE rows at a time to bound memory use.
	        for start in range(0, len(test_set), BATCH_SIZE):
	            stop = start + BATCH_SIZE
	            batch_outputs.append(
	                model(out_stack[start:stop, :], mask_stack[start:stop, :])
	            )
	    combined_output = torch.cat(batch_outputs, dim=0)
	    # The predicted class is the column with the largest score in each row.
	    return torch.argmax(combined_output, dim=1).to('cpu').tolist()

	model_read = BERTClass(2)
	# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
	checkpoint = torch.load('pytorch_model.bin', map_location=device)
	model_read.load_state_dict(checkpoint['model_state_dict'])
	model_read.to(device)  # inputs are placed on `device`, so the weights must be too
	model = model_read  # also bound to the module-level name `model`

	tokenizer_read = BertTokenizer.from_pretrained('ProsusAI/finbert')
	actual_predictions_read = do_predict(tokenizer_read)

	# Class index 1 means readable; anything else is reported as not readable.
	test_df['readability'] = ['readable' if i == 1 else 'not_readable' for i in actual_predictions_read]

	```



	```bibtex
	@InProceedings{ghosh-EtAl:2022:FNP,
	author = {Ghosh, Sohom and Sengupta, Shovon and Naskar, Sudip and Singh, Sunny Kumar},
	title = {FinRAD: Financial Readability Assessment Dataset - 13,000+ Definitions of Financial Terms for Measuring Readability},
	booktitle = {Proceedings of the 4th Financial Narrative Processing Workshop @LREC2022},
	month = {June},
	year = {2022},
	address = {Marseille, France},
	publisher = {European Language Resources Association},
	pages = {1--9},
	url = {http://www.lrec-conf.org/proceedings/lrec2022/workshops/FNP/pdf/2022.fnp-1.1.pdf}
	}
	```

	```bibtex
	@InProceedings{ghosh-2021-finread,
	title = "FinRead: A Transfer Learning Based Tool to Assess Readability of Definitions of Financial Terms",
	author = "Sohom Ghosh and Shovon Sengupta and Sudip Kumar Naskar and Sunny Kumar Singh",
	booktitle = "Proceedings of the 18th International Conference on Natural Language Processing (ICON): System Demonstrations",
	month = "dec",
	year = "2021",
	publisher = "NLP Association of India (NLPAI)",
	url = "forthcoming",
	intype = {to appear in},
	pre-print = "https://easychair.org/publications/preprint/1wvS"
	}
	```