|
import ast
import math
import pickle
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer, AutoModel
|
|
|
# Load the classifier that is applied to the [NUM] token embedding.
# NOTE(security): pickle.load runs arbitrary code embedded in the file, so
# "MLP_over_embeddings.pickle" must only ever come from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open("MLP_over_embeddings.pickle", "rb") as mlp_file:
    mlp = pickle.load(mlp_file)

# Tokenizer and encoder used to embed the text containing the [NUM] placeholder.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/sec-bert-num")
model = AutoModel.from_pretrained('nlpaueb/sec-bert-num')
|
|
|
"""# Input here"""
|
|
|
def convert_actual_to_num(text, number, offset):
    """Replace one numeral occurrence in *text* with the " [NUM] " placeholder.

    Args:
        text: The full input text.
        number: The numeral to mask; only the length of ``str(number)`` is
            used to decide how many characters are removed.
        offset: Start index of the numeral in *text* (int or numeric string).

    Returns:
        The text with ``len(str(number))`` characters starting at *offset*
        replaced by ``" [NUM] "``.
    """
    start = int(offset)
    end = start + len(str(number))
    return text[:start] + " [NUM] " + text[end:]
|
|
|
def num_detector_highlighter_adv(text):
    """Split *text* into word chunks and numerals, tagging numerals by position.

    The text is scanned one character at a time with a two-character
    look-ahead. Runs of digits, optionally containing "." followed by a
    digit, are collected as one numeral; every other non-space run is
    collected as a plain chunk.

    Args:
        text: The input string to scan.

    Returns:
        A list of ``(chunk, label)`` tuples in reading order. Plain chunks
        carry the label ``""``; numerals carry ``"@POSITION <start index>"``,
        where the index is the numeral's first character in the original
        text.
    """
    num_posn = []
    posn = -1      # start index of the numeral being built, -1 when none
    num = ""       # numeral characters collected so far
    others = ""    # non-numeral characters collected so far

    # Two characters of padding are required: the loop below stops at
    # len(text) - 2 and looks ahead up to i + 2. With less padding the last
    # character is never scanned and a numeral ending the text is lost.
    text = text + "  "

    for i in range(len(text) - 2):
        ch, nxt, nxt2 = text[i], text[i + 1], text[i + 2]

        if ch.isdigit() and (nxt.isdigit() or (nxt == "." and nxt2.isdigit())):
            # A digit with more of the same numeral still to come.
            num = num + ch
            if posn == -1:
                posn = i
            if others != "":
                num_posn.append((others, ""))
                others = ""
        elif ch.isdigit():
            # The final digit of a numeral: emit it and reset the state.
            num = num + ch
            if posn == -1:
                posn = i
            if others != "":
                num_posn.append((others, ""))
                others = ""
            num_posn.append((str(num), "@POSITION " + str(posn)))
            num = ""
            posn = -1
        elif ch == ".":
            if nxt.isdigit():
                # A decimal point inside (or starting) a numeral. When it
                # starts the numeral, e.g. ".5", the numeral begins here and
                # not at the following digit.
                num = num + ch
                if posn == -1:
                    posn = i
            else:
                others = others + ch
        elif ch != " ":
            others = others + ch
        else:
            # A space ends the current plain chunk, if there is one.
            if others != "":
                num_posn.append((others, ""))
                others = ""

    if others != "":
        num_posn.append((others, ""))

    return num_posn
|
|
|
def _calculated_magnitude(number_text):
    """Return the magnitude class of a numeral string, capped at 6."""
    value = float(number_text)
    if value == 0:
        # log10(0) is undefined and would raise ValueError. This also covers
        # numerals truncated to "0.0" by the one-decimal processing step.
        # NOTE(review): 1 matches what 0.1-9.9 produce; confirm this is the
        # intended class for zero.
        return 1
    return min(6, int(math.log10(value)) + 1)


def exnum_evaluator(df):
    """Label each numeral in *df* as "Exaggerated" or "Non-Exaggerated".

    For every row the numeral is replaced by the [NUM] placeholder, the text
    is encoded with the module-level ``model``, and the hidden state at the
    [NUM] token is passed to the module-level ``mlp``. The prediction is
    compared with the magnitude computed from the numeral itself; a mismatch
    is labelled "Exaggerated".

    Args:
        df: DataFrame with the columns ``text``, ``number`` and ``position``.
            It is not modified.

    Returns:
        A DataFrame with the columns ``number``, ``position`` and
        ``prediction``, one row per input row.
    """
    if len(df) == 0:
        # Row-wise apply on an empty frame returns a DataFrame, which cannot
        # be assigned to a single column; return the empty result directly.
        empty = df[["number", "position"]].copy()
        empty["prediction"] = pd.Series(dtype="object")
        return empty

    # Work on a copy so a filtered slice passed by the caller is not mutated.
    df = df.copy()
    df["preprocessed_text"] = df.apply(
        lambda x: convert_actual_to_num(x.text, x.number, x.position), axis=1
    )
    # Keep at most one digit after the first decimal point.
    df["number_processed"] = df["number"].apply(
        lambda x: str(x)[0:str(x).index(".") + 2] if "." in str(x) else str(x)
    )

    model.eval()  # loop-invariant, so set once
    all_preds = []
    for preprocessed_text in df["preprocessed_text"].values:
        tokenized_text = tokenizer.tokenize(preprocessed_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        index = tokenized_text.index('[NUM]')
        tokens_tensor = torch.tensor([indexed_tokens])

        with torch.no_grad():
            last_hidden_states = model(tokens_tensor)[0]

        # Hidden state at the [NUM] token for the single sequence in the batch.
        embedding_of_num = last_hidden_states[:, index, :]
        embedding_of_num_use = list(embedding_of_num[0].cpu().detach().numpy())
        pred = mlp.predict([embedding_of_num_use])[0]
        all_preds.append(pred)

    df["pred"] = all_preds
    df["calculated_magnitude"] = df["number_processed"].apply(_calculated_magnitude)
    df["prediction"] = np.where(
        (df["calculated_magnitude"] != df["pred"]), "Exaggerated", "Non-Exaggerated"
    )
    return df[["number", "position", "prediction"]]
|
|
|
def change_checkbox_group(text2):
    """Build the checkbox choices from the stringified detector output.

    Args:
        text2: String form of the list of ``(chunk, label)`` tuples produced
            by the numeral detector. An empty string yields no choices.

    Returns:
        A ``gr.CheckboxGroup.update`` that shows one pre-selected choice per
        numeral, formatted as ``"<number> <label>"``.
    """
    # SECURITY: text2 is textbox content and therefore untrusted input, so it
    # is parsed with ast.literal_eval (literals only) instead of eval, which
    # would execute arbitrary expressions.
    parsed = ast.literal_eval(text2) if text2 and text2.strip() else []
    num_posn_inp = [(num, posn) for (num, posn) in parsed if posn != ""]
    num_posn_dislay = [str(num) + " " + str(posn) for (num, posn) in num_posn_inp]
    return gr.CheckboxGroup.update(
        choices=num_posn_dislay,
        label="Numerals",
        visible=True,
        value=num_posn_dislay,
    )
|
|
|
def combined_fns(text, text2, choices=None):
    """Run the exaggeration classifier on the numerals detected in *text*.

    Args:
        text: The original input text.
        text2: String form of the list of ``(chunk, label)`` tuples produced
            by the numeral detector. An empty string means no numerals.
        choices: Optional collection of ``"<number> <label>"`` strings. When
            non-empty, only the matching numerals are evaluated; otherwise
            all detected numerals are.

    Returns:
        The DataFrame returned by ``exnum_evaluator``.
    """
    # SECURITY: text2 is textbox content and therefore untrusted input, so it
    # is parsed with ast.literal_eval (literals only) instead of eval, which
    # would execute arbitrary expressions.
    parsed = ast.literal_eval(text2) if text2 and text2.strip() else []
    num_posn_inp = [(num, posn) for (num, posn) in parsed if posn != ""]

    df = pd.DataFrame({
        "text": [text] * len(num_posn_inp),
        "number": [num for (num, _) in num_posn_inp],
        "position": [posn.replace("@POSITION ", "") for (_, posn) in num_posn_inp],
    })
    df['num_position'] = [str(num) + " " + str(posn) for (num, posn) in num_posn_inp]

    # None and an empty selection both mean "evaluate every numeral".
    if choices:
        df = df[df['num_position'].isin(choices)]
    return exnum_evaluator(df)
|
|
|
|
|
def set_example_text(example_text):
    """Return a textbox update holding the first element of the clicked sample.

    Args:
        example_text: The selected sample; only its first element is used.

    Returns:
        A ``gr.Textbox.update`` whose value is ``example_text[0]``.
    """
    return gr.Textbox.update(value=example_text[0])
|
|
|
# Gradio user interface.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from statement order because the original indentation was lost; confirm
# which widgets belong to which Row/Column/Tab.
demo = gr.Blocks(theme=gr.themes.Soft())

with demo:
    gr.Markdown("# **Financial Exaggerated Numeral ClassifiEr (FENCE)**")
    with gr.Row():
        with gr.Column():
            text = gr.components.Textbox(label="Enter financial text here", lines=2, placeholder="Enter Financial Text here...")
            b1 = gr.Button("Get numerals present in the entered text")
            # The same detector is wired twice: once for the highlighted view
            # and once for a hidden textbox that carries the result (as a
            # string) to the prediction callbacks.
            b1.click(num_detector_highlighter_adv, inputs = text, outputs = gr.HighlightedText(label='Numerals present in the text'))
            text2 = gr.components.Textbox(visible=False)
            b1.click(num_detector_highlighter_adv, inputs = text, outputs =text2)
    with gr.Row():
        with gr.Tabs():
            with gr.TabItem("All numerals"):
                b2 = gr.Button("Predict for all numerals")
                b2.click(combined_fns, inputs = [text, text2], outputs = gr.DataFrame())
            with gr.TabItem("Specific numerals"):
                b3 = gr.Button("Get option to select numerals")
                num_posn_inp_ckbx = gr.CheckboxGroup(choices = [], interactive=True, label='Specific Numerals')
                b3.click(change_checkbox_group, inputs=text2, outputs=num_posn_inp_ckbx)
                b4 = gr.Button("Predict for specific numerals")
                b4.click(combined_fns, inputs = [text, text2, num_posn_inp_ckbx], outputs = gr.DataFrame())
    # Clicking a sample copies it into the input textbox.
    example_text = gr.Dataset(components=[text], samples=[["Get 30% off Gap denim whilst recycling your old denim for communities in need"], [" Matthew Perry puts Malibu mansion on the market for $13.5 million"], ["Anton Art Center in Mt. Clemens hosts 19th Annual ArtParty Fundraiser - Twilight in the Tropics"], ["Black Friday Sales! - Vegas hotel packages 50% savings from Southwest Vacations"]])
    example_text.click(fn=set_example_text,
                       inputs=example_text,
                       outputs=example_text.components)
    gr.Markdown("<sub><sup>How to use? [link](https://github.com/sohomghosh/FENCE_Financial_Exaggerated_Numeral_ClassifiEr/blob/main/README.md), Warning: User discretion is advised.</sup></sub>")

demo.launch()