sohomghosh committed on
Commit
dba2b44
1 Parent(s): 2c0c269

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import pickle
4
+
5
+ import gradio as gr
6
+
7
+ import torch
8
+
9
+ import math
10
+
11
+ from transformers import AutoTokenizer, AutoModel
12
+
13
+ import transformers
14
+
15
+ import re
16
+
17
# Load the pickled classifier whose ``predict`` method is applied to the
# embedding of the [NUM] token further down in this file. The ``with`` block
# closes the file handle as soon as unpickling is done.
# NOTE(security): unpickling can execute arbitrary code; only ship a pickle
# file that comes from a trusted source.
with open("MLP_over_embeddings.pickle", "rb") as mlp_file:
    mlp = pickle.load(mlp_file)

# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/sec-bert-num")
model = AutoModel.from_pretrained('nlpaueb/sec-bert-num')

# Input here
23
+
24
def convert_actual_to_num(text, number, offset, placeholder=" [NUM] "):
    """Replace one numeral occurrence in *text* with a placeholder token.

    Args:
        text: The sentence containing the numeral.
        number: The numeral as it appears in ``text``. Only the length of
            ``str(number)`` is used, to know how many characters to drop.
        offset: Character index in ``text`` where the numeral starts. Any
            value accepted by ``int()`` works (e.g. the string ``"4"``).
        placeholder: Text spliced in instead of the numeral. Defaults to
            ``" [NUM] "``, which is what this function always inserted before
            the parameter existed.

    Returns:
        ``text`` with the ``len(str(number))`` characters starting at
        ``offset`` replaced by ``placeholder``.
    """
    start = int(offset)
    end = start + len(str(number))
    return text[:start] + placeholder + text[end:]
29
+
30
def num_detector_highlighter_adv(text):
    """Split *text* into non-numeric chunks and numerals with their offsets.

    The text is scanned one character at a time. Runs of digits, optionally
    containing a "." that is directly followed by a digit, are collected as
    numerals. Everything else is collected into chunks that are cut at each
    space character (the spaces themselves are not kept).

    Args:
        text: The sentence to scan.

    Returns:
        A list of ``(chunk, label)`` tuples in reading order. ``label`` is
        ``""`` for non-numeric chunks and ``"@POSITION <index>"`` for
        numerals, where ``<index>`` is the 0-based offset in ``text`` of the
        numeral's first character (including a leading ".").
    """
    segments = []
    start = -1   # offset of the numeral being collected; -1 when there is none
    digits = ""  # characters of the numeral collected so far
    word = ""    # characters of the non-numeric chunk collected so far

    # Two padding spaces make the one- and two-character look-ahead valid for
    # every real character, so the final character of ``text`` is scanned too
    # (a single pad combined with a shortened range used to skip it).
    padded = text + "  "
    for i in range(len(text)):
        ch = padded[i]
        nxt = padded[i + 1]
        after = padded[i + 2]

        if ch.isdigit():
            digits += ch
            if start == -1:
                start = i
            # A numeral begins or continues here, so any pending chunk ends.
            if word:
                segments.append((word, ""))
                word = ""
            # The numeral goes on if another digit follows, or a decimal
            # point that is itself followed by a digit.
            continues = nxt.isdigit() or (nxt == "." and after.isdigit())
            if not continues:
                segments.append((digits, "@POSITION " + str(start)))
                digits = ""
                start = -1
        elif ch == ".":
            if nxt.isdigit():
                # Decimal point that belongs to a numeral. When it is the
                # first character (".5") it also marks where the numeral
                # starts, so the reported offset matches the numeral text.
                if start == -1:
                    start = i
                digits += ch
            else:
                word += ch
        elif ch != " ":
            word += ch
        else:
            # A space closes the pending chunk, if there is one.
            if word:
                segments.append((word, ""))
                word = ""

    if word:
        segments.append((word, ""))
    return segments
69
+
70
def _keep_one_decimal(number):
    """Return ``str(number)`` cut after the first digit following the "."."""
    as_text = str(number)
    if "." in as_text:
        return as_text[:as_text.index(".") + 2]
    return as_text


def _calculated_magnitude(number_text, cap=6):
    """Return ``int(log10(value)) + 1`` for the numeral, limited to ``cap``."""
    value = float(number_text)
    if value <= 0:
        # log10 is undefined here and used to raise, e.g. for "0" or for
        # "0.05" once it has been cut down to "0.0".
        # NOTE(review): 0 mirrors what the formula yields for 0.1; confirm it
        # matches the label set the classifier was trained on.
        return 0
    return min(cap, int(math.log10(value)) + 1)


def exnum_evaluator(df):
    """Label every numeral row as "Exaggerated" or "Non-Exaggerated".

    For each row the numeral is replaced by the ``[NUM]`` token, the text is
    tokenized and passed through ``model``, and the hidden vector at the
    ``[NUM]`` token position is given to ``mlp.predict``. That prediction is
    compared with the magnitude computed from the numeral itself; rows where
    the two differ are labelled "Exaggerated".

    Args:
        df: DataFrame with the columns ``text``, ``number`` and ``position``.
            It is not modified; the work happens on a copy.

    Returns:
        DataFrame with the columns ``number``, ``position`` and
        ``prediction``. Empty (same columns) when ``df`` has no rows.

    Raises:
        ValueError: If the ``[NUM]`` token is missing from the tokenized text.
    """
    result_columns = ["number", "position", "prediction"]
    if df.empty:
        # Nothing to classify; skip the row-wise apply and the model
        # altogether instead of running them on a frame without rows.
        return pd.DataFrame(columns=result_columns)

    # Work on a copy so the caller's frame (possibly a filtered slice) does
    # not receive the helper columns added below.
    df = df.copy()
    df['preprocessed_text'] = df.apply(
        lambda row: convert_actual_to_num(row.text, row.number, row.position),
        axis=1,
    )
    df['number_processed'] = df['number'].apply(_keep_one_decimal)

    model.eval()  # same for every row, so it is done once before the loop
    all_preds = []
    for preprocessed_text in df["preprocessed_text"].values:
        tokenized_text = tokenizer.tokenize(preprocessed_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        index = tokenized_text.index('[NUM]')
        tokens_tensor = torch.tensor([indexed_tokens])

        with torch.no_grad():
            last_hidden_states = model(tokens_tensor)[0]

        # Vector at the [NUM] token for the single sequence in the batch.
        embedding_of_num = last_hidden_states[:, index, :]
        embedding_of_num_use = list(embedding_of_num[0].cpu().detach().numpy())
        all_preds.append(mlp.predict([embedding_of_num_use])[0])

    df['pred'] = all_preds
    df['calculated_magnitude'] = df['number_processed'].apply(_calculated_magnitude)
    df["prediction"] = np.where(
        df['calculated_magnitude'] != df['pred'],
        "Exaggerated",
        "Non-Exaggerated",
    )
    return df[result_columns]
94
+
95
def change_checkbox_group(text2):
    """Build the checkbox update that lists every detected numeral.

    Args:
        text2: String form of the ``(chunk, label)`` list produced by
            ``num_detector_highlighter_adv``. An empty value is treated as
            "no numerals".

    Returns:
        A ``gr.CheckboxGroup.update`` that makes the group visible and offers
        (and pre-selects) one ``"<numeral> <label>"`` entry per numeral.
    """
    import ast  # local import: only needed to parse ``text2``

    # SECURITY: this value travels through a UI component, so it is parsed
    # with ast.literal_eval (literals only) rather than eval, which would run
    # whatever expression the string contains.
    parsed = ast.literal_eval(text2) if text2 and text2.strip() else []
    # Entries with an empty label are plain text chunks, not numerals.
    num_posn_inp = [(num, posn) for (num, posn) in parsed if posn != ""]
    num_posn_dislay = [str(num) + " " + str(posn) for (num, posn) in num_posn_inp]
    return gr.CheckboxGroup.update(
        choices=num_posn_dislay,
        label="Numerals",
        visible=True,
        value=num_posn_dislay,
    )
99
+
100
def combined_fns(text, text2, choices=None):
    """Run the exaggeration check for the numerals found in *text*.

    Args:
        text: The sentence entered by the user.
        text2: String form of the ``(chunk, label)`` list produced by
            ``num_detector_highlighter_adv``. An empty value is treated as
            "no numerals".
        choices: Optional list of ``"<numeral> <label>"`` entries. When it is
            non-empty only those numerals are evaluated; when it is omitted
            or empty every detected numeral is evaluated.

    Returns:
        The DataFrame returned by ``exnum_evaluator`` (columns ``number``,
        ``position``, ``prediction``), or an empty DataFrame with those
        columns when there is nothing to evaluate.
    """
    import ast  # local import: only needed to parse ``text2``

    # SECURITY: this value travels through a UI component, so it is parsed
    # with ast.literal_eval (literals only) rather than eval, which would run
    # whatever expression the string contains.
    parsed = ast.literal_eval(text2) if text2 and text2.strip() else []
    # Entries with an empty label are plain text chunks, not numerals.
    num_posn_inp = [(num, posn) for (num, posn) in parsed if posn != ""]

    df = pd.DataFrame({
        "text": [text] * len(num_posn_inp),
        "number": [num for (num, _) in num_posn_inp],
        "position": [posn.replace("@POSITION ", "") for (_, posn) in num_posn_inp],
    })
    df['num_position'] = [str(num) + " " + str(posn) for (num, posn) in num_posn_inp]

    if choices is not None and len(choices) > 0:
        # Copy so that later column assignments do not target a slice.
        df = df[df['num_position'].isin(choices)].copy()

    if df.empty:
        # No numerals (or none selected): return an empty result instead of
        # sending a frame without rows through the model pipeline.
        return pd.DataFrame(columns=["number", "position", "prediction"])
    return exnum_evaluator(df)
108
+
109
+ #examples
110
def set_example_text(example_text):
    """Return a textbox update that shows the first item of the clicked sample.

    Args:
        example_text: The selected sample; only its first element is used.

    Returns:
        A ``gr.Textbox.update`` carrying that element as the new value.
    """
    chosen_sentence = example_text[0]
    return gr.Textbox.update(value=chosen_sentence)
112
+
113
# Gradio UI wiring: builds the page and connects the buttons to the callbacks
# defined earlier in this file.
# NOTE(review): indentation is lost in this view, so the exact nesting of the
# `with` blocks below cannot be confirmed from here.
+ demo = gr.Blocks(theme=gr.themes.Soft())
114
+
115
+ with demo:
116
+ gr.Markdown("# **Financial Exaggerated Numeral ClassifiEr (FENCE)**")
117
+ with gr.Row():
118
+ with gr.Column():
119
+ text = gr.components.Textbox(label="Enter financial text here", lines=2, placeholder="Enter Financial Text here...")
120
+ b1 = gr.Button("Get numerals present in the entered text")
121
# First handler on b1: show the detector output as highlighted text.
+ b1.click(num_detector_highlighter_adv, inputs = text, outputs = gr.HighlightedText(label='Numerals present in the text'))
122
# Hidden textbox that receives the detector output and is passed as an input
# to the callbacks below.
+ text2 = gr.components.Textbox(visible=False)
123
# Second handler on b1: run the same detector again and store the result in text2.
+ b1.click(num_detector_highlighter_adv, inputs = text, outputs =text2)
124
+ with gr.Row():
125
+ with gr.Tabs():
126
+ with gr.TabItem("All numerals"):
127
+ b2 = gr.Button("Predict for all numerals")
128
# No checkbox input here, so combined_fns uses its default for `choices`.
+ b2.click(combined_fns, inputs = [text, text2], outputs = gr.DataFrame())
129
+ with gr.TabItem("Specific numerals"):
130
+ b3 = gr.Button("Get option to select numerals")
131
+ num_posn_inp_ckbx = gr.CheckboxGroup(choices = [], interactive=True, label='Specific Numerals')
132
# b3 fills the checkbox group from the content of text2.
+ b3.click(change_checkbox_group, inputs=text2, outputs=num_posn_inp_ckbx)
133
+ b4 = gr.Button("Predict for specific numerals")
134
# The checkbox selection is passed as the third argument (`choices`).
+ b4.click(combined_fns, inputs = [text, text2, num_posn_inp_ckbx], outputs = gr.DataFrame())
135
# Sample sentences; clicking one calls set_example_text to fill the input textbox.
+ example_text = gr.Dataset(components=[text], samples=[["Get 30% off Gap denim whilst recycling your old denim for communities in need"], [" Matthew Perry puts Malibu mansion on the market for $13.5 million"], ["Anton Art Center in Mt. Clemens hosts 19th Annual ArtParty Fundraiser - Twilight in the Tropics"], ["Black Friday Sales! - Vegas hotel packages 50% savings from Southwest Vacations"]])
136
+ example_text.click(fn=set_example_text,
137
+ inputs=example_text,
138
+ outputs=example_text.components)
139
+ gr.Markdown("<sub><sup>How to use? [link](https://github.com/sohomghosh/FENCE_Financial_Exaggerated_Numeral_ClassifiEr/blob/main/README.md), Warning: User discretion is advised.</sup></sub>")
140
# Start the app; share=True is passed to launch() as in the original.
+ demo.launch(share = True)