AhmedTaha012 committed on
Commit
174db76
1 Parent(s): e89f604

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -0
app.py CHANGED
@@ -4,9 +4,180 @@ import math
4
  sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")
5
  increase_decrease_model = pipeline("text-classification", model="AhmedTaha012/nextQuarter-status-V1.1.9")
6
  ner_model = pipeline("token-classification", model="AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  st.title("Transcript Analysis")
9
  transcript = st.text_area("Enter the transcript:", height=200)
 
 
 
 
 
10
  tokens=transcript.split()
11
  splitSize=256
12
  chunks=[tokens[r*splitSize:(r+1)*splitSize] for r in range(math.ceil(len(tokens)/splitSize))]
 
4
# Hugging Face pipelines used by the app, created once at import time.
# Two text classifiers and one token classifier (NER), each loaded from a
# fine-tuned checkpoint on the Hub.
sentiment_model = pipeline(
    task="text-classification",
    model="AhmedTaha012/managersFeedback-V1.0.7",
)
increase_decrease_model = pipeline(
    task="text-classification",
    model="AhmedTaha012/nextQuarter-status-V1.1.9",
)
ner_model = pipeline(
    task="token-classification",
    model="AhmedTaha012/finance-ner-v0.0.9-finetuned-ner",
)
7
def getSpeakers(data):
    """Return the speaker roster lines of a transcript, newline-joined.

    The headings "Speakers", "Call participants" and "Call Participants" are
    tried in that order. For the first heading present, the text after its
    last occurrence is scanned and only lines containing "--" are kept.

    Args:
        data: Full transcript text.

    Returns:
        The matching roster lines joined with "\n", or "" when no roster
        heading is present (previously this case fell off the end of the
        function and returned None; "" matches what getQA returns when its
        section is missing).
    """
    for heading in ("Speakers", "Call participants", "Call Participants"):
        if heading in data:
            # Everything after the final occurrence of the heading.
            roster = data.rpartition(heading)[2]
            return "\n".join(
                line for line in roster.split("\n") if "--" in line
            )
    return ""
14
def removeSpeakers(data):
    """Return the transcript with the trailing speaker roster cut off.

    The headings "Speakers", "Call participants" and "Call Participants" are
    tried in that order. For the first heading present, only the text before
    its first occurrence is kept.

    Args:
        data: Full transcript text.

    Returns:
        The text preceding the roster heading, or ``data`` unchanged when no
        heading is present. The original returned None in that case, which
        made the next pipeline step (a substring test on the result) raise
        TypeError.
    """
    for heading in ("Speakers", "Call participants", "Call Participants"):
        if heading in data:
            return data.partition(heading)[0]
    # Nothing to remove: hand the text back untouched.
    return data
21
def getQA(data):
    """Return the text that follows the Q&A heading of a transcript.

    The headings "Questions and Answers", "Questions & Answers" and "Q&A"
    are tried in that order; the first one present decides the split, and
    the text after its last occurrence is returned. Returns "" when none of
    the headings is present.
    """
    for heading in ("Questions and Answers", "Questions & Answers", "Q&A"):
        if heading in data:
            # Tail after the final occurrence, same as split(heading)[-1].
            return data.rpartition(heading)[2]
    return ""
30
def removeQA(data):
    """Return the transcript with its Q&A section cut off.

    The headings "Questions and Answers", "Questions & Answers" and "Q&A"
    are tried in that order. For the first heading present, only the text
    before its first occurrence is kept.

    Args:
        data: Transcript text.

    Returns:
        The text preceding the Q&A heading, or ``data`` unchanged when no
        heading is present. The original returned "" in that case, which
        silently discarded the entire transcript whenever a call had no
        Q&A section.
    """
    for heading in ("Questions and Answers", "Questions & Answers", "Q&A"):
        if heading in data:
            return data.partition(heading)[0]
    # No Q&A section: there is nothing to remove.
    return data
39
def clean_and_preprocess(text):
    """Lower-case the long lines of ``text`` and strip English stop words.

    Only lines longer than 100 characters are kept (presumably to drop
    headings and short utterances -- confirm with the author). Each kept
    line is lower-cased, tokenized with ``nltk.word_tokenize`` and rebuilt
    from the tokens that are not in NLTK's English stop-word list.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned lines joined with "\n"; "" when no line qualifies.
    """
    long_lines = [line for line in text.split("\n") if len(line) > 100]
    if not long_lines:
        return ""

    # Build the stop-word set once; the original rebuilt it for every line.
    stop_words = set(stopwords.words('english'))

    cleaned_lines = []
    for line in long_lines:
        words = nltk.word_tokenize(line.lower())
        cleaned_lines.append(
            ' '.join(word for word in words if word not in stop_words)
        )
    return "\n".join(cleaned_lines)
55
def replace_abbreviations(text):
    """Expand common finance abbreviations in ``text``.

    All abbreviations are matched in a single regex pass, longest key first,
    so a short key can no longer eat the front of a longer one (the original
    chain of ``str.replace`` calls turned "EBITDAR" into
    "...and amortizationR" and "ROIC" into "return on investmentC").

    A match must not be directly preceded by a letter, and must be followed
    by a non-letter or by a plural "s" (so "KPIs" still expands while
    "SECTOR" and "APRIL" are left alone). Digits may follow, so "FY23" still
    becomes "fiscal year23" as before. Matching is case-sensitive.

    Args:
        text: Text to expand.

    Returns:
        ``text`` with every recognised abbreviation replaced by its long form.
    """
    # Local import keeps this function self-contained.
    import re

    # Duplicate keys from the original table are removed; where two entries
    # disagreed, the later one is kept, since that is the value the original
    # dict literal actually produced.
    replacements = {
        'Q1': 'first quarter',
        'Q2': 'second quarter',
        'Q3': 'third quarter',
        'Q4': 'fourth quarter',
        'q1': 'first quarter',
        'q2': 'second quarter',
        'q3': 'third quarter',
        'q4': 'fourth quarter',
        'FY': 'fiscal year',
        'YoY': 'year over year',
        'MoM': 'month over month',
        'YOY': 'year-over-year',
        'MOM': 'month-over-month',
        'EBITDA': 'earnings before interest, taxes, depreciation, and amortization',
        'ROI': 'return on investment',
        'EPS': 'earnings per share',
        'P/E': 'price-to-earnings',
        'DCF': 'discounted cash flow',
        'CAGR': 'compound annual growth rate',
        'GDP': 'gross domestic product',
        'CFO': 'chief financial officer',
        'GAAP': 'Generally Accepted Accounting Principles',
        'SEC': 'U.S. Securities and Exchange Commission',
        'IPO': 'initial public offering',
        'M&A': 'mergers and acquisitions',
        'EBIT': 'earnings before interest and taxes',
        'IRR': 'internal rate of return',
        'ROA': 'return on assets',
        'ROE': 'return on equity',
        'NAV': 'net asset value',
        'PE ratio': 'price-to-earnings ratio',
        'EPS growth': 'earnings per share growth',
        'Fiscal Year': 'financial year',
        'CAPEX': 'capital expenditures',
        'APR': 'annual percentage rate',
        'P&L': 'profit and loss',
        'NPM': 'net profit margin',
        'EBT': 'earnings before taxes',
        'EBITDAR': 'earnings before interest, taxes, depreciation, amortization, and rent',
        'PAT': 'profit after tax',
        'COGS': 'cost of goods sold',
        'EBTIDA': 'earnings before taxes, interest, depreciation, and amortization',
        'E&Y': 'Ernst & Young',
        'B2B': 'business to business',
        'B2C': 'business to consumer',
        'LIFO': 'last in, first out',
        'FIFO': 'first in, first out',
        'FCF': 'free cash flow',
        'LTM': 'last twelve months',
        'OPEX': 'operating expenses',
        'TSR': 'total shareholder return',
        'PP&E': 'property, plant, and equipment',
        'PBT': 'profit before tax',
        'EBITDAR margin': 'earnings before interest, taxes, depreciation, amortization, and rent margin',
        'ROIC': 'return on invested capital',
        'EBIT margin': 'earnings before interest and taxes margin',
        'EBTA': 'earnings before taxes and amortization',
        'FTE': 'full-time equivalent',
        'EBIDTA': 'earnings before interest, depreciation, taxes, and amortization',
        'PESTEL': 'Political, Economic, Social, Technological, Environmental, and Legal',
        'KPI': 'key performance indicator',
        'SWOT': 'Strengths, Weaknesses, Opportunities, Threats',
        'EBITDARM': 'earnings before interest, taxes, depreciation, amortization, rent, and management fees',
        'EBITDAX': 'earnings before interest, taxes, depreciation, amortization, and exploration expenses',
        'EBITDAS': 'earnings before interest, taxes, depreciation, amortization, and restructuring costs',
        'EBITDAX-C': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and commodity derivatives',
        'EBITDAX-R': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and asset retirement obligations',
        'EBITDAX-E': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and environmental liabilities',
        # Add more abbreviations and replacements as needed
    }

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "EBITDAR margin" must be tried before "EBITDAR", which must
    # be tried before "EBITDA", and so on.
    alternatives = "|".join(
        re.escape(key) for key in sorted(replacements, key=len, reverse=True)
    )
    pattern = re.compile(
        r"(?<![A-Za-z])(?:" + alternatives + r")(?=s?(?![A-Za-z]))"
    )
    return pattern.sub(lambda match: replacements[match.group(0)], text)
150
+
151
# NOTE(review): this is a second definition of clean_and_preprocess; it
# shadows the identical one defined earlier in the file. One of the two can
# be deleted.
def clean_and_preprocess(text):
    """Lower-case the long lines of ``text`` and strip English stop words.

    Only lines longer than 100 characters are kept (presumably to drop
    headings and short utterances -- confirm with the author). Each kept
    line is lower-cased, tokenized with ``nltk.word_tokenize`` and rebuilt
    from the tokens that are not in NLTK's English stop-word list.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned lines joined with "\n"; "" when no line qualifies.
    """
    kept = [row for row in text.split("\n") if len(row) > 100]
    if not kept:
        return ""

    # Loop-invariant: build the stop-word set a single time, not per line.
    stop_words = set(stopwords.words('english'))

    result = []
    for row in kept:
        tokens = nltk.word_tokenize(row.lower())
        result.append(' '.join(tok for tok in tokens if tok not in stop_words))
    return "\n".join(result)
167
+
168
+
169
+
170
+
171
+
172
+
173
 
174
st.title("Transcript Analysis")
transcript = st.text_area("Enter the transcript:", height=200)

# Pre-processing: expand abbreviations, drop the speaker roster and the Q&A
# section, then lower-case and strip stop words.
# replace_abbreviations used to be called twice in a row; once is enough.
transcript = replace_abbreviations(transcript)

# Guard against a None result so a transcript without a roster heading is
# kept as-is instead of crashing the next step.
without_speakers = removeSpeakers(transcript)
if without_speakers is not None:
    transcript = without_speakers

transcript = removeQA(transcript)
transcript = clean_and_preprocess(transcript)

# Split the cleaned text into whitespace-delimited tokens and group them
# into consecutive chunks of at most splitSize tokens.
tokens = transcript.split()
splitSize = 256
chunks = [
    tokens[start:start + splitSize]
    for start in range(0, len(tokens), splitSize)
]