AhmedTaha012 commited on
Commit
c36b36a
1 Parent(s): f4946a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -4
app.py CHANGED
@@ -8,14 +8,20 @@ from nltk.corpus import stopwords
8
  import spacy
9
  from spacy import displacy
10
  from word2number import w2n
11
-
 
12
  nltk.download('punkt')
13
  nltk.download('stopwords')
14
 
 
15
  sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")
16
  increase_decrease_model = pipeline("text-classification", model="AhmedTaha012/nextQuarter-status-V1.1.9")
 
 
 
17
  tokenizer = AutoTokenizer.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
18
  model = AutoModelForTokenClassification.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
 
19
  # torch.compile(model)
20
  nlpPipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
21
  def getSpeakers(data):
@@ -183,12 +189,46 @@ def convert_amount_to_number(amount_str):
183
  return w2n.word_to_num(amount_str)
184
  except ValueError:
185
  return 0 # Return 0 if the conversion fails
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  st.header("Transcript Analysis", divider='rainbow')
188
- transcript = st.text_area("Enter the transcript:", height=100)
189
-
190
  if st.button("Analyze"):
191
- transcript=replace_abbreviations(transcript)
192
  transcript=replace_abbreviations(transcript)
193
  transcript=removeSpeakers(transcript)
194
  transcript=removeQA(transcript)
 
8
import spacy
from spacy import displacy
from word2number import w2n
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')

# Sentence embedder used to rank transcript chunks by similarity to a
# quarter description (see selectedCorpusForNextQuarterModel).
similarityModel = SentenceTransformer('BAAI/bge-small-en')
sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")
increase_decrease_model = pipeline("text-classification", model="AhmedTaha012/nextQuarter-status-V1.1.9")
# Finance-topic classifier used to filter transcript chunks by topic.
tokenizerTopic = AutoTokenizer.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification", use_fast=True)
modelTopic = AutoModelForSequenceClassification.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification")
# BUG FIX: torch.compile returns the optimized module; the original call
# discarded the return value, so the model was never actually compiled.
modelTopic = torch.compile(modelTopic)
tokenizer = AutoTokenizer.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
# NOTE(review): the original also called torch.compile(model) and discarded
# the result (a no-op). The plain model is passed to pipeline() here because
# pipeline expects a PreTrainedModel — confirm before compiling this one.
nlpPipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
27
  def getSpeakers(data):
 
189
  return w2n.word_to_num(amount_str)
190
  except ValueError:
191
  return 0 # Return 0 if the conversion fails
192
def getTopic(encoded_input):
    """Classify one tokenized transcript chunk into a finance topic label.

    Parameters
    ----------
    encoded_input : dict
        Tokenizer output (``input_ids`` / ``attention_mask`` /
        ``token_type_ids`` tensors), already batched and on the target
        device — the caller moves the tensors itself.

    Returns
    -------
    str
        The label string from ``modelTopic.config.id2label`` for the
        highest-scoring class.
    """
    # BUG FIX: the original hard-coded "cuda", crashing on CPU-only hosts;
    # fall back to CPU when no GPU is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    modelTopic.to(device)
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = modelTopic(**encoded_input).logits
    predicted_class_id = logits.argmax().item()
    return modelTopic.config.id2label[predicted_class_id]
198
def selectedCorpusForNextQuarterModel(x, quarter):
    """Pick the transcript chunks most relevant to the requested quarter.

    The transcript is tokenized, split into 256-token chunks, and each
    chunk is kept only when the topic classifier (``getTopic``) assigns it
    one of the finance topics of interest.  Surviving chunks are ranked by
    cosine similarity to a short description of the quarter and the top
    three are joined into a single comma-separated string.

    Parameters
    ----------
    x : str
        The (cleaned) transcript text.
    quarter : int or str
        Quarter number, 1-4; anything else raises ``KeyError``.

    Returns
    -------
    str
        Comma-joined top-3 most relevant chunks (fewer when the corpus is
        small; possibly empty).
    """
    number_word_dict = {
        "1": "first",
        "2": "second",
        "3": "third",
        "4": "fourth",
        # Add more entries as needed
    }
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # BUG FIX: truncation=True capped the sequence at the model max length,
    # defeating the manual chunking below — tokenize the whole transcript.
    tokens = tokenizerTopic(x, padding=True, truncation=False, return_tensors='pt')
    splitSize = 256
    # BUG FIX: the original sliced the batch dimension (size 1) of the
    # (1, seq_len) tensors, so only a single "chunk" was ever produced;
    # slice row 0 along the token dimension instead.
    flat_ids = tokens["input_ids"][0]
    flat_types = tokens["token_type_ids"][0]
    flat_mask = tokens["attention_mask"][0]
    chunks = []
    for r in range(math.ceil(len(flat_ids) / splitSize)):
        sl = slice(r * splitSize, (r + 1) * splitSize)
        # BUG FIX: the original indexed undefined names x/y/z here
        # (x is the transcript string), raising NameError/TypeError.
        chunks.append({
            "input_ids": flat_ids[sl].unsqueeze(0).to(device),
            "token_type_ids": flat_types[sl].unsqueeze(0).to(device),
            "attention_mask": flat_mask[sl].unsqueeze(0).to(device),
        })
    selectedTopics = ["Stock Movement", "Earnings", "IPO", "Stock Commentary", "Currencies", "M&A | Investments", "Financials", "Macro", "Analyst Update", "Company | Product News"]
    result = [tokenizerTopic.decode(c["input_ids"][0], skip_special_tokens=True) for c in chunks if getTopic(c) in selectedTopics]
    # Drop trivially short fragments.
    result = [t for t in result if len(t) > 10]
    # NOTE(review): the original interpolated usedData['quad-date'].iloc[i],
    # neither of which is defined in this file (NameError). The description
    # now mentions only the quarter — confirm against the intended data.
    des = f"the {number_word_dict[str(quarter)]} quarter results"
    # BUG FIX: the original called .split("\n") on a list; the decoded
    # chunks are already the corpus.
    courpus = result
    sents = [des] + courpus
    embeddings_1 = similarityModel.encode(sents, normalize_embeddings=True, device=device, show_progress_bar=False)
    # Row 0 holds the similarity of the description to every chunk; rank
    # the chunks by that similarity and keep the top three.
    sims = list(cosine_similarity(embeddings_1)[0][1:])
    top_indices = [sims.index(value) + 1 for value in sorted(sims, reverse=True)][:3]
    rest = [sents[f] for f in top_indices]
    # BUG FIX: the original returned selectedCourpusForTraing.append(...) —
    # an undefined name, and list.append always returns None anyway.
    return ",".join(rest)
226
 
227
  st.header("Transcript Analysis", divider='rainbow')
228
+ mainTranscript = st.text_area("Enter the transcript:", height=100)
229
+ quarter = st.text_input('Enter your quarter', 'quarter of transcript')
230
  if st.button("Analyze"):
231
+ transcript=replace_abbreviations(mainTranscript)
232
  transcript=replace_abbreviations(transcript)
233
  transcript=removeSpeakers(transcript)
234
  transcript=removeQA(transcript)