Spaces:
Build error
Build error
Commit
·
c36b36a
1
Parent(s):
f4946a4
Update app.py
Browse files
app.py
CHANGED
@@ -8,14 +8,20 @@ from nltk.corpus import stopwords
|
|
8 |
import spacy
|
9 |
from spacy import displacy
|
10 |
from word2number import w2n
|
11 |
-
|
|
|
12 |
nltk.download('punkt')
|
13 |
nltk.download('stopwords')
|
14 |
|
|
|
15 |
sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")
|
16 |
increase_decrease_model = pipeline("text-classification", model="AhmedTaha012/nextQuarter-status-V1.1.9")
|
|
|
|
|
|
|
17 |
tokenizer = AutoTokenizer.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
|
18 |
model = AutoModelForTokenClassification.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
|
|
|
19 |
# torch.compile(model)
|
20 |
nlpPipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
|
21 |
def getSpeakers(data):
|
@@ -183,12 +189,46 @@ def convert_amount_to_number(amount_str):
|
|
183 |
return w2n.word_to_num(amount_str)
|
184 |
except ValueError:
|
185 |
return 0 # Return 0 if the conversion fails
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
st.header("Transcript Analysis", divider='rainbow')
|
188 |
-
|
189 |
-
|
190 |
if st.button("Analyze"):
|
191 |
-
transcript=replace_abbreviations(
|
192 |
transcript=replace_abbreviations(transcript)
|
193 |
transcript=removeSpeakers(transcript)
|
194 |
transcript=removeQA(transcript)
|
|
|
8 |
import spacy
|
9 |
from spacy import displacy
|
10 |
from word2number import w2n
|
11 |
+
from sentence_transformers import SentenceTransformer
|
12 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
13 |
nltk.download('punkt')
|
14 |
nltk.download('stopwords')
|
15 |
|
16 |
+
similarityModel = SentenceTransformer('BAAI/bge-small-en')
|
17 |
sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")
|
18 |
increase_decrease_model = pipeline("text-classification", model="AhmedTaha012/nextQuarter-status-V1.1.9")
|
19 |
+
tokenizerTopic = AutoTokenizer.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification",use_fast=True)
|
20 |
+
modelTopic = AutoModelForSequenceClassification.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification")
|
21 |
+
torch.compile(modelTopic)
|
22 |
tokenizer = AutoTokenizer.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
|
23 |
model = AutoModelForTokenClassification.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
|
24 |
+
torch.compile(model)
|
25 |
# torch.compile(model)
|
26 |
nlpPipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
|
27 |
def getSpeakers(data):
|
|
|
189 |
return w2n.word_to_num(amount_str)
|
190 |
except ValueError:
|
191 |
return 0 # Return 0 if the conversion fails
|
192 |
+
def getTopic(encoded_input):
    """Classify one tokenized chunk with ``modelTopic`` and return its label.

    Args:
        encoded_input: Mapping of tokenizer outputs (tensors such as
            ``input_ids``, ``token_type_ids`` and ``attention_mask``) that is
            unpacked into ``modelTopic(**...)``.
            NOTE(review): assumed to hold a single sequence (batch size 1);
            ``argmax`` below runs over the flattened logits.

    Returns:
        The ``modelTopic.config.id2label`` entry for the highest logit.
    """
    # Fall back to CPU when no GPU is present instead of failing on a
    # hard-coded "cuda" device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    modelTopic.to(device)
    # The model and its inputs must live on the same device.
    inputs = {key: value.to(device) for key, value in encoded_input.items()}
    with torch.no_grad():
        logits = modelTopic(**inputs).logits
    predicted_class_id = logits.argmax().item()
    return modelTopic.config.id2label[predicted_class_id]
|
198 |
+
def selectedCorpusForNextQuarterModel(x, quarter, period=None):
    """Pick the transcript passages most relevant to a quarter's results.

    The text is tokenized with ``tokenizerTopic``, cut into windows of
    ``splitSize`` tokens, and each window is classified with ``getTopic``.
    Windows whose topic is in ``selectedTopics`` (and whose decoded text is
    longer than 10 characters) are embedded with ``similarityModel`` together
    with a short description of the quarter; the three windows closest to that
    description are returned.

    Args:
        x: Transcript text. NOTE(review): assumed to be a single string, not
            a batch of strings — confirm against the caller.
        quarter: Quarter identifier; "1".."4" (or the matching ints) are
            spelled out, any other value is used verbatim in the description.
        period: Optional date/period text appended to the description as
            "... of the <period>". The previous code read this from
            ``usedData['quad-date'].iloc[i]``, which is not defined inside
            this function.

    Returns:
        The selected passages joined with "," (an empty string when no
        passage passes the topic filter).
    """
    number_word_dict = {
        "1": "first",
        "2": "second",
        "3": "third",
        "4": "fourth",
        # Add more entries as needed
    }
    splitSize = 256
    selectedTopics = ["Stock Movement", "Earnings", "IPO", "Stock Commentary", "Currencies", "M&A | Investments", "Financials", "Macro", "Analyst Update", "Company | Product News"]
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Tokenize without truncation so the whole transcript is available for
    # chunking; with truncation everything past the model limit was dropped.
    encoding = tokenizerTopic(x, truncation=False, return_tensors='pt')
    total_tokens = encoding["input_ids"].shape[1]

    # Slice along the sequence axis (dim 1). The tensors are (1, seq_len), so
    # slicing dim 0 only ever produced one "chunk" holding the full sequence.
    chunks = [
        {name: tensor[:, start:start + splitSize].to(device) for name, tensor in encoding.items()}
        for start in range(0, total_tokens, splitSize)
    ]

    courpus = []
    for chunk in chunks:
        if getTopic(chunk) not in selectedTopics:
            continue
        text = tokenizerTopic.decode(chunk["input_ids"][0], skip_special_tokens=True)
        if len(text) > 10:
            courpus.append(text)
    if not courpus:
        return ""

    quarter_key = str(quarter).strip()
    des = f"the {number_word_dict.get(quarter_key, quarter_key)} quarter results"
    if period is not None:
        des = f"{des} of the {period}"

    embeddings_1 = similarityModel.encode([des] + courpus, normalize_embeddings=True, device=device, show_progress_bar=False)
    # Similarity of the description (row 0) against every passage, computed once.
    scores = cosine_similarity(embeddings_1[:1], embeddings_1[1:])[0]
    # Rank by index so passages with equal scores are not returned twice.
    best = sorted(range(len(courpus)), key=lambda k: scores[k], reverse=True)[:3]
    return ",".join(courpus[k] for k in best)
|
226 |
|
227 |
st.header("Transcript Analysis", divider='rainbow')
|
228 |
+
mainTranscript = st.text_area("Enter the transcript:", height=100)
|
229 |
+
quarter = st.text_input('Enter your quarter', 'quarter of transcript')
|
230 |
if st.button("Analyze"):
|
231 |
+
transcript=replace_abbreviations(mainTranscript)
|
232 |
transcript=replace_abbreviations(transcript)
|
233 |
transcript=removeSpeakers(transcript)
|
234 |
transcript=removeQA(transcript)
|