"""Streamlit app that analyses an earnings-call transcript.

The page takes a pasted transcript plus a quarter/year selection and, when the
"Analyze" button is pressed, shows a management-sentiment label and a
next-quarter prediction produced by the models loaded below.
"""

import datetime
import math
import os
import re

import en_core_web_sm
import nltk
import spacy
import streamlit as st
import torch
from annotated_text import annotated_text
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from spacy import displacy
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    pipeline,
)
from word2number import w2n

# ---------------------------------------------------------------------------
# Model / resource loading
# ---------------------------------------------------------------------------
# NOTE(review): Streamlit re-executes the script on every interaction, so all
# of the loads below are repeated on each rerun; consider wrapping them in a
# function decorated with `st.cache_resource`.

# NOTE(security): a Hugging Face access token used to be hard-coded in the two
# `from_pretrained` calls for the topic model. It is now read from the
# HF_TOKEN environment variable (None when unset). The token that was
# committed must be treated as compromised and revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")

nlp = en_core_web_sm.load()
nltk.download('punkt')
nltk.download('stopwords')

similarityModel = SentenceTransformer('BAAI/bge-small-en')
sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")

tokenizerQuarter = AutoTokenizer.from_pretrained('AhmedTaha012/nextQuarter-status-V1.1.9')
modelQuarter = BertForSequenceClassification.from_pretrained('AhmedTaha012/nextQuarter-status-V1.1.9')

tokenizerTopic = AutoTokenizer.from_pretrained(
    "nickmuchi/finbert-tone-finetuned-finance-topic-classification",
    use_fast=True,
    token=HF_TOKEN,
)
modelTopic = AutoModelForSequenceClassification.from_pretrained(
    "nickmuchi/finbert-tone-finetuned-finance-topic-classification",
    token=HF_TOKEN,
)
# torch.compile(modelTopic)

tokenizer = AutoTokenizer.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
# torch.compile(model)
nlpPipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

if "disabled" not in st.session_state:
    st.session_state["disabled"] = False


def disable():
    """Set the ``disabled`` flag in the Streamlit session state to True."""
    st.session_state["disabled"] = True


# ---------------------------------------------------------------------------
# Transcript sectioning helpers
# ---------------------------------------------------------------------------
# Headings are probed in this order; the first one found in the text wins.
_SPEAKER_MARKERS = ("Speakers", "Call participants", "Call Participants")
_QA_MARKERS = ("Questions and Answers", "Questions & Answers", "Q&A")


def _split_on_first_marker(data, markers):
    """Split *data* on the first of *markers* it contains.

    Returns the list produced by ``str.split`` for that marker, or ``None``
    when none of the markers occurs in *data*.
    """
    for marker in markers:
        if marker in data:
            return data.split(marker)
    return None


def getSpeakers(data):
    """Return the speaker lines of a transcript.

    Takes the text after the last occurrence of the speaker heading and keeps
    only the lines containing ``"--"``, joined with newlines. Returns an empty
    string when no speaker heading is present (previously the function fell
    through and returned ``None``).
    """
    parts = _split_on_first_marker(data, _SPEAKER_MARKERS)
    if parts is None:
        return ""
    return "\n".join(line for line in parts[-1].split("\n") if "--" in line)


def removeSpeakers(data):
    """Return the text that precedes the first speaker heading.

    When no speaker heading is present the input is returned unchanged.
    Previously this case returned ``None``, which made the following
    ``removeQA`` call fail with ``TypeError``.
    """
    parts = _split_on_first_marker(data, _SPEAKER_MARKERS)
    return data if parts is None else parts[0]


def getQA(data):
    """Return the text after the last Q&A heading, or ``""`` if there is none."""
    parts = _split_on_first_marker(data, _QA_MARKERS)
    return "" if parts is None else parts[-1]


def removeQA(data):
    """Return the text that precedes the first Q&A heading.

    When no Q&A heading is present the input is returned unchanged.
    Previously this case returned ``""``, which discarded the whole transcript
    and left the sentiment step with nothing to classify.
    """
    parts = _split_on_first_marker(data, _QA_MARKERS)
    return data if parts is None else parts[0]


# ---------------------------------------------------------------------------
# Text normalisation
# ---------------------------------------------------------------------------
def clean_and_preprocess(text):
    """Lower-case, tokenize and strip English stop words from long lines.

    Only lines longer than 100 characters are kept. Each kept line is
    lower-cased, tokenized with ``nltk.word_tokenize`` and re-joined with
    single spaces after removing NLTK's English stop words. The cleaned lines
    are returned joined with newlines.

    (This function used to be defined twice with identical bodies; the
    duplicate has been removed.)
    """
    # Build the stop-word set once instead of once per line.
    stop_words = set(stopwords.words('english'))
    cleaned_lines = []
    for line in text.split("\n"):
        if len(line) <= 100:
            continue
        words = nltk.word_tokenize(line.lower())
        cleaned_lines.append(' '.join(word for word in words if word not in stop_words))
    return "\n".join(cleaned_lines)


# Abbreviation -> expansion table. The original literal listed several keys
# twice; Python keeps the last value for a repeated key, so the entries below
# are the effective values (notably 'GAAP' and 'CAPEX').
_ABBREVIATIONS = {
    'Q1': 'first quarter',
    'Q2': 'second quarter',
    'Q3': 'third quarter',
    'Q4': 'fourth quarter',
    'q1': 'first quarter',
    'q2': 'second quarter',
    'q3': 'third quarter',
    'q4': 'fourth quarter',
    'FY': 'fiscal year',
    'YoY': 'year over year',
    'MoM': 'month over month',
    'EBITDA': 'earnings before interest, taxes, depreciation, and amortization',
    'ROI': 'return on investment',
    'EPS': 'earnings per share',
    'P/E': 'price-to-earnings',
    'DCF': 'discounted cash flow',
    'CAGR': 'compound annual growth rate',
    'GDP': 'gross domestic product',
    'CFO': 'chief financial officer',
    'GAAP': 'Generally Accepted Accounting Principles',
    'SEC': 'U.S. Securities and Exchange Commission',
    'IPO': 'initial public offering',
    'M&A': 'mergers and acquisitions',
    'EBIT': 'earnings before interest and taxes',
    'IRR': 'internal rate of return',
    'ROA': 'return on assets',
    'ROE': 'return on equity',
    'NAV': 'net asset value',
    'PE ratio': 'price-to-earnings ratio',
    'EPS growth': 'earnings per share growth',
    'Fiscal Year': 'financial year',
    'CAPEX': 'capital expenditures',
    'APR': 'annual percentage rate',
    'P&L': 'profit and loss',
    'NPM': 'net profit margin',
    'EBT': 'earnings before taxes',
    'EBITDAR': 'earnings before interest, taxes, depreciation, amortization, and rent',
    'PAT': 'profit after tax',
    'COGS': 'cost of goods sold',
    'EBTIDA': 'earnings before taxes, interest, depreciation, and amortization',
    'E&Y': 'Ernst & Young',
    'B2B': 'business to business',
    'B2C': 'business to consumer',
    'LIFO': 'last in, first out',
    'FIFO': 'first in, first out',
    'FCF': 'free cash flow',
    'LTM': 'last twelve months',
    'OPEX': 'operating expenses',
    'TSR': 'total shareholder return',
    'PP&E': 'property, plant, and equipment',
    'PBT': 'profit before tax',
    'EBITDAR margin': 'earnings before interest, taxes, depreciation, amortization, and rent margin',
    'ROIC': 'return on invested capital',
    'YOY': 'year-over-year',
    'MOM': 'month-over-month',
    'EBIT margin': 'earnings before interest and taxes margin',
    'EBTA': 'earnings before taxes and amortization',
    'FTE': 'full-time equivalent',
    'EBIDTA': 'earnings before interest, depreciation, taxes, and amortization',
    'PESTEL': 'Political, Economic, Social, Technological, Environmental, and Legal',
    'KPI': 'key performance indicator',
    'SWOT': 'Strengths, Weaknesses, Opportunities, Threats',
    'EBITDARM': 'earnings before interest, taxes, depreciation, amortization, rent, and management fees',
    'EBITDAX': 'earnings before interest, taxes, depreciation, amortization, and exploration expenses',
    'EBITDAS': 'earnings before interest, taxes, depreciation, amortization, and restructuring costs',
    'EBITDAX-C': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and commodity derivatives',
    'EBITDAX-R': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and asset retirement obligations',
    'EBITDAX-E': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and environmental liabilities',
    # Add more abbreviations and replacements as needed
}

# One alternation over all keys, longest first, so that e.g. 'EBITDAR margin'
# is tried before 'EBITDAR', which is tried before 'EBITDA' and 'EBIT'.
_ABBREVIATION_PATTERN = re.compile(
    "|".join(re.escape(key) for key in sorted(_ABBREVIATIONS, key=len, reverse=True))
)


def replace_abbreviations(text):
    """Expand the financial abbreviations listed in ``_ABBREVIATIONS``.

    The text is scanned once and, at each position, the longest matching
    abbreviation is replaced. The previous implementation applied
    ``str.replace`` key by key in dict order, so a short key such as
    ``'EBITDA'`` was expanded before ``'EBITDAR'`` could ever match, leaving
    output like ``"...and amortizationR"``.

    Matching is still a plain, case-sensitive substring match, as before.
    """
    return _ABBREVIATION_PATTERN.sub(lambda match: _ABBREVIATIONS[match.group(0)], text)


def convert_amount_to_number(amount_str):
    """Convert a number written in words to a number via ``w2n.word_to_num``.

    Returns 0 when the conversion raises ``ValueError``.
    """
    try:
        return w2n.word_to_num(amount_str)
    except ValueError:
        return 0  # Return 0 if the conversion fails


# ---------------------------------------------------------------------------
# Model wrappers
# ---------------------------------------------------------------------------
def getTopic(encoded_input):
    """Return the topic label ``modelTopic`` predicts for *encoded_input*.

    *encoded_input* is unpacked as keyword arguments into ``modelTopic``; the
    label is looked up in ``modelTopic.config.id2label`` from the arg-max of
    the logits.
    """
    # modelTopic.to("cuda")
    with torch.no_grad():
        logits = modelTopic(**encoded_input).logits
    predicted_class_id = logits.argmax().item()
    return modelTopic.config.id2label[predicted_class_id]


def selectedCorpusForNextQuarterModel(x, quarter, year):
    """Pick the passages of *x* most similar to a quarter-results description.

    The text is tokenized with ``tokenizerTopic`` (with truncation enabled),
    cut into windows of 256 token ids, and each window is classified with
    ``getTopic``. Windows whose topic is in the selected list are decoded back
    to text; decoded passages of 10 characters or fewer are dropped. The
    remaining passages are ranked by cosine similarity (``similarityModel``
    embeddings) against ``"the <ordinal> quarter results of the <year>"`` and
    the best three are returned joined with commas. Returns ``""`` when no
    passage qualifies.

    Raises ``KeyError`` if ``str(quarter)`` is not one of "1".."4".
    """
    number_word_dict = {
        "1": "first",
        "2": "second",
        "3": "third",
        "4": "fourth",
        # Add more entries as needed
    }
    selectedTopics = [
        "Stock Movement", "Earnings", "IPO", "Stock Commentary", "Currencies",
        "M&A | Investments", "Financials", "Macro", "Analyst Update",
        "Company | Product News",
    ]
    splitSize = 256

    tokens = tokenizerTopic(x, padding=True, truncation=True, return_tensors='pt')
    input_ids = tokens["input_ids"][0]
    token_type_ids = tokens["token_type_ids"][0]
    attention_mask = tokens["attention_mask"][0]

    # Slice the three aligned tensors into windows and restore the batch
    # dimension the model expects.
    windows = []
    for start in range(0, len(input_ids), splitSize):
        stop = start + splitSize
        windows.append({
            "input_ids": input_ids[start:stop].unsqueeze(0),
            "token_type_ids": token_type_ids[start:stop].unsqueeze(0),
            "attention_mask": attention_mask[start:stop].unsqueeze(0),
        })

    passages = [
        tokenizerTopic.decode(window["input_ids"][0], skip_special_tokens=True)
        for window in windows
        if getTopic(window) in selectedTopics
    ]
    passages = [passage for passage in passages if len(passage) > 10]

    des = f"the {number_word_dict[str(quarter)]} quarter results of the {year}"
    if not passages:
        return ""

    embeddings = similarityModel.encode(
        [des] + passages, normalize_embeddings=True, show_progress_bar=False
    )
    # Similarity of the description (row 0) to every passage, computed once.
    scores = cosine_similarity(embeddings[:1], embeddings[1:])[0]
    # Rank by index rather than by value lookup: the old `list.index(value)`
    # approach returned the same passage several times when scores tied.
    best = sorted(range(len(passages)), key=lambda i: scores[i], reverse=True)[:3]
    return ",".join(passages[i] for i in best)


def getQuarterPrediction(text):
    """Return the label ``modelQuarter`` predicts for *text*.

    The text is tokenized with ``tokenizerQuarter`` (truncated to 512 tokens)
    and the label is looked up in ``modelQuarter.config.id2label`` from the
    arg-max of the logits.
    """
    tokens = tokenizerQuarter(
        text,
        padding=True,
        max_length=512,
        return_overflowing_tokens=False,
        add_special_tokens=True,
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        logits = modelQuarter(**tokens).logits
    predicted_class_id = logits.argmax().item()
    return modelQuarter.config.id2label[predicted_class_id]


def getSentence(listOfSentences, value):
    """Return the first sentence containing *value*, or *value* if none does."""
    for sent in listOfSentences:
        if value in sent:
            return sent
    return value


def get_annotated_text(text, value, entity):
    """Render *value* inside *text* as an entity and return annotated parts.

    The first occurrence of *value* is marked with label *entity* on a spaCy
    doc, the displaCy entity view is written to the page, and the function
    returns ``[text_before, (value, entity), text_after]``.

    Raises ``ValueError`` if *value* does not occur in *text* (unchanged).
    """
    doc = nlp(text)
    start = text.index(value)
    span = doc.char_span(start, start + len(value), label=entity)
    # char_span returns None when the character range does not line up with
    # token boundaries; assigning [None] to doc.ents would raise.
    doc.ents = [span] if span is not None else []
    ent_html = displacy.render(doc, style='ent', jupyter=False)
    # Display the entity visualization in the browser:
    st.markdown(ent_html, unsafe_allow_html=True)
    # partition keeps everything after the first occurrence; the previous
    # split(value)[1] dropped the tail when *value* occurred more than once.
    before, _, after = text.partition(value)
    return [before, (value, entity), after]


# ---------------------------------------------------------------------------
# Page
# ---------------------------------------------------------------------------
st.header("Transcript Analysis", divider='rainbow')
mainTranscript = st.text_area("Enter the transcript:", height=100)
doc = nlp(mainTranscript)
sentences = [sent.text for sent in doc.sents]
quarter = st.selectbox('Select your quarter', ('1', '2', '3', '4'))
year = st.selectbox(
    'Select your year',
    tuple(str(y) for y in range(datetime.datetime.now().year, 1900, -1)),
)

if st.button("Analyze"):
    transcript = replace_abbreviations(mainTranscript)
    transcript = removeSpeakers(transcript)
    transcript = removeQA(transcript)
    transcript = clean_and_preprocess(transcript)

    # Re-chunk the cleaned text into pieces of at most 256 whitespace tokens.
    tokens = transcript.split()
    splitSize = 256
    chunks = [
        " ".join(tokens[start:start + splitSize])
        for start in range(0, len(tokens), splitSize)
    ]
    if not chunks:
        # clean_and_preprocess keeps only lines longer than 100 characters, so
        # short input leaves nothing to classify; max() below would raise.
        st.warning("No analysable text was found in the transcript.")
        st.stop()

    st.subheader("Management Sentiment", divider='rainbow')
    sentiment = [sentiment_model(chunk)[0]['label'] for chunk in chunks]
    # Majority vote over the per-chunk labels.
    sentiment = max(sentiment, key=sentiment.count)
    # NOTE(review): "postive" is kept as found; confirm against the label
    # names the sentiment model actually emits. The colour is currently
    # unused -- the markup inside the f-string below appears to have been
    # lost from the source under review.
    sentiment_color = "green" if sentiment == "postive" else "red"
    st.markdown(f'{sentiment}', unsafe_allow_html=True)

    st.subheader("Next Quarter Perdiction", divider='rainbow')
    # increase_decrease = [increase_decrease_model(x)[0]['label'] for x in chunks]
    increase_decrease = getQuarterPrediction(
        selectedCorpusForNextQuarterModel(mainTranscript, quarter, year)
    )
    # NOTE(review): unused for the same reason as sentiment_color above.
    increase_decrease_color = "green" if increase_decrease == "Increase" else "red"
    st.markdown(f'{increase_decrease}', unsafe_allow_html=True)

    st.subheader("Financial Metrics", divider='rainbow')
    ner_result = []
    savedchunks = []
    idx = 0
    # FIXME(review): the remainder of this handler is corrupted in the source
    # under review -- it looks as though everything between a '<' and a later
    # '>' was stripped, taking the NER loop and the code that defined the
    # profit/expense amounts with it. It has NOT been reconstructed here
    # because the missing logic cannot be recovered from the surviving text.
    # Restore it from version control. The surviving fragment, verbatim:
    #
    #   while idx=1: idxx=0 savedchunks.append(idx) while idxxexpencesAmount:
    #       st.markdown(f'{"This is a great chance for investment. Do consider it."}', unsafe_allow_html=True)
    #   else:
    #       st.markdown(f'{"Not the best chance for investment."}', unsafe_allow_html=True)