Spaces:

AhmedTaha012
/

Finance

Build error

File size: 14,603 Bytes

import math
import os
import re

import nltk
import spacy
import streamlit as st
import torch
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from spacy import displacy
from transformers import pipeline
from transformers import AutoTokenizer,AutoModelForTokenClassification,AutoModelForSequenceClassification
from word2number import w2n
# NLTK resources used by the tokenisation / stop-word filtering in this file.
nltk.download('punkt')
nltk.download('stopwords')

# Hugging Face access token, read from the environment instead of being
# committed to source control. ``None`` means anonymous access.
# SECURITY: a literal token used to be hard-coded here; it must be treated as
# compromised and revoked.
_HF_TOKEN = os.environ.get("HF_TOKEN")

_TOPIC_MODEL_NAME = "nickmuchi/finbert-tone-finetuned-finance-topic-classification"
_NER_MODEL_NAME = "AhmedTaha012/finance-ner-v0.0.9-finetuned-ner"

# Sentence-embedding model used to rank text by similarity to a query.
similarityModel = SentenceTransformer('BAAI/bge-small-en')
# Text classifiers: management sentiment and next-quarter direction.
sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")
increase_decrease_model = pipeline("text-classification", model="AhmedTaha012/nextQuarter-status-V1.1.9")
# Topic classifier (tokenizer + sequence-classification head).
tokenizerTopic = AutoTokenizer.from_pretrained(_TOPIC_MODEL_NAME, use_fast=True, token=_HF_TOKEN)
modelTopic = AutoModelForSequenceClassification.from_pretrained(_TOPIC_MODEL_NAME, token=_HF_TOKEN)
# Token-classification (NER) model wrapped in a pipeline that groups entities.
tokenizer = AutoTokenizer.from_pretrained(_NER_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(_NER_MODEL_NAME)
nlpPipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
def getSpeakers(data):
    """Return the participant lines of a transcript.

    The text after the last occurrence of the first matching heading
    ("Speakers", "Call participants", "Call Participants" -- checked in that
    order) is scanned, and only lines containing "--" are kept.

    Args:
        data: Full transcript text.

    Returns:
        The matching lines joined by newlines, or an empty string when none of
        the headings is present (previously this case returned ``None``).
    """
    for heading in ("Speakers", "Call participants", "Call Participants"):
        if heading in data:
            section = data.split(heading)[-1]
            return "\n".join(line for line in section.split("\n") if "--" in line)
    return ""
def removeSpeakers(data):
    """Return the transcript text that precedes the participant list.

    The headings "Speakers", "Call participants" and "Call Participants" are
    checked in that order; everything before the first occurrence of the first
    heading found is returned.

    Args:
        data: Full transcript text.

    Returns:
        The text before the heading, or ``data`` unchanged when no heading is
        present. (Returning ``None`` in that case made later string
        operations on the result fail.)
    """
    for heading in ("Speakers", "Call participants", "Call Participants"):
        if heading in data:
            return data.split(heading)[0]
    return data
def getQA(data):
    """Return the text following the Q&A heading of a transcript.

    Headings are tried in order: "Questions and Answers",
    "Questions & Answers", "Q&A". For the first one present, the text after
    its last occurrence is returned; an empty string is returned when none of
    them appears in ``data``.
    """
    for heading in ("Questions and Answers", "Questions & Answers", "Q&A"):
        if heading in data:
            return data.split(heading)[-1]
    return ""
def removeQA(data):
    """Return the transcript text that precedes the Q&A section.

    Headings are tried in order: "Questions and Answers",
    "Questions & Answers", "Q&A". For the first one present, the text before
    its first occurrence is returned.

    Args:
        data: Transcript text.

    Returns:
        The text before the Q&A heading, or ``data`` unchanged when the
        transcript has no Q&A heading. (Returning an empty string in that case
        threw away the entire transcript.)
    """
    for heading in ("Questions and Answers", "Questions & Answers", "Q&A"):
        if heading in data:
            return data.split(heading)[0]
    return data
def clean_and_preprocess(text):
    """Lower-case the long lines of ``text`` and strip English stop words.

    NOTE: a second definition with the same name appears later in this file
    and replaces this one when the module is executed.

    Args:
        text: Raw text; it is split on newlines and only lines longer than
            100 characters are kept.

    Returns:
        The cleaned lines joined by newlines, or an empty string when no line
        is long enough.
    """
    long_lines = [line for line in text.split("\n") if len(line) > 100]
    if not long_lines:
        return ""
    # The stop-word set does not change between lines, so build it once.
    stop_words = set(stopwords.words('english'))
    cleaned_lines = []
    for line in long_lines:
        words = nltk.word_tokenize(line.lower())
        cleaned_lines.append(' '.join(word for word in words if word not in stop_words))
    return "\n".join(cleaned_lines)
# Abbreviation -> expansion table. Keys are matched case-sensitively.
# Duplicate keys of the former dict literal were collapsed, keeping the value
# that was effective at runtime (the last one written).
_ABBREVIATION_EXPANSIONS = {
    'Q1': 'first quarter',
    'Q2': 'second quarter',
    'Q3': 'third quarter',
    'Q4': 'fourth quarter',
    'q1': 'first quarter',
    'q2': 'second quarter',
    'q3': 'third quarter',
    'q4': 'fourth quarter',
    'FY': 'fiscal year',
    'YoY': 'year over year',
    'MoM': 'month over month',
    'EBITDA': 'earnings before interest, taxes, depreciation, and amortization',
    'ROI': 'return on investment',
    'EPS': 'earnings per share',
    'P/E': 'price-to-earnings',
    'DCF': 'discounted cash flow',
    'CAGR': 'compound annual growth rate',
    'GDP': 'gross domestic product',
    'CFO': 'chief financial officer',
    'GAAP': 'Generally Accepted Accounting Principles',
    'SEC': 'U.S. Securities and Exchange Commission',
    'IPO': 'initial public offering',
    'M&A': 'mergers and acquisitions',
    'EBIT': 'earnings before interest and taxes',
    'IRR': 'internal rate of return',
    'ROA': 'return on assets',
    'ROE': 'return on equity',
    'NAV': 'net asset value',
    'PE ratio': 'price-to-earnings ratio',
    'EPS growth': 'earnings per share growth',
    'Fiscal Year': 'financial year',
    'CAPEX': 'capital expenditures',
    'APR': 'annual percentage rate',
    'P&L': 'profit and loss',
    'NPM': 'net profit margin',
    'EBT': 'earnings before taxes',
    'EBITDAR': 'earnings before interest, taxes, depreciation, amortization, and rent',
    'PAT': 'profit after tax',
    'COGS': 'cost of goods sold',
    'EBTIDA': 'earnings before taxes, interest, depreciation, and amortization',
    'E&Y': 'Ernst & Young',
    'B2B': 'business to business',
    'B2C': 'business to consumer',
    'LIFO': 'last in, first out',
    'FIFO': 'first in, first out',
    'FCF': 'free cash flow',
    'LTM': 'last twelve months',
    'OPEX': 'operating expenses',
    'TSR': 'total shareholder return',
    'PP&E': 'property, plant, and equipment',
    'PBT': 'profit before tax',
    'EBITDAR margin': 'earnings before interest, taxes, depreciation, amortization, and rent margin',
    'ROIC': 'return on invested capital',
    'YOY': 'year-over-year',
    'MOM': 'month-over-month',
    'EBIT margin': 'earnings before interest and taxes margin',
    'EBTA': 'earnings before taxes and amortization',
    'FTE': 'full-time equivalent',
    'EBIDTA': 'earnings before interest, depreciation, taxes, and amortization',
    'PESTEL': 'Political, Economic, Social, Technological, Environmental, and Legal',
    'KPI': 'key performance indicator',
    'SWOT': 'Strengths, Weaknesses, Opportunities, Threats',
    'EBITDARM': 'earnings before interest, taxes, depreciation, amortization, rent, and management fees',
    'EBITDAX': 'earnings before interest, taxes, depreciation, amortization, and exploration expenses',
    'EBITDAS': 'earnings before interest, taxes, depreciation, amortization, and restructuring costs',
    'EBITDAX-C': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and commodity derivatives',
    'EBITDAX-R': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and asset retirement obligations',
    'EBITDAX-E': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and environmental liabilities',
    # Add more abbreviations and replacements as needed
}

# One alternation over all keys, longest first, so that e.g. "EBITDAR" is not
# consumed as "EBITDA" + "R". The look-behind rejects matches that start in
# the middle of a word or number; the look-ahead rejects matches followed by a
# letter, except for a single plural "s" ("KPIs"). A trailing digit is still
# allowed ("FY2023").
_ABBREVIATION_PATTERN = re.compile(
    r"(?<![A-Za-z0-9])(?:"
    + "|".join(
        re.escape(abbreviation)
        for abbreviation in sorted(_ABBREVIATION_EXPANSIONS, key=len, reverse=True)
    )
    + r")(?=s?(?![A-Za-z]))"
)


def replace_abbreviations(text):
    """Expand known financial abbreviations in ``text``.

    Replacement happens in a single pass, so an expansion is never re-scanned
    and abbreviations embedded in longer words (e.g. "APR" in "APRIL") are
    left alone.

    Args:
        text: Input string.

    Returns:
        ``text`` with every stand-alone abbreviation replaced by its expansion.
    """
    return _ABBREVIATION_PATTERN.sub(
        lambda match: _ABBREVIATION_EXPANSIONS[match.group(0)], text
    )

def clean_and_preprocess(text):
    """Lower-case the long lines of ``text`` and strip English stop words.

    Args:
        text: Raw text; it is split on newlines and only lines longer than
            100 characters are kept.

    Returns:
        The cleaned lines joined by newlines, or an empty string when no line
        is long enough.
    """
    long_lines = [line for line in text.split("\n") if len(line) > 100]
    if not long_lines:
        return ""
    # The stop-word set does not change between lines, so build it once.
    stop_words = set(stopwords.words('english'))
    cleaned_lines = []
    for line in long_lines:
        words = nltk.word_tokenize(line.lower())
        cleaned_lines.append(' '.join(word for word in words if word not in stop_words))
    return "\n".join(cleaned_lines)
def convert_amount_to_number(amount_str):
    """Convert a spelled-out amount to a number via ``w2n.word_to_num``.

    Returns 0 when the conversion raises ``ValueError``.
    """
    try:
        number = w2n.word_to_num(amount_str)
    except ValueError:
        # Conversion failed; fall back to zero.
        number = 0
    return number
def getTopic(encoded_input):
    """Classify one tokenised text and return its topic label.

    Args:
        encoded_input: Mapping of model input names to tensors, as produced by
            ``tokenizerTopic(..., return_tensors='pt')``.

    Returns:
        The ``modelTopic.config.id2label`` entry for the highest logit.
        ``argmax`` is taken over the flattened logits, so this is only
        meaningful for a single-sequence batch.
    """
    # Use the GPU when one is present instead of requiring CUDA unconditionally.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    modelTopic.to(device)
    # Inputs must live on the same device as the model.
    inputs = {name: tensor.to(device) for name, tensor in encoded_input.items()}
    with torch.no_grad():
        logits = modelTopic(**inputs).logits
    predicted_class_id = logits.argmax().item()
    return modelTopic.config.id2label[predicted_class_id]
def selectedCorpusForNextQuarterModel(x,quarter):
    """Pick the transcript sentences most relevant to the quarter's results.

    Steps:
      1. Tokenise ``x`` with ``tokenizerTopic`` and cut it into chunks of
         ``splitSize`` tokens, decoded back to text.
      2. Keep chunks whose topic (``getTopic``) is in ``selectedTopics`` and
         whose text is longer than 10 characters.
      3. Split the kept chunks into sentences and rank them by cosine
         similarity to a short query describing the quarter.

    Args:
        x: Transcript text.
        quarter: Quarter number (``1``-``4`` or the matching string). Any
            other value falls back to a generic query.

    Returns:
        The three best-ranked sentences joined by commas (fewer if fewer are
        available), or an empty string when no chunk is selected.
    """
    number_word_dict = {
        "1": "first",
        "2": "second",
        "3": "third",
        "4": "fourth",
        # Add more entries as needed
    }
    selectedTopics = {"Stock Movement", "Earnings", "IPO", "Stock Commentary", "Currencies", "M&A | Investments", "Financials", "Macro", "Analyst Update", "Company | Product News"}
    splitSize = 256
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Tokenise without truncation or special tokens so the whole transcript
    # can be chunked along the token axis.
    token_ids = tokenizerTopic(x, add_special_tokens=False)["input_ids"]
    chunk_texts = [
        tokenizerTopic.decode(token_ids[start:start + splitSize], skip_special_tokens=True)
        for start in range(0, len(token_ids), splitSize)
    ]

    relevant_chunks = []
    for chunk_text in chunk_texts:
        if len(chunk_text) <= 10:
            continue
        # Re-encode the chunk so special tokens and the batch dimension are added.
        encoded = tokenizerTopic(chunk_text, truncation=True, max_length=512, return_tensors='pt').to(device)
        if getTopic(encoded) in selectedTopics:
            relevant_chunks.append(chunk_text)

    courpus = [
        sentence
        for chunk_text in relevant_chunks
        for sentence in nltk.sent_tokenize(chunk_text)
        if sentence.strip()
    ]
    if not courpus:
        return ""

    # NOTE(review): the query used to end with a reporting date taken from a
    # dataframe that does not exist in this file; no date is available here.
    ordinal = number_word_dict.get(str(quarter).strip())
    des = f"the {ordinal} quarter results" if ordinal else "the quarterly results"

    embeddings = similarityModel.encode([des] + courpus, normalize_embeddings=True, device=device, show_progress_bar=False)
    # Similarity of the query (row 0) against every candidate sentence.
    scores = cosine_similarity(embeddings[:1], embeddings[1:])[0]
    best_indices = sorted(range(len(courpus)), key=lambda idx: scores[idx], reverse=True)[:3]
    return ",".join(courpus[idx] for idx in best_indices)

# ---------------------------------------------------------------------------
# Streamlit page: takes a transcript and a quarter, then shows management
# sentiment, a next-quarter prediction, extracted financial metrics and a
# simple investment recommendation.
# ---------------------------------------------------------------------------
st.header("Transcript Analysis", divider='rainbow')
mainTranscript = st.text_area("Enter the transcript:", height=100)
quarter = st.text_input('Enter your quarter', 'quarter of transcript')
if st.button("Analyze"):
    # Expand abbreviations, drop the participant list and the Q&A section,
    # then lower-case and remove stop words.
    transcript = replace_abbreviations(mainTranscript)
    without_speakers = removeSpeakers(transcript)
    # Keep the text as-is if removeSpeakers yields None.
    if without_speakers is not None:
        transcript = without_speakers
    transcript = removeQA(transcript)
    transcript = clean_and_preprocess(transcript)

    # Cut the cleaned text into chunks of ``splitSize`` whitespace tokens.
    tokens = transcript.split()
    splitSize = 256
    chunks = [" ".join(tokens[start:start + splitSize]) for start in range(0, len(tokens), splitSize)]
    if not chunks:
        # Nothing survived preprocessing; the majority vote below needs data.
        st.warning("No analyzable text was found in the transcript.")
        st.stop()

    st.subheader("Management Sentiment", divider='rainbow')
    # Majority vote over the per-chunk labels.
    sentiment_labels = [sentiment_model(chunk, truncation=True)[0]['label'] for chunk in chunks]
    sentiment = max(sentiment_labels, key=sentiment_labels.count)
    # NOTE(review): "postive" is compared verbatim against the model label --
    # confirm that this spelling is what the model actually emits.
    sentiment_color = "green" if sentiment == "postive" else "red"
    st.markdown(f'<span style="color:{sentiment_color}">{sentiment}</span>', unsafe_allow_html=True)

    st.subheader("Next Quarter Perdiction", divider='rainbow')
    # The classifier returns a single label string. Taking max(..., key=count)
    # over that string would reduce it to its most frequent character, so the
    # label is used directly.
    increase_decrease = increase_decrease_model(
        selectedCorpusForNextQuarterModel(mainTranscript, quarter), truncation=True
    )[0]['label']
    increase_decrease_color = "green" if increase_decrease == "Increase" else "red"
    st.markdown(f'<span style="color:{increase_decrease_color}">{increase_decrease}</span>', unsafe_allow_html=True)

    st.subheader("Financial Metrics", divider='rainbow')
    ner_result = []
    for chunk in chunks:
        ents = nlpPipe(chunk)
        idxx = 0
        while idxx < len(ents):
            word = ents[idxx]["word"]
            word_count = len(word.split())
            if word_count == 2:
                ner_result.append({ents[idxx]["entity_group"]: word})
            elif word_count == 1 and idxx + 2 < len(ents):
                # A one-word entity is merged with the next two entities,
                # which are then skipped. When fewer than two entities follow,
                # the entity is dropped (the former bare ``except`` did the
                # same on IndexError).
                ner_result.append({ents[idxx]["entity_group"]: word + ents[idxx + 1]["word"] + ents[idxx + 2]["word"]})
                idxx = idxx + 2
            idxx = idxx + 1

    profits = [x["profit"] for x in ner_result if "profit" in x]
    revenues = [x["revenue"] for x in ner_result if "revenue" in x]
    expences = [x["expense"] for x in ner_result if "expense" in x]
    for idx, revenue in enumerate(revenues, start=1):
        st.text_input(f'Revenue:{idx}', revenue)
    for idx, profit in enumerate(profits, start=1):
        st.text_input(f'Profit:{idx}', profit)
    for idx, expense in enumerate(expences, start=1):
        st.text_input(f'Expences:{idx}', expense)

    st.subheader("Investment Recommendation", divider='rainbow')
    profitAmount = sum(convert_amount_to_number(x) for x in profits)
    expencesAmount = sum(convert_amount_to_number(x) for x in expences)
    # Recommend only when all three signals agree.
    if increase_decrease == "Increase" and sentiment == "postive" and profitAmount > expencesAmount:
        st.markdown('<span style="color:green">This is a great chance for investment. Do consider it.</span>', unsafe_allow_html=True)
    else:
        st.markdown('<span style="color:red">Not the best chance for investment.</span>', unsafe_allow_html=True)