Spaces:

AhmedTaha012
/

Finance

Build error

App Files Files Community

AhmedTaha012 committed on Aug 30, 2023

Commit

174db76

1 Parent(s): e89f604

Update app.py

Browse files

Files changed (1) hide show

app.py +171 -0

app.py CHANGED Viewed

@@ -4,9 +4,180 @@ import math
 # Three model pipelines: two "text-classification" and one "token-classification".
 # NOTE(review): `pipeline` is presumably transformers.pipeline — its import is not visible here.
 sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")
 increase_decrease_model = pipeline("text-classification", model="AhmedTaha012/nextQuarter-status-V1.1.9")
 ner_model = pipeline("token-classification", model="AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
 # NOTE(review): `st` is presumably streamlit — confirm against the file's imports.
 st.title("Transcript Analysis")
 transcript = st.text_area("Enter the transcript:", height=200)
 # Split on whitespace, then group the tokens into consecutive chunks of at most splitSize tokens.
 tokens=transcript.split()
 splitSize=256
 chunks=[tokens[r*splitSize:(r+1)*splitSize] for r in range(math.ceil(len(tokens)/splitSize))]

 # Same three pipeline assignments as above, repeated in this view of the hunk.
 sentiment_model = pipeline("text-classification", model="AhmedTaha012/managersFeedback-V1.0.7")
 increase_decrease_model = pipeline("text-classification", model="AhmedTaha012/nextQuarter-status-V1.1.9")
 ner_model = pipeline("token-classification", model="AhmedTaha012/finance-ner-v0.0.9-finetuned-ner")
+def getSpeakers(data):
+    """Return the participant lines of a transcript's speaker section.
+
+    The headings "Speakers", "Call participants" and "Call Participants" are
+    checked in that order and the first one found in the text is used. Only
+    the text after the last occurrence of that heading is scanned, and only
+    lines containing "--" are kept.
+
+    Args:
+        data: Full transcript text.
+
+    Returns:
+        The kept lines joined with newlines, or "" when none of the headings
+        is present (the function used to fall through and return None).
+    """
+    for heading in ("Speakers", "Call participants", "Call Participants"):
+        if heading in data:
+            section = data.split(heading)[-1]
+            return "\n".join(line for line in section.split("\n") if "--" in line)
+    # No speaker heading: always hand callers a string, like getQA's "" fallback.
+    return ""
+def removeSpeakers(data):
+    """Return the transcript text that precedes the speaker section.
+
+    The headings "Speakers", "Call participants" and "Call Participants" are
+    checked in that order; the text before the first occurrence of the first
+    heading found is returned.
+
+    Args:
+        data: Full transcript text.
+
+    Returns:
+        The text before the heading, or ``data`` unchanged when no heading is
+        present. The function used to return None in that case, which made
+        the following ``removeQA(transcript)`` call fail on a non-string.
+    """
+    for heading in ("Speakers", "Call participants", "Call Participants"):
+        if heading in data:
+            return data.split(heading)[0]
+    # Nothing to strip: keep the transcript as it is.
+    return data
+def getQA(data):
+    """Return the question-and-answer part of a transcript.
+
+    The headings "Questions and Answers", "Questions & Answers" and "Q&A" are
+    tried in that order. For the first one found, everything after its last
+    occurrence is returned; if none is found the result is "".
+    """
+    for heading in ("Questions and Answers", "Questions & Answers", "Q&A"):
+        if heading in data:
+            pieces = data.split(heading)
+            return pieces[-1]
+    return ""
+def removeQA(data):
+    """Return the transcript text that precedes the question-and-answer part.
+
+    The headings "Questions and Answers", "Questions & Answers" and "Q&A" are
+    tried in that order; the text before the first occurrence of the first
+    heading found is returned.
+
+    Args:
+        data: Transcript text.
+
+    Returns:
+        The text before the heading, or ``data`` unchanged when no heading is
+        present. The fallback used to be "", which discarded the whole
+        transcript whenever it had no Q&A section.
+    """
+    for heading in ("Questions and Answers", "Questions & Answers", "Q&A"):
+        if heading in data:
+            return data.split(heading)[0]
+    # No Q&A section to cut off: keep everything.
+    return data
+def clean_and_preprocess(text):
+    """Lowercase the long lines of *text*, tokenize them and drop stopwords.
+
+    Only lines longer than 100 characters are kept. Each kept line is
+    lowercased, tokenized with ``nltk.word_tokenize``, filtered against the
+    English stopword list and re-joined with single spaces.
+
+    NOTE: a function with this same name and body is defined again further
+    down this file; that later definition is the one in effect at call time.
+
+    Args:
+        text: Text with one passage per line.
+
+    Returns:
+        The cleaned lines joined with newlines ("" when no line is long
+        enough).
+    """
+    long_lines = [line for line in text.split("\n") if len(line) > 100]
+    if not long_lines:
+        # Nothing to clean; also avoids loading the stopword list needlessly.
+        return ""
+    # Build the stopword set once instead of once per line.
+    stop_words = set(stopwords.words('english'))
+    cleaned_lines = []
+    for line in long_lines:
+        words = nltk.word_tokenize(line.lower())
+        cleaned_lines.append(' '.join(word for word in words if word not in stop_words))
+    return "\n".join(cleaned_lines)
+def replace_abbreviations(text):
+    """Expand common finance abbreviations in *text* to their full wording.
+
+    Matching is case-sensitive and done in a single pass. The longest
+    abbreviation that fits at a position wins, so "EBITDAR" is expanded as a
+    whole instead of as "EBITDA" followed by a stray "R". An abbreviation is
+    only expanded when it is not glued to neighbouring letters, so words such
+    as "APRIL" or "SECTOR" are left alone; a trailing plural "s" ("KPIs") and
+    adjacent digits ("FY23") are still accepted.
+
+    Args:
+        text: Text to expand.
+
+    Returns:
+        The text with every recognised abbreviation replaced.
+    """
+    import re  # local import: the file's import block is outside this hunk
+
+    # One entry per abbreviation. The original literal listed many keys twice;
+    # the value kept here is the one that was in effect (the later one).
+    replacements = {
+        'Q1': 'first quarter',
+        'Q2': 'second quarter',
+        'Q3': 'third quarter',
+        'Q4': 'fourth quarter',
+        'q1': 'first quarter',
+        'q2': 'second quarter',
+        'q3': 'third quarter',
+        'q4': 'fourth quarter',
+        'FY': 'fiscal year',
+        'YoY': 'year over year',
+        'MoM': 'month over month',
+        'EBITDA': 'earnings before interest, taxes, depreciation, and amortization',
+        'ROI': 'return on investment',
+        'EPS': 'earnings per share',
+        'P/E': 'price-to-earnings',
+        'DCF': 'discounted cash flow',
+        'CAGR': 'compound annual growth rate',
+        'GDP': 'gross domestic product',
+        'CFO': 'chief financial officer',
+        'GAAP': 'Generally Accepted Accounting Principles',
+        'SEC': 'U.S. Securities and Exchange Commission',
+        'IPO': 'initial public offering',
+        'M&A': 'mergers and acquisitions',
+        'EBIT': 'earnings before interest and taxes',
+        'IRR': 'internal rate of return',
+        'ROA': 'return on assets',
+        'ROE': 'return on equity',
+        'NAV': 'net asset value',
+        'PE ratio': 'price-to-earnings ratio',
+        'EPS growth': 'earnings per share growth',
+        'Fiscal Year': 'financial year',
+        'CAPEX': 'capital expenditures',
+        'APR': 'annual percentage rate',
+        'P&L': 'profit and loss',
+        'NPM': 'net profit margin',
+        'EBT': 'earnings before taxes',
+        'EBITDAR': 'earnings before interest, taxes, depreciation, amortization, and rent',
+        'PAT': 'profit after tax',
+        'COGS': 'cost of goods sold',
+        'EBTIDA': 'earnings before taxes, interest, depreciation, and amortization',
+        'E&Y': 'Ernst & Young',
+        'B2B': 'business to business',
+        'B2C': 'business to consumer',
+        'LIFO': 'last in, first out',
+        'FIFO': 'first in, first out',
+        'FCF': 'free cash flow',
+        'LTM': 'last twelve months',
+        'OPEX': 'operating expenses',
+        'TSR': 'total shareholder return',
+        'PP&E': 'property, plant, and equipment',
+        'PBT': 'profit before tax',
+        'EBITDAR margin': 'earnings before interest, taxes, depreciation, amortization, and rent margin',
+        'ROIC': 'return on invested capital',
+        'YOY': 'year-over-year',
+        'MOM': 'month-over-month',
+        'EBIT margin': 'earnings before interest and taxes margin',
+        'EBTA': 'earnings before taxes and amortization',
+        'FTE': 'full-time equivalent',
+        'EBIDTA': 'earnings before interest, depreciation, taxes, and amortization',
+        'PESTEL': 'Political, Economic, Social, Technological, Environmental, and Legal',
+        'KPI': 'key performance indicator',
+        'SWOT': 'Strengths, Weaknesses, Opportunities, Threats',
+        'EBITDARM': 'earnings before interest, taxes, depreciation, amortization, rent, and management fees',
+        'EBITDAX': 'earnings before interest, taxes, depreciation, amortization, and exploration expenses',
+        'EBITDAS': 'earnings before interest, taxes, depreciation, amortization, and restructuring costs',
+        'EBITDAX-C': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and commodity derivatives',
+        'EBITDAX-R': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and asset retirement obligations',
+        'EBITDAX-E': 'earnings before interest, taxes, depreciation, amortization, exploration expenses, and environmental liabilities',
+        # Add more abbreviations and replacements as needed
+    }
+    # Longest first: regex alternation takes the first alternative that
+    # matches, so longer abbreviations must be tried before their prefixes.
+    alternatives = "|".join(
+        re.escape(abbreviation)
+        for abbreviation in sorted(replacements, key=len, reverse=True)
+    )
+    # Left side: not preceded by a letter. Right side: either a plural "s"
+    # that ends the word, or anything that is not a letter (including the
+    # end of the text).
+    pattern = re.compile(
+        r"(?<![A-Za-z])(?:" + alternatives + r")(?=s\b|(?![A-Za-z]))"
+    )
+    return pattern.sub(lambda match: replacements[match.group(0)], text)
+def clean_and_preprocess(text):
+    """Lowercase the long lines of *text*, tokenize them and drop stopwords.
+
+    Only lines longer than 100 characters are kept. Each kept line is
+    lowercased, tokenized with ``nltk.word_tokenize``, filtered against the
+    English stopword list and re-joined with single spaces.
+
+    NOTE: this redefines the identically named function declared earlier in
+    this file; being the later definition, it is the one callers get.
+
+    Args:
+        text: Text with one passage per line.
+
+    Returns:
+        The cleaned lines joined with newlines ("" when no line is long
+        enough).
+    """
+    long_lines = [line for line in text.split("\n") if len(line) > 100]
+    if not long_lines:
+        # Nothing to clean; also avoids loading the stopword list needlessly.
+        return ""
+    # Build the stopword set once instead of once per line.
+    stop_words = set(stopwords.words('english'))
+    cleaned_lines = []
+    for line in long_lines:
+        words = nltk.word_tokenize(line.lower())
+        cleaned_lines.append(' '.join(word for word in words if word not in stop_words))
+    return "\n".join(cleaned_lines)
 st.title("Transcript Analysis")
 transcript = st.text_area("Enter the transcript:", height=200)
+# Preprocessing, in order: expand abbreviations, keep only the text before
+# the speakers heading, keep only the text before the Q&A heading, then
+# lowercase / tokenize / drop stopwords line by line.
+# The abbreviation expansion was previously called twice in a row; the
+# expanded wording contains no abbreviations, so the second pass was a no-op.
+transcript=replace_abbreviations(transcript)
+transcript=removeSpeakers(transcript)
+transcript=removeQA(transcript)
+transcript=clean_and_preprocess(transcript)
 # Split on whitespace, then group the tokens into consecutive chunks of at most splitSize tokens.
 tokens=transcript.split()
 splitSize=256
 chunks=[tokens[r*splitSize:(r+1)*splitSize] for r in range(math.ceil(len(tokens)/splitSize))]