Spaces:

awinml
/

earnings-calls-qa

Sleeping

File size: 9,303 Bytes

from ast import literal_eval
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Entity Extraction


def generate_ner_docs_prompt(query):
    """
    Build the few-shot prompt used to extract companies and time duration.

    The prompt consists of a fixed instruction block with worked examples,
    a blank line, and then the user's question wrapped in the
    "###Input: ... ASSISTANT:" frame.

    Args:
      query: The question text to insert into the prompt.
    Returns:
      The complete prompt string.
    """
    instructions = """USER: Extract the company names and time duration mentioned in the question. The entities should be extracted in the following format: {"companies": list of companies mentioned in the question,"start-duration": ("start-quarter", "start-year"), "end-duration": ("end-quarter", "end-year")}. Return {"companies": None, "start-duration": (None, None), "end-duration": (None, None)} if the entities are not found.

Examples:
What is Nvidia's visibility in the data center business in Q2 2020?
{"companies": ["Nvidia"], "start-duration": ("Q2", "2020"), "end-duration": ("Q2", "2020")}
What is Intel's update on the server chip road map and strategy for 2019?
{"companies": ["Intel"], "start-duration": ("Q1", "2019"), "end-duration": ("Q4", "2019")}
What are the opportunities and challenges in the Indian market for Amazon in 2016?
{"companies": ["Amazon"], "start-duration": ("Q1", "2016"), "end-duration": ("Q4", "2016")}
What did analysts ask about the Cisco's Webex?
{"companies": ["Cisco"], "start-duration": (None, None), "end-duration": (None, None)}
What is the comparative performance analysis between Intel and AMD in key overlapping segments such as PC, Gaming, and Data Centers in Q1 to Q3 2016?
{"companies": ["Intel", "AMD"], "start-duration": ("Q1", "2016"), "end-duration": ("Q3", "2016")}
How has the growth been for AMD in the PC market in 2020?
{"companies": ["AMD"], "start-duration": ("Q1", "2016"), "end-duration": ("Q4", "2020")}
How did Microsoft and Amazon perform in terms of reliability and scalability of cloud for the years 2016 and 2017?
{"companies": ["Microsoft", "Amazon"], "start-duration": ("Q1", "2016"), "end-duration": ("Q4", "2017")}""".replace(
        '{"companies": ["AMD"], "start-duration": ("Q1", "2016"), "end-duration": ("Q4", "2020")}',
        '{"companies": ["AMD"], "start-duration": ("Q1", "2020"), "end-duration": ("Q4", "2020")}',
    )
    # Instruction block and question frame are separated by one blank line.
    return f"{instructions}\n\n###Input: {query}\nASSISTANT:"


def extract_entities_docs(query, model):
    """
    Ask the model for the companies and time duration mentioned in a query.

    The prompt from generate_ner_docs_prompt is sent through
    model.predict(..., api_name="/predict"). The reply is parsed with
    literal_eval and is expected to be the string form of a dictionary:
    {"companies": list of companies mentioned in the question,"start-duration": ("start-quarter", "start-year"), "end-duration": ("end-quarter", "end-year")}

    Returns:
      The tuple (companies, start_quarter, start_year, end_quarter, end_year).
      The same tuple is printed before it is returned.
    """
    raw_reply = model.predict(
        generate_ner_docs_prompt(query), api_name="/predict"
    )
    parsed = literal_eval(raw_reply)

    # Durations are unpacked before "companies" is read, so a reply missing
    # several keys fails on the duration keys first.
    start_quarter, start_year = parsed["start-duration"]
    end_quarter, end_year = parsed["end-duration"]
    companies = parsed["companies"]

    entities = (companies, start_quarter, start_year, end_quarter, end_year)
    print(entities)
    return entities


def year_quarter_range(start_quarter, start_year, end_quarter, end_year):
    """
    List every (quarter, year) pair from the start quarter to the end quarter, both inclusive.

    Returns an empty list when any of the four arguments is None. Otherwise
    the resulting list is printed and then returned.

    Example:
    year_quarter_range("Q2", "2020", "Q3", "2021")
    [('Q2', '2020'), ('Q3', '2020'), ('Q4', '2020'), ('Q1', '2021'), ('Q2', '2021'), ('Q3', '2021')]
    """
    bounds = (start_quarter, start_year, end_quarter, end_year)
    if any(bound is None for bound in bounds):
        return []

    quarter_labels = ["Q1", "Q2", "Q3", "Q4"]
    first_index = quarter_labels.index(start_quarter)
    last_index = quarter_labels.index(end_quarter)
    first_year = int(start_year)
    last_year = int(end_year)

    pairs = []
    for year in range(first_year, last_year + 1):
        # Only the boundary years are trimmed; years in between contribute
        # all four quarters.
        low = first_index if year == first_year else 0
        high = last_index + 1 if year == last_year else len(quarter_labels)
        pairs.extend((label, str(year)) for label in quarter_labels[low:high])

    print(pairs)
    return pairs


def clean_companies(company_list):
    """
    Return the ticker symbols for the recognised entries of company_list.

    Each entry may be a company name (e.g. "Nvidia") or a ticker symbol
    (e.g. "nvda"). Matching ignores case and surrounding whitespace.
    Entries that are neither a known company nor a known ticker are dropped.
    Order and duplicates of the recognised entries are preserved.

    Args:
      company_list: A list of company names / tickers, or None.
    Returns:
      A list of upper-case ticker strings. Empty when company_list is None
      or empty.
    """
    # The entity-extraction prompt asks the model to answer with
    # "companies": None when no company is found, so None must be accepted.
    if not company_list:
        return []

    company_ticker_map = {
        "apple": "AAPL",
        "amd": "AMD",
        "amazon": "AMZN",
        "cisco": "CSCO",
        "google": "GOOGL",
        "microsoft": "MSFT",
        "nvidia": "NVDA",
        "asml": "ASML",
        "intel": "INTC",
        "micron": "MU",
    }
    # Lookups are done on the lower-cased name, so only the lower-case form
    # of each ticker is needed.
    known_tickers = {ticker.lower() for ticker in company_ticker_map.values()}

    ticker_list = []
    for company in company_list:
        name = company.strip().lower()
        if name in company_ticker_map:
            ticker_list.append(company_ticker_map[name])
        elif name in known_tickers:
            ticker_list.append(name.upper())
    return ticker_list


def ticker_year_quarter_tuples_creator(ticker_list, year_quarter_range_list):
    """
    Combine every ticker with every (quarter, year) pair.

    Args:
      ticker_list: A list of ticker strings.
      year_quarter_range_list: A list of (quarter, year) pairs.
    Returns:
      A list of (ticker, quarter, year) tuples, grouped by ticker in input
      order. Empty when either input is empty.
    """
    return [
        (ticker, quarter, year)
        for ticker in ticker_list
        for quarter, year in year_quarter_range_list
    ]


# Keyword Extraction


def generate_ner_keywords_prompt(query):
    """
    Build the few-shot prompt used to extract topic keywords from a question.

    The prompt consists of a fixed instruction block with worked examples,
    followed on the next line by the user's question wrapped in the
    "###Input: ... ASSISTANT:" frame.

    Args:
      query: The question text to insert into the prompt.
    Returns:
      The complete prompt string.
    """
    instructions = """USER: Extract the entities which describe the key theme and topics being asked in the question. Extract the entities in the following format: {"entities":["keywords"]}.
Examples:
What is Intel's update on the server chip roadmap and strategy for Q1 2019?
{"entities":["server"]}
What are the opportunities and challenges in the Indian market for Amazon from Q1 to Q3 in 2016?
{"entities":["indian"]}
What is the comparative performance analysis between Intel and AMD in key overlapping segments such as PC, Gaming, and Data Centers in Q1 2016?
{"entities":["PC","Gaming","Data Centers"]}
What was Google's and Microsoft's capex spend for the last 2 years?
{"entities":["capex"]}
What did analysts ask about the cloud during Microsoft's earnings call in Q1 2018?
{"entities":["cloud"]}
What was the growth in Apple services revenue for 2017 Q3?
{"entities":["services"]}"""
    # Unlike the docs prompt, only a single newline separates the examples
    # from the question frame here.
    return f"{instructions}\n###Input: {query}\nASSISTANT:"


def extract_entities_keywords(query, model):
    """
    Ask the model for the topic keywords mentioned in a query.

    The prompt from generate_ner_keywords_prompt is sent through
    model.predict(..., api_name="/predict"). The reply is parsed with
    literal_eval and is expected to be the string form of a dictionary:
    {"entities":["keywords"]}

    Returns:
      The value stored under the "entities" key of the parsed reply.
    """
    raw_reply = model.predict(
        generate_ner_keywords_prompt(query), api_name="/predict"
    )
    return literal_eval(raw_reply)["entities"]


def expand_list_of_lists(list_of_lists):
    """
    Flatten one level of nesting.

    Args:
      list_of_lists: A list of lists of strings.
    Returns:
      A single list holding the items of every inner list, in order.
    """
    return [item for inner in list_of_lists for item in inner]


def all_keywords_combs(list_of_cleaned_keywords):
    """
    Expand keywords with their lower-cased, stemmed and lemmatized variants.

    For every keyword the result contains the keyword itself, its
    lower-case form, its PorterStemmer stem and its WordNetLemmatizer lemma.
    Duplicates are removed.

    The input is not modified: the variants are collected in a new set
    instead of being appended to the caller's list.

    Args:
      list_of_cleaned_keywords: An iterable of keyword strings.
    Returns:
      A new list of unique strings. Its order is unspecified because it is
      built from a set.
    """
    # Materialise once so the input is walked a single time even if it is a
    # one-shot iterable.
    keywords = list(list_of_cleaned_keywords)

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    variants = set(keywords)
    for text in keywords:
        variants.add(text.lower())
        # Stem and lemma are taken from the keyword as given (not from its
        # lower-cased form), matching the variants produced previously.
        variants.add(stemmer.stem(text))
        variants.add(lemmatizer.lemmatize(text))

    return list(variants)


def create_incorrect_entities_list():
    """
    Build the list of words that are not accepted as keywords.

    The base words are quarter labels, years, company names and generic
    question words; they are expanded with all_keywords_combs before being
    returned.
    """
    quarter_words = ["q1", "q2", "q3", "q4"]
    year_words = ["2016", "2017", "2018", "2019", "2020"]
    company_words = [
        "apple",
        "amd",
        "amazon",
        "cisco",
        "google",
        "microsoft",
        "nvidia",
        "asml",
        "intel",
        "micron",
    ]
    generic_words = [
        "strategy",
        "roadmap",
        "impact",
        "opportunities",
        "challenges",
        "growth",
        "performance",
        "analysis",
        "segments",
        "comparative",
        "overlapping",
        "acquisition",
        "revenue",
    ]
    return all_keywords_combs(
        quarter_words + year_words + company_words + generic_words
    )


def clean_keywords_all_combs(keywords_list):
    """
    Split keywords into words, drop the unwanted ones and expand the rest.

    Each keyword is split on whitespace and lower-cased. Words found in
    create_incorrect_entities_list() are removed, and the remaining words
    are expanded with all_keywords_combs.

    Args:
      keywords_list: A list of keyword strings (possibly multi-word).
    Returns:
      The list returned by all_keywords_combs for the surviving words.
    """
    # A set makes each membership test below constant-time instead of a
    # scan over the whole removal list.
    words_to_remove = set(create_incorrect_entities_list())

    # split() without a separator collapses runs of whitespace, so an empty
    # keyword or a double space never produces an empty-string word
    # (split(" ") would, and "" would then be returned as a keyword).
    lower_words = [
        word.lower() for keyword in keywords_list for word in keyword.split()
    ]
    cleaned_keywords = [
        word for word in lower_words if word not in words_to_remove
    ]
    return all_keywords_combs(cleaned_keywords)