File size: 1,579 Bytes
92808fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93b6d4c
92808fd
 
 
 
 
93b6d4c
92808fd
 
 
 
 
93b6d4c
92808fd
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
from gradio_client import Client
import streamlit as st

from rank_bm25 import BM25Okapi, BM25L, BM25Plus
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re


def tokenizer(
    string, reg=r"[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%"
):
    """Extract word-like tokens and percentage tokens from *string*.

    Hyphens are first replaced with spaces so hyphenated compounds split
    into separate words; the matched tokens are re-joined with single spaces.

    Args:
        string: Input text to tokenize.
        reg: Regex whose alternatives match, in order: words (letters,
            apostrophes, hyphens), integer percentages ("10%"), and
            decimal percentages ("3.5%").

    Returns:
        A single space-separated string of the matched tokens.

    Note: the original default pattern ended with a fourth alternative
    ``\\d+\\.\\d+%}`` — the trailing ``}`` made it a dead branch that could
    never match (and it duplicated the decimal-percentage alternative), so
    it has been removed. The pattern is now a raw string so ``\\.`` is not
    an invalid escape sequence.
    """
    string = string.replace("-", " ")
    return " ".join(re.findall(reg, string))


def preprocess_text(text):
    """Normalize *text* for BM25 indexing.

    Pipeline: lowercase -> NLTK word tokenization -> English stop-word
    removal -> Porter stemming -> re-tokenization via ``tokenizer`` (which
    keeps only word-like and percentage tokens).

    Args:
        text: Raw document text.

    Returns:
        A space-separated string of stemmed, stop-word-free tokens.
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    stemmed = [
        stemmer.stem(tok)
        for tok in word_tokenize(text.lower())
        if tok not in stop_words
    ]
    return tokenizer(" ".join(stemmed))


@st.cache_data
def get_data():
    """Load the earnings-call dataset from the bundled CSV, cached by Streamlit.

    Uses ``st.cache_data`` (not ``st.cache_resource``): a DataFrame is
    serializable mutable data, and ``cache_resource`` would hand every
    session the *same* object, letting in-place mutations leak across
    reruns. ``cache_data`` returns a fresh copy per call, which is the
    behavior Streamlit documents for cached DataFrames.

    Returns:
        pd.DataFrame: Contents of
        ``AMD_Q1_2020_earnings_call_data_keywords.csv`` (expected to
        include at least a ``Text`` column — see ``get_bm25_model``).
    """
    data = pd.read_csv("AMD_Q1_2020_earnings_call_data_keywords.csv")
    return data


@st.cache_resource
def get_instructor_embedding_model():
    """Return a cached Gradio client for the hosted Instructor-XL embedding API.

    ``cache_resource`` is correct here: the client holds a live network
    connection and should be shared across reruns rather than re-created.
    """
    endpoint = "https://awinml-api-instructor-xl-1.hf.space/"
    return Client(endpoint)


@st.cache_resource
def get_bm25_model(data):
    """Build a BM25Plus index over the ``Text`` column of *data*.

    Each document is cleaned with ``preprocess_text`` and split on single
    spaces before indexing.

    Args:
        data: DataFrame with a ``Text`` column of raw document strings.

    Returns:
        tuple: ``(corpus, bm25)`` where ``corpus`` is the list of raw
        (uncleaned) documents and ``bm25`` is the fitted ``BM25Plus`` index.
    """
    corpus = data.Text.tolist()
    tokenized_corpus = [preprocess_text(doc).split(" ") for doc in corpus]
    return corpus, BM25Plus(tokenized_corpus)