Spaces:
Runtime error
Runtime error
File size: 1,603 Bytes
92808fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import pandas as pd
from gradio_client import Client
import streamlit as st
from rank_bm25 import BM25Okapi, BM25L, BM25Plus
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
def tokenizer(
    string, reg=r"[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%|\d+\.\d+%}"
):
    """Keep only the parts of ``string`` that match ``reg``.

    Every hyphen is first replaced by a space, then all non-overlapping
    matches of ``reg`` are collected left to right and joined with single
    spaces. With the default pattern the surviving tokens are runs of ASCII
    letters/apostrophes and integer or decimal percentages such as ``40%``
    or ``12.5%``; bare numbers and other punctuation are dropped.

    Args:
        string: Text to filter.
        reg: Regular expression whose matches are kept.

    Returns:
        The matched tokens joined by single spaces (``""`` if nothing
        matches).
    """
    # The default pattern is a raw string so that ``\.`` and ``\d`` reach the
    # regex engine without relying on Python passing unknown string escapes
    # through (which raises a SyntaxWarning on recent interpreters).
    # NOTE(review): the last alternative ends with a literal "}", which looks
    # like a typo; it is kept so the default pattern value is unchanged.
    dehyphenated = string.replace("-", " ")
    return " ".join(re.findall(reg, dehyphenated))
def preprocess_text(text):
    """Normalise ``text`` for keyword matching.

    The text is lower-cased, split with NLTK's ``word_tokenize``, stripped of
    English stop words, Porter-stemmed, re-joined with spaces and finally
    passed through ``tokenizer``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, space-separated string produced by ``tokenizer``.
    """
    words = word_tokenize(text.lower())
    ignored = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    # Filter stop words before stemming, as stems may no longer match them.
    stems = [stemmer.stem(word) for word in words if word not in ignored]
    return tokenizer(" ".join(stems))
# NOTE(review): st.experimental_singleton is deprecated in newer Streamlit
# releases in favour of st.cache_resource — confirm against the pinned version.
@st.experimental_singleton
def get_data():
    """Read the earnings-call keyword CSV into a pandas DataFrame.

    The path is relative, so the file is looked up from the current working
    directory. The function is wrapped in Streamlit's singleton cache
    decorator.
    """
    return pd.read_csv("AMD_Q1_2020_earnings_call_data_keywords.csv")
# NOTE(review): st.experimental_singleton is deprecated in newer Streamlit
# releases in favour of st.cache_resource — confirm against the pinned version.
@st.experimental_singleton
def get_instructor_embedding_model():
    """Create a ``gradio_client.Client`` for the hosted embedding Space.

    The function is wrapped in Streamlit's singleton cache decorator.

    Returns:
        A ``Client`` bound to the hard-coded Space URL.
    """
    return Client("https://awinml-api-instructor-xl-1.hf.space/")
# NOTE(review): st.experimental_singleton is deprecated in newer Streamlit
# releases in favour of st.cache_resource — confirm against the pinned version.
@st.experimental_singleton
def get_bm25_model(data):
    """Build a BM25Plus index over the ``Text`` column of ``data``.

    Each entry of ``data.Text`` is cleaned with ``preprocess_text`` and split
    on single spaces to form the tokenised corpus handed to ``BM25Plus``.

    Args:
        data: Object exposing a ``Text`` attribute with a ``tolist()``
            method (presumably the DataFrame from ``get_data`` — confirm
            against the caller).

    Returns:
        A ``(corpus, bm25)`` tuple: the original, unprocessed texts and the
        fitted ``BM25Plus`` model.
    """
    corpus = data.Text.tolist()
    token_lists = [preprocess_text(document).split(" ") for document in corpus]
    return corpus, BM25Plus(token_lists)
|