Spaces:
Runtime error
Runtime error
import pandas as pd | |
from gradio_client import Client | |
import streamlit as st | |
from rank_bm25 import BM25Okapi, BM25L, BM25Plus | |
import numpy as np | |
import nltk | |
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
from nltk.stem.porter import PorterStemmer | |
import re | |
def tokenizer(
    string, reg=r"[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%|\d+\.\d+%}"
):
    """Keep only the tokens of ``string`` that match ``reg``.

    Hyphens are replaced by spaces first, so a hyphenated compound is split
    into separate words. Every non-overlapping match of ``reg`` is then
    collected, in order, and the matches are joined with single spaces.

    Args:
        string: Text to filter.
        reg: Regular expression selecting the tokens to keep. The default
            keeps runs of letters/apostrophes and integer or decimal
            percentages; a number that is not followed by ``%`` matches
            none of the alternatives and is dropped.

    Returns:
        The matched tokens joined by single spaces, or an empty string when
        nothing matches.
    """
    # The default pattern is a raw string so the regex escapes are not parsed
    # as (invalid) string escapes; its value is unchanged from before.
    # NOTE(review): the trailing "}" in the last alternative looks like a
    # typo -- left as is so the default pattern stays identical.
    string = string.replace("-", " ")
    return " ".join(re.findall(reg, string))
def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``; English
    stop words are discarded and the remaining tokens are reduced with the
    Porter stemmer. The stems are joined with spaces and passed through
    ``tokenizer`` as a final filter.

    Args:
        text: The string to preprocess.

    Returns:
        The cleaned string produced by ``tokenizer``.
    """
    words = word_tokenize(text.lower())

    english_stops = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Filter and stem in one pass; stemming is independent per token.
    stems = [stemmer.stem(word) for word in words if word not in english_stops]

    return tokenizer(" ".join(stems))
def get_data(path="AMD_Q1_2020_earnings_call_data_keywords.csv"):
    """Load the dataset from a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file to read. Defaults to the AMD Q1 2020
            earnings-call keywords file, resolved relative to the current
            working directory, so calls without arguments behave as before.

    Returns:
        A pandas ``DataFrame`` holding the contents of the file.
    """
    return pd.read_csv(path)
def get_instructor_embedding_model():
    """Create a ``gradio_client`` client for the hosted embedding Space.

    Returns:
        A ``Client`` pointed at the ``awinml-api-instructor-xl-1`` Space.
    """
    # NOTE(review): presumably this Space serves Instructor-XL embeddings,
    # judging by its name -- confirm against the Space itself.
    return Client("https://awinml-api-instructor-xl-1.hf.space/")
def get_bm25_model(data):
    """Build a BM25Plus index over the ``Text`` entries of ``data``.

    Each entry is cleaned with ``preprocess_text`` and split on single spaces
    to form the token list given to ``BM25Plus``.

    Args:
        data: Object exposing a ``Text`` attribute with a ``tolist()`` method
            (presumably a DataFrame with a ``Text`` column -- confirm with
            the caller).

    Returns:
        A ``(corpus, bm25)`` tuple: the original, unprocessed texts as a list
        and the ``BM25Plus`` model built from their preprocessed tokens.
    """
    documents = data.Text.tolist()
    token_lists = [
        preprocess_text(document).split(" ") for document in documents
    ]
    return documents, BM25Plus(token_lists)