Spaces:

awinml
/

instructor-xl-embeddings

Runtime error

App Files Files Community

Upload 7 files

by awinml - opened Jun 4, 2023

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+280

-0

Files changed (7) hide show

app.py +160 -0
requirements.txt +11 -0
utils/__init__.py +0 -0
utils/bm25.py +0 -0
utils/models.py +58 -0
utils/nltkmodules.py +5 -0
utils/retriever.py +46 -0

app.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import numpy as np
+import streamlit_scrollable_textbox as stx
+import pinecone
+import streamlit as st
+st.set_page_config(layout="wide")  # isort: split
+from utils import nltkmodules
+from utils.models import (
+    get_bm25_model,
+    tokenizer,
+    get_data,
+    get_instructor_embedding_model,
+    preprocess_text,
+)
+from utils.retriever import (
+    query_pinecone,
+    format_context,
+    format_query,
+    get_bm25_search_hits,
+    retrieve_transcript,
+)
# ---------------------------------------------------------------------------
# Streamlit page script.
#
# Layout: a two-column page. The left column collects the question, the
# query-side embedding instruction and the number of results; a form below
# picks the document-side instruction (which selects the Pinecone index) and
# triggers retrieval; the right column shows the retrieved passages; a tab at
# the bottom shows the full transcript.
# ---------------------------------------------------------------------------
st.title("Instructor XL Embeddings")
st.write(
    "The app compares the performance of the Instructor-XL Embedding Model on the text from AMD's Q1 2020 Earnings Call Transcript."
)

data = get_data()
col1, col2 = st.columns([3, 3], gap="medium")
instructor_model = get_instructor_embedding_model()

# Preset questions offered in the "Question Examples" dropdown.
question_choice = [
    "What was discussed regarding Ryzen revenue performance?",
    "What is the impact of the enterprise and cloud on AMD's growth",
    "What was the impact of situation in China on the sales and revenue?",
]

# Preset instructions that are sent along with the question to the
# embedding model.
question_instruction_choice = [
    "Represent the financial question for retrieving supporting documents:",
    "Represent the financial question for retrieving supporting sentences:",
    "Represent the finance query for retrieving supporting documents:",
    "Represent the finance query for retrieving related documents:",
    "Represent a finance query for retrieving relevant documents:",
]

with col1:
    st.subheader("Question")
    st.write(
        "Choose a preset question example from the dropdown or enter a question in the text box."
    )
    default_query = st.selectbox("Question Examples", question_choice)
    query_text = st.text_area(
        "Question",
        value=default_query,
    )
    st.subheader("Question Embedding-Instruction")
    st.write(
        "Choose a preset instruction example from the dropdown or enter an instruction in the text box."
    )
    default_query_embedding_instruction = st.selectbox(
        "Question Embedding-Instruction Examples", question_instruction_choice
    )
    query_embedding_instruction = st.text_area(
        "Question Embedding-Instruction",
        value=default_query_embedding_instruction,
    )
    num_results = int(
        st.number_input("Number of Results to query", 1, 15, value=5)
    )

# Load the BM25 model at page load (as before) so it is ready by the time the
# user submits a query.
corpus, bm25 = get_bm25_model(data)

# Document-side instructions; each one corresponds to its own Pinecone index.
text_embedding_instructions_choice = [
    "Represent the financial statement for retrieval:",
    "Represent the financial document for retrieval:",
    "Represent the finance passage for retrieval:",
    "Represent the earnings call transcript for retrieval:",
    "Represent the earnings call transcript sentence for retrieval:",
    "Represent the earnings call transcript answer for retrieval:",
]
index_mapping = {
    "Represent the financial statement for retrieval:": "week14-instructor-xl-amd-fsr-1",
    "Represent the financial document for retrieval:": "week14-instructor-xl-amd-fdr-2",
    "Represent the finance passage for retrieval:": "week14-instructor-xl-amd-fpr-3",
    "Represent the earnings call transcript for retrieval:": "week14-instructor-xl-amd-ectr-4",
    "Represent the earnings call transcript sentence for retrieval:": "week14-instructor-xl-amd-ects-5",
    "Represent the earnings call transcript answer for retrieval:": "week14-instructor-xl-amd-ecta-6",
}

# Must exist before the results column is rendered: on the first run (and on
# any rerun that is not a form submission) nothing has been retrieved yet.
output_text = []

with st.form("my_form"):
    text_embedding_instruction = st.selectbox(
        "Select instruction for Text Embedding",
        text_embedding_instructions_choice,
    )
    submitted = st.form_submit_button("Submit")
    if submitted:
        # All per-query work lives here so that editing a widget does not
        # trigger an embedding request or a Pinecone connection on its own.

        # Sparse stage: despite its name, `sparse_scores` holds corpus row
        # positions ordered from highest to lowest BM25 score.
        tokenized_query = preprocess_text(query_text).split()
        sparse_scores = np.argsort(
            bm25.get_scores(tokenized_query), axis=0
        )[::-1]
        indices = get_bm25_search_hits(corpus, sparse_scores, 50)

        # Dense stage: embed the question with the chosen instruction.
        dense_embedding = instructor_model.predict(
            query_embedding_instruction,
            query_text,
            api_name="/predict",
        )

        # Each index has its own API key stored under "pinecone_<index name>".
        pinecone_index_name = index_mapping[text_embedding_instruction]
        pinecone.init(
            api_key=st.secrets[f"pinecone_{pinecone_index_name}"],
            environment="asia-southeast1-gcp-free",
        )
        pinecone_index = pinecone.Index(pinecone_index_name)

        matches = query_pinecone(
            dense_embedding, num_results, pinecone_index, indices
        )
        context = format_query(matches)
        output_text = format_context(context)

# st.tabs returns a sequence with one container per label; take the single
# container so it can be used in a `with` statement below.
tab1 = st.tabs(["View transcript"])[0]

with col2:
    st.subheader("Retrieved Text:")
    for output in output_text:
        st.write(
            f"<ul><li><p>{output}</p></li></ul>",
            unsafe_allow_html=True,
        )

with tab1:
    file_text = retrieve_transcript()
    with st.expander("See Transcript"):
        st.subheader("AMD Q1 2020 Earnings Call Transcript:")
        stx.scrollableTextbox(
            file_text, height=700, border=False, fontFamily="Helvetica"
        )

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+pandas
+nltk
+tqdm
+pinecone-client
+torch
+git+https://github.com/UKPLab/sentence-transformers.git
+streamlit
+streamlit-scrollable-textbox
+InstructorEmbedding
+gradio_client
+rank_bm25

utils/__init__.py ADDED Viewed

File without changes

utils/bm25.py ADDED Viewed

File without changes

utils/models.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import pandas as pd
+from gradio_client import Client
+import streamlit as st
+from rank_bm25 import BM25Okapi, BM25L, BM25Plus
+import numpy as np
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+import re
def tokenizer(
    string, reg=r"[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%|\d+\.\d+%"
):
    """Keep only word-like and percentage tokens from *string*.

    Hyphens are first replaced by spaces, then every non-overlapping match
    of ``reg`` is collected and the matches are joined with single spaces.

    Args:
        string: Text to filter.
        reg: Regular expression describing the tokens to keep. The default
            keeps runs of letters/apostrophes and integer or decimal
            percentages such as ``40%`` or ``12.5%``. Bare numbers without
            a percent sign are dropped.

    Returns:
        The retained tokens joined by a single space (``""`` if none match).
    """
    # The pattern is a raw string so that ``\.`` and ``\d`` reach the regex
    # engine as written, without relying on Python passing unknown string
    # escapes through. The previous default also ended in a stray ``}``
    # which made its last alternative require a literal closing brace.
    string = string.replace("-", " ")
    return " ".join(re.findall(reg, string))
def preprocess_text(text):
    """Normalise *text* for BM25 matching.

    The text is lower-cased, split with NLTK's ``word_tokenize``, stripped
    of English stop words, Porter-stemmed, re-joined with spaces and finally
    passed through ``tokenizer``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text as a single string.
    """
    words = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words("english"))
    kept_words = [word for word in words if word not in english_stop_words]
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in kept_words]
    return tokenizer(" ".join(stemmed_words))
# st.cache_resource replaces st.experimental_singleton, which Streamlit has
# deprecated; requirements.txt does not pin a Streamlit version.
@st.cache_resource
def get_data():
    """Load the earnings-call CSV into a DataFrame.

    Returns:
        The contents of ``AMD_Q1_2020_earnings_call_data_keywords.csv``
        (read relative to the working directory) as a pandas DataFrame.
    """
    return pd.read_csv("AMD_Q1_2020_earnings_call_data_keywords.csv")
# st.cache_resource replaces st.experimental_singleton, which Streamlit has
# deprecated; requirements.txt does not pin a Streamlit version.
@st.cache_resource
def get_instructor_embedding_model():
    """Create the Gradio client used to request embeddings.

    Returns:
        A ``gradio_client.Client`` connected to the hosted Space at
        ``https://awinml-api-instructor-xl-1.hf.space/``.
    """
    return Client("https://awinml-api-instructor-xl-1.hf.space/")
# st.cache_resource replaces st.experimental_singleton, which Streamlit has
# deprecated; requirements.txt does not pin a Streamlit version.
@st.cache_resource
def get_bm25_model(data):
    """Build a BM25Plus model over the ``Text`` column of *data*.

    Args:
        data: Object exposing a ``Text`` attribute with a ``tolist()``
            method (the DataFrame returned by ``get_data`` in this app).

    Returns:
        A ``(corpus, bm25)`` tuple: the raw texts as a list, and the
        ``BM25Plus`` model fitted on their preprocessed tokens.
    """
    corpus = data.Text.tolist()
    # Whitespace split (no argument) so a document that is empty after
    # preprocessing becomes [] rather than [""], and so documents are
    # tokenised the same way as the query is in app.py.
    tokenized_corpus = [preprocess_text(doc).split() for doc in corpus]
    bm25 = BM25Plus(tokenized_corpus)
    return corpus, bm25

utils/nltkmodules.py ADDED Viewed

	@@ -0,0 +1,5 @@

+import nltk
# Download the NLTK data packages when this module is imported; the module
# has no other contents.
for _resource in ("wordnet", "punkt", "stopwords"):
    nltk.download(_resource)

utils/retriever.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import numpy as np
def query_pinecone(dense_vec, top_k, index, indices):
    """Query *index* for the nearest answers among a set of candidates.

    Args:
        dense_vec: Query embedding, passed through as ``vector``.
        top_k: Maximum number of matches to request.
        index: Object exposing a ``query(...)`` method (a Pinecone index in
            this app).
        indices: Iterable of integer-like candidate ids; only records whose
            ``index`` metadata field is one of them are considered.

    Returns:
        The ``"matches"`` entry of the query response.
    """
    # The caller derives these ids from np.argsort, so they may be NumPy
    # integers. Convert to built-in ints so the filter contains only plain
    # Python values (NumPy integers are not JSON-serialisable by the
    # standard library; presumably the client serialises the filter).
    candidate_ids = [int(candidate) for candidate in indices]
    response = index.query(
        vector=dense_vec,
        top_k=top_k,
        # Restrict to answer passages that are in the candidate set.
        filter={
            "QA_Flag": {"$eq": "Answer"},
            "index": {"$in": candidate_ids},
        },
        include_metadata=True,
    )
    return response["matches"]
def format_query(query_results):
    """Reduce search matches to ``(text, score)`` pairs.

    Args:
        query_results: Iterable of matches, each supporting
            ``match["metadata"]["Text"]`` and ``match["score"]``.

    Returns:
        A list of ``(text, score)`` tuples in the same order as the input.
    """
    pairs = []
    for match in query_results:
        passage = match["metadata"]["Text"]
        pairs.append((passage, match["score"]))
    return pairs
def format_context(context):
    """Render ``(text, score)`` pairs as display strings.

    Args:
        context: Iterable of ``(text, score)`` pairs.

    Returns:
        A list with one two-line string per pair: the text on the first
        line and its cosine similarity on the second.
    """
    return [
        f"Text: {passage}\nCosine Similarity: {similarity}"
        for passage, similarity in context
    ]
def get_bm25_search_hits(corpus, sparse_scores, top_n=50):
    """Return the first *top_n* entries of a ranked list of corpus indices.

    Args:
        corpus: The document list. It is not consulted; the parameter is
            kept so existing callers keep working.
        sparse_scores: Iterable of integer-like corpus positions, already
            ordered best-first (despite the name, these are positions, not
            scores).
        top_n: Maximum number of positions to return.

    Returns:
        A list of at most ``top_n`` built-in ``int`` positions, in the
        order given. Empty when ``top_n`` is zero or negative.
    """
    indices = []
    for idx in sparse_scores:
        # ">=" caps the result at exactly top_n; the earlier "<=" check on
        # the length let one extra hit through. Stop as soon as the quota
        # is met instead of walking the rest of the ranking.
        if len(indices) >= top_n:
            break
        # int() turns NumPy integers (e.g. from np.argsort) into plain ints.
        indices.append(int(idx))
    return indices
def retrieve_transcript():
    """Read the earnings-call transcript from disk.

    Returns:
        The full contents of ``2020-Apr-28-AMD.txt`` (resolved relative to
        the working directory) as a string.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle; it was previously left open.
    # The encoding is explicit so the result does not depend on the host's
    # locale (assumes the transcript is UTF-8 — confirm against the file).
    with open("2020-Apr-28-AMD.txt", "r", encoding="utf-8") as transcript:
        return transcript.read()