Adr740 committed
Commit 844cee8 · Parent: a63beb7

Upload 6 files

Files changed (6):
  1. app.py +39 -0
  2. naive_rag.py +98 -0
  3. rag.py +134 -0
  4. rag_benchmark.py +21 -0
  5. requirements.txt +4 -0
  6. temporal_augmented_retrival.py +176 -0
app.py ADDED
@@ -0,0 +1,39 @@
import gradio as gr
from functools import partial
from rag_benchmark import get_benchmark


title = "Prototype Temporal Augmented Retrieval (TAR)"
desc = "Database: 22.4k tweets related to finance, dated from July 12, 2018 to July 19, 2018 - know more about the approach: [link to medium]\ncontact: adrida.github.io"


with gr.Blocks(title=title, theme='nota-ai/theme') as demo:
    gr.Markdown(f"# {title}\n{desc}")
    with gr.Row():
        with gr.Column(scale=10):
            text_area = gr.Textbox(placeholder="Write here", lines=1, label="Ask anything")
        with gr.Column(scale=2):
            api_key = gr.Textbox(placeholder="Paste your OpenAI API key here", lines=1)
            search_button = gr.Button(value="Ask")

    with gr.Row():
        with gr.Tab("Dynamic Temporal Augmented Retrieval (ours)"):
            gr.Markdown("## Dynamic Temporal Augmented Retrieval (ours)\n---")
            tempo = gr.Markdown()
        with gr.Tab("Naive Semantic Search"):
            gr.Markdown("## Simple Semantic Search\n---")
            naive = gr.Markdown()
        with gr.Tab("Traditional RAG (Langchain type)"):
            gr.Markdown("## Augmented Indexed Retrieval\n---")
            classic = gr.Markdown()

    # get_benchmark(query, api_key) returns (temporal, classic RAG, naive) answers,
    # mapped onto the three result tabs above.
    search_function = partial(get_benchmark)

    search_button.click(fn=search_function, inputs=[text_area, api_key], outputs=[tempo, classic, naive])

demo.queue(concurrency_count=100, status_update_rate=500).launch(max_threads=100, show_error=True, debug=True, inline=False)
naive_rag.py ADDED
@@ -0,0 +1,98 @@
import time

import numpy as np
import openai

GPT_MODEL_ANSWER = "gpt-3.5-turbo-16k"


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def get_embedding(text, model="text-embedding-ada-002"):
    try:
        text = text.replace("\n", " ")
    except AttributeError:
        pass  # non-string input is embedded as-is
    try:
        # v1-style client call, consistent with the chat.completions calls below
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception:
        # On failure, back off; a missing embedding is treated as zero similarity downstream.
        time.sleep(2)


def format_query(query):
    resp = {
        "timestamps": [],
        "query": query
    }
    return resp


def semantic_search(df_loc, query, nb_programs_to_display=15):
    embedding = get_embedding(query, model='text-embedding-ada-002')
    filtered_df = df_loc.drop(columns=["url"])

    def wrap_cos(x, y):
        try:
            res = cosine_similarity(x, y)
        except Exception:
            res = 0
        return res

    filtered_df['similarity'] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    results = filtered_df.sort_values('similarity', ascending=False).head(nb_programs_to_display)
    return results


def get_relevant_documents(df, query, nb_programs_to_display=15):
    all_retrieved = [{
        "timestamp": "",
        "tweets": semantic_search(df, query["query"], nb_programs_to_display=nb_programs_to_display)
    }]
    return all_retrieved


def get_final_answer(relevant_documents, query):
    response = relevant_documents[0]
    tweet_entry = response["tweets"]
    context = "\nList of tweets:\n" + str((tweet_entry["text"] + " --- Tweeted by: @" + tweet_entry["source"] + " \n").to_list()) + "\n---"
    USER_PROMPT = f"""
    We have provided context information below.
    ---------------------
    {context}
    ---------------------
    Given the information above, please answer the question: {query}
    """

    response = openai.chat.completions.create(
        model=GPT_MODEL_ANSWER,
        messages=[
            {
                "role": "user",
                "content": USER_PROMPT
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def get_answer(query, df, api_key):
    """This approach is considered naive because it does not augment the user query.
    We retrieve documents directly relevant to the user query and then combine them into an answer.
    The query is formatted with the same structure given to the LLM as in the other two approaches.

    Args:
        query (String): Query given by the user
        df (pd.DataFrame): corpus with embeddings
        api_key (String): OpenAI API key

    Returns:
        String: Answer to the original query
    """
    openai.api_key = api_key
    formatted_query = format_query(query)
    relevant_documents = get_relevant_documents(df, formatted_query, nb_programs_to_display=15)
    response = get_final_answer(relevant_documents, formatted_query)
    return response
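
Note: all three retrieval modules assume the same corpus layout. A minimal sketch of the expected DataFrame and a call into this module (column names are inferred from the code above; the tweets, sources, and embedding values are made up):

# Illustrative only: toy corpus with the columns the retrieval code expects
# (text, source, timestamp, url, embedding). Real embeddings come from
# text-embedding-ada-002; the short vectors below are placeholders.
import pandas as pd
from naive_rag import get_answer

toy_df = pd.DataFrame({
    "text": ["$AAPL hits a new high", "Oil futures slide on supply news"],
    "source": ["trader_one", "energy_desk"],
    "timestamp": ["2018-07-18", "2018-07-19"],
    "url": ["http://example.com/1", "http://example.com/2"],
    "embedding": [[0.1, 0.2, 0.3], [0.2, 0.1, 0.0]],
})

answer = get_answer("What is happening with Apple stock?", toy_df, "sk-...")
print(answer)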
rag.py ADDED
@@ -0,0 +1,134 @@
import time

import numpy as np
import openai
import pandas as pd

GPT_MODEL_AUGMENT = "gpt-3.5-turbo-16k"
GPT_MODEL_ANSWER = "gpt-3.5-turbo-16k"


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def get_embedding(text, model="text-embedding-ada-002"):
    try:
        text = text.replace("\n", " ")
    except AttributeError:
        pass  # non-string input is embedded as-is
    try:
        # v1-style client call, consistent with the chat.completions calls below
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception:
        # On failure, back off; a missing embedding is treated as zero similarity downstream.
        time.sleep(2)


def augment_query(query):
    SYS_PROMPT = """
    On [current date: 19 July] Generate a JSON response with the following structure:

    {
    "timestamps": # Relevant timestamps in which to get data to answer the query,
    "query": # Repeat the user's query,
    }
    Allowed timestamps:
    ['2018-07-18', '2018-07-19', '2018-07-08', '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16', '2018-07-17']

    Ensure the output is always in JSON format and never provide any other response.
    """
    response = openai.chat.completions.create(
        model=GPT_MODEL_AUGMENT,
        messages=[
            {
                "role": "system",
                "content": SYS_PROMPT
            },
            {
                "role": "user",
                "content": query
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def semantic_search(df_loc, query, timestamp, nb_programs_to_display=15):
    timestamp = str(timestamp).strip()
    embedding = get_embedding(query, model='text-embedding-ada-002')
    filtered_df = df_loc[df_loc["timestamp"] == timestamp].drop(columns=["url"])

    def wrap_cos(x, y):
        try:
            res = cosine_similarity(x, y)
        except Exception:
            res = 0
        return res

    filtered_df['similarity'] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    results = filtered_df.sort_values('similarity', ascending=False).head(nb_programs_to_display)
    return results


def get_relevant_documents(df, query, nb_programs_to_display=15):
    # The augmented query is the raw model output; it is expected to be a parseable dict literal.
    query = eval(query)
    all_retrieved = []
    for timestamp in query["timestamps"]:
        all_retrieved.append({
            "timestamp": timestamp,
            "tweets": semantic_search(df, query["query"], timestamp, nb_programs_to_display=nb_programs_to_display)
        })
    return all_retrieved


def get_final_answer(relevant_documents, query):
    context = ""
    for relevant_timestamp in relevant_documents:
        list_tweets = relevant_timestamp["tweets"]
        context += "\nTimestamp: " + relevant_timestamp["timestamp"] + "\nList of tweets:\n" + str((list_tweets["text"] + " --- Tweeted by: @" + list_tweets["source"] + " \n").to_list()) + "\n---"

    USER_PROMPT = f"""
    We have provided context information below.
    ---------------------
    {context}
    ---------------------
    Given this information, please answer the question: {query}
    """
    response = openai.chat.completions.create(
        model=GPT_MODEL_ANSWER,
        messages=[
            {
                "role": "user",
                "content": USER_PROMPT
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def get_answer(query, df, api_key):
    """This traditional RAG approach is implemented without dedicated libraries and includes several steps.
    It starts by augmenting the query, then performs a semantic search on the augmented query, and finally
    combines the augmented query and the retrieved documents into an answer.

    Args:
        query (String): Query given by the user
        df (pd.DataFrame): corpus with embeddings
        api_key (String): OpenAI API key

    Returns:
        String: Answer to the original query
    """
    openai.api_key = api_key
    augmented_query = augment_query(query)
    relevant_documents = get_relevant_documents(df, augmented_query, nb_programs_to_display=10)
    response = get_final_answer(relevant_documents, augmented_query)
    return response
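
For reference, get_relevant_documents expects augment_query to hand back a parseable dict literal, which it then consumes with eval. A hypothetical example of that payload (the dates and wording are invented):

# Illustrative only: the shape of the augmented query produced by augment_query.
augmented_query = """{
    "timestamps": ["2018-07-18", "2018-07-19"],
    "query": "How is the market reacting to Netflix earnings?"
}"""
# get_relevant_documents then runs one per-day semantic search for each listed
# timestamp and returns a list of {"timestamp": ..., "tweets": ...} entries.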
rag_benchmark.py ADDED
@@ -0,0 +1,21 @@
import pandas as pd

from temporal_augmented_retrival import get_answer as get_temporal_answer
from rag import get_answer as get_rag_answer
from naive_rag import get_answer as get_naive_answer

path_to_csv = "../contenu_embedded_august2023_1.csv"
path_to_raw = "stockerbot-export.csv"

# Load the embedded corpus: parse the stored embedding strings back into lists and
# normalise timestamps to YYYY-MM-DD so they match the values allowed in the prompts.
df = pd.read_csv(path_to_csv, on_bad_lines='skip').reset_index(drop=True).drop(columns=['Unnamed: 0'])
df["embedding"] = df.embedding.apply(lambda x: eval(x)).to_list()
df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.strftime('%Y-%m-%d')


def get_benchmark(text_query, api_key):
    global df
    tempo = get_temporal_answer(text_query, df, api_key)
    rag = get_rag_answer(text_query, df, api_key)
    naive = get_naive_answer(text_query, df, api_key)
    return tempo, rag, naive
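
A quick way to smoke-test the pipeline without launching app.py; the question and key below are hypothetical, and the CSV paths above must resolve:

# Illustrative only: exercising the benchmark outside the Gradio UI.
if __name__ == "__main__":
    tempo, rag, naive = get_benchmark("What moved bank stocks this week?", "sk-...")
    print("Temporal:", tempo)
    print("Classic RAG:", rag)
    print("Naive:", naive)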
requirements.txt ADDED
@@ -0,0 +1,4 @@
openai
pandas
numpy
gradio
temporal_augmented_retrival.py ADDED
@@ -0,0 +1,176 @@
import time

import numpy as np
import openai
import pandas as pd

MODEL_AUGMENT = "gpt-3.5-turbo-16k"
MODEL_ANSWER = "gpt-3.5-turbo-16k"


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def get_embedding(text, model="text-embedding-ada-002"):
    try:
        text = text.replace("\n", " ")
    except AttributeError:
        pass  # non-string input is embedded as-is
    try:
        # v1-style client call, consistent with the chat.completions calls below
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception:
        # On failure, back off; a missing embedding is treated as zero similarity downstream.
        time.sleep(2)


def augment_query(query):
    SYS_PROMPT = """
    On [current date: 19 July], you'll receive a finance-related question from a sales manager, without direct interaction. Generate a JSON response with the following structure, considering the temporal aspect:

    {
    "timestamps": # Relevant timestamps to study corresponding tweets for a temporal dynamic aspect (e.g., topic drift). ALWAYS USE THE MINIMAL NUMBER OF TIMESTAMPS POSSIBLE!,
    "query": # Repeat the user's query,
    "similarity_boilerplate": # Boilerplate of relevant documents for cosine similarity search after embedding (it could look like example tweets that might help answer the query),
    }

    Allowed historical timestamps:
    ['2018-07-18', '2018-07-19', '2018-07-08', '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16', '2018-07-17']

    Ensure the output is always in JSON format and never provide any other response.
    """
    response = openai.chat.completions.create(
        model=MODEL_AUGMENT,
        messages=[
            {
                "role": "system",
                "content": SYS_PROMPT
            },
            {
                "role": "user",
                "content": query
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def semantic_search(df_loc, query, timestamp, nb_elements_to_consider=15):
    timestamp = str(timestamp).strip()
    embedding = get_embedding(query, model='text-embedding-ada-002')
    filtered_df = df_loc[df_loc["timestamp"] == timestamp].drop(columns=["url"])

    def wrap_cos(x, y):
        try:
            res = cosine_similarity(x, y)
        except Exception:
            res = 0
        return res

    filtered_df['similarity'] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    results = filtered_df.sort_values('similarity', ascending=False).head(nb_elements_to_consider)
    return results


def condition_check(tweet, query):
    response = openai.chat.completions.create(
        model=MODEL_AUGMENT,
        messages=[
            {
                "role": "system",
                "content": "Only answer with True or False no matter what"
            },
            {
                "role": "user",
                "content": f"Consider this tweet:\n\n{tweet}\n\nIs it relevant to the following query:\n\n{query}"
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    ).choices[0].message.content
    # bool() on a non-empty string is always True, so compare against the literal answer instead.
    return response.strip().lower().startswith("true")


def get_number_relevant_tweets(df, timestamp, query):
    # Rank the whole day's tweets by similarity, then binary-search for the last rank the LLM
    # still judges relevant; that index approximates the volume of relevant tweets.
    sorted_df = semantic_search(df, str(str(query["query"]) + "\n" + str(query["similarity_boilerplate"])), timestamp, nb_elements_to_consider=len(df))
    left, right = 0, len(sorted_df) - 1
    while left <= right:
        mid = (left + right) // 2
        print(f"Currently searching with max range at {mid}")
        if condition_check(sorted_df['text'].iloc[mid], query):
            left = mid + 1
        else:
            right = mid - 1
    print(f"Dichotomy done, found relevant tweets: {left}")
    return left


def get_relevant_documents(df, query, nb_elements_to_consider=10):
    # The augmented query is the raw model output; it is expected to be a parseable dict literal.
    query = eval(query)
    all_retrieved = []
    for timestamp in query["timestamps"]:
        number_of_relevant_tweets = get_number_relevant_tweets(df, timestamp, query)
        all_retrieved.append({
            "timestamp": timestamp,
            "number_of_relevant_tweets": str(number_of_relevant_tweets),
            "tweets": semantic_search(df, str(str(query["query"]) + "\n" + str(query["similarity_boilerplate"])), timestamp, nb_elements_to_consider=min(nb_elements_to_consider, number_of_relevant_tweets))
        })
    return all_retrieved


def get_final_answer(relevant_documents, query):
    context = ""
    for document in relevant_documents:
        print("TIMESTAMP: ", document["timestamp"])
        tweet_entry = document["tweets"]
        context += "\nTimestamp: " + document["timestamp"] + " - Number of relevant tweets in database (EXACT VOLUME OF TWEETS): " + document["number_of_relevant_tweets"] + "\nList of tweets:\n" + str((tweet_entry["text"] + " --- Tweeted by: @" + tweet_entry["source"] + " \n").to_list()) + "\n---"

    SYS_PROMPT = f"""
    You will be fed a list of tweets, each at a specific timestamp, together with the number of relevant tweets. You need to take into account (if needed) the number of tweets relevant to the query and how this number evolved. Your task is to use those tweets to answer, to the best of your knowledge, the following question:

    QUESTION: {query}

    SPECIFIC INSTRUCTIONS AND SYSTEM WARNINGS: You write a properly structured markdown string containing a professional report.
    You ALWAYS specify your sources by citing them (no urls though). Those tweets are samples from the data and are the closest to the query; you should also take into account the volume of tweets obtained.
    Otherwise, it will be considered highly misleading and harmful content.
    You should however always try your best to answer, and you need to study in depth the historical relationship between the timestamps and how it answers the QUESTION.
    You never refer to yourself.
    Make it as if a real human provided a well constructed and structured report/answer extracting the best of the knowledge contained in the context.
    """
    response = openai.chat.completions.create(
        model=MODEL_ANSWER,
        messages=[
            {
                "role": "system",
                "content": SYS_PROMPT
            },
            {
                "role": "user",
                "content": str(context)
            }
        ],
        temperature=1,
        max_tokens=3000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def get_answer(query, df, api_key, nb_elements_to_consider=10):
    openai.api_key = api_key
    augmented_query = augment_query(query)

    relevant_documents = get_relevant_documents(df, augmented_query, nb_elements_to_consider=nb_elements_to_consider)

    response = get_final_answer(relevant_documents, augmented_query)
    print(response)

    return response
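
Note on the dichotomy step: because semantic_search returns tweets sorted by similarity, get_number_relevant_tweets assumes relevance is monotone in rank, so a binary search over "is the tweet at rank mid still relevant?" recovers the count with O(log n) LLM calls instead of n. A self-contained sketch of that idea with the LLM check stubbed out (toy data, not from the repository):

# Illustrative only: the counting trick behind get_number_relevant_tweets.
def count_relevant(sorted_items, is_relevant):
    left, right = 0, len(sorted_items) - 1
    while left <= right:
        mid = (left + right) // 2
        if is_relevant(sorted_items[mid]):
            left = mid + 1
        else:
            right = mid - 1
    return left  # rank of the first irrelevant item == number of relevant items

# Toy data: the first three items are "relevant".
items = ["t1", "t2", "t3", "t4", "t5"]
print(count_relevant(items, lambda t: t in {"t1", "t2", "t3"}))  # -> 3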