Adr740 committed
Commit 844cee8 · Parent: a63beb7

Upload 6 files

Files changed (6):
  1. app.py +39 -0
  2. naive_rag.py +98 -0
  3. rag.py +134 -0
  4. rag_benchmark.py +21 -0
  5. requirements.txt +4 -0
  6. temporal_augmented_retrival.py +176 -0
app.py ADDED
@@ -0,0 +1,39 @@
import gradio as gr
from functools import partial
from rag_benchmark import get_benchmark


title = "Prototype Temporal Augmented Retrieval (TAR)"
desc = "Database: 22.4k tweets related to finance, dated from July 12, 2018 to July 19, 2018 - know more about the approach: [link to medium]\ncontact: adrida.github.io"


with gr.Blocks(title=title, theme='nota-ai/theme') as demo:
    gr.Markdown(f"# {title}\n{desc}")
    with gr.Row():
        with gr.Column(scale=10):
            text_area = gr.Textbox(placeholder="Write here", lines=1, label="Ask anything")
        with gr.Column(scale=2):
            api_key = gr.Textbox(placeholder="Paste your OpenAI API key here", lines=1)
            search_button = gr.Button(value="Ask")

    with gr.Row():
        with gr.Tab("Dynamic Temporal Augmented Retrieval (ours)"):
            gr.Markdown("## Dynamic Temporal Augmented Retrieval (ours)\n---")
            tempo = gr.Markdown()
        with gr.Tab("Naive Semantic Search"):
            gr.Markdown("## Simple Semantic Search\n---")
            naive = gr.Markdown()
        with gr.Tab("Traditional RAG (Langchain type)"):
            gr.Markdown("## Augmented Indexed Retrieval\n---")
            classic = gr.Markdown()

    # get_benchmark(query, api_key) returns (temporal, classic RAG, naive) answers,
    # mapped onto the three result tabs above.
    search_function = partial(get_benchmark)

    search_button.click(fn=search_function, inputs=[text_area, api_key], outputs=[tempo, classic, naive])

demo.queue(concurrency_count=100, status_update_rate=500).launch(max_threads=100, show_error=True, debug=True, inline=False)
naive_rag.py ADDED
@@ -0,0 +1,98 @@
import time

import numpy as np
import openai

GPT_MODEL_ANSWER = "gpt-3.5-turbo-16k"


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def get_embedding(text, model="text-embedding-ada-002"):
    try:
        text = text.replace("\n", " ")
    except AttributeError:
        pass  # non-string input is embedded as-is
    try:
        # v1-style client call, consistent with the chat.completions calls below
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception:
        # On failure, back off; a missing embedding is treated as zero similarity downstream.
        time.sleep(2)


def format_query(query):
    resp = {
        "timestamps": [],
        "query": query
    }
    return resp


def semantic_search(df_loc, query, nb_programs_to_display=15):
    embedding = get_embedding(query, model='text-embedding-ada-002')
    filtered_df = df_loc.drop(columns=["url"])

    def wrap_cos(x, y):
        try:
            res = cosine_similarity(x, y)
        except Exception:
            res = 0
        return res

    filtered_df['similarity'] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    results = filtered_df.sort_values('similarity', ascending=False).head(nb_programs_to_display)
    return results


def get_relevant_documents(df, query, nb_programs_to_display=15):
    all_retrieved = [{
        "timestamp": "",
        "tweets": semantic_search(df, query["query"], nb_programs_to_display=nb_programs_to_display)
    }]
    return all_retrieved


def get_final_answer(relevant_documents, query):
    response = relevant_documents[0]
    tweet_entry = response["tweets"]
    context = "\nList of tweets:\n" + str((tweet_entry["text"] + " --- Tweeted by: @" + tweet_entry["source"] + " \n").to_list()) + "\n---"
    USER_PROMPT = f"""
    We have provided context information below.
    ---------------------
    {context}
    ---------------------
    Given the information above, please answer the question: {query}
    """

    response = openai.chat.completions.create(
        model=GPT_MODEL_ANSWER,
        messages=[
            {
                "role": "user",
                "content": USER_PROMPT
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def get_answer(query, df, api_key):
    """This approach is considered naive because it does not augment the user query.
    We retrieve documents directly relevant to the user query and then combine them into an answer.
    The query is formatted with the same structure given to the LLM as in the other two approaches.

    Args:
        query (String): Query given by the user
        df (pd.DataFrame): corpus with embeddings
        api_key (String): OpenAI API key

    Returns:
        String: Answer to the original query
    """
    openai.api_key = api_key
    formatted_query = format_query(query)
    relevant_documents = get_relevant_documents(df, formatted_query, nb_programs_to_display=15)
    response = get_final_answer(relevant_documents, formatted_query)
    return response
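
Note: all three retrieval modules assume the same corpus layout. A minimal sketch of the expected DataFrame and a call into this module (column names are inferred from the code above; the tweets, sources, and embedding values are made up):

# Illustrative only: toy corpus with the columns the retrieval code expects
# (text, source, timestamp, url, embedding). Real embeddings come from
# text-embedding-ada-002; the short vectors below are placeholders.
import pandas as pd
from naive_rag import get_answer

toy_df = pd.DataFrame({
    "text": ["$AAPL hits a new high", "Oil futures slide on supply news"],
    "source": ["trader_one", "energy_desk"],
    "timestamp": ["2018-07-18", "2018-07-19"],
    "url": ["http://example.com/1", "http://example.com/2"],
    "embedding": [[0.1, 0.2, 0.3], [0.2, 0.1, 0.0]],
})

answer = get_answer("What is happening with Apple stock?", toy_df, "sk-...")
print(answer)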
rag.py ADDED
@@ -0,0 +1,134 @@
import time

import numpy as np
import openai
import pandas as pd

GPT_MODEL_AUGMENT = "gpt-3.5-turbo-16k"
GPT_MODEL_ANSWER = "gpt-3.5-turbo-16k"


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def get_embedding(text, model="text-embedding-ada-002"):
    try:
        text = text.replace("\n", " ")
    except AttributeError:
        pass  # non-string input is embedded as-is
    try:
        # v1-style client call, consistent with the chat.completions calls below
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception:
        # On failure, back off; a missing embedding is treated as zero similarity downstream.
        time.sleep(2)


def augment_query(query):
    SYS_PROMPT = """
    On [current date: 19 July] Generate a JSON response with the following structure:

    {
    "timestamps": # Relevant timestamps in which to get data to answer the query,
    "query": # Repeat the user's query,
    }
    Allowed timestamps:
    ['2018-07-18', '2018-07-19', '2018-07-08', '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16', '2018-07-17']

    Ensure the output is always in JSON format and never provide any other response.
    """
    response = openai.chat.completions.create(
        model=GPT_MODEL_AUGMENT,
        messages=[
            {
                "role": "system",
                "content": SYS_PROMPT
            },
            {
                "role": "user",
                "content": query
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def semantic_search(df_loc, query, timestamp, nb_programs_to_display=15):
    timestamp = str(timestamp).strip()
    embedding = get_embedding(query, model='text-embedding-ada-002')
    filtered_df = df_loc[df_loc["timestamp"] == timestamp].drop(columns=["url"])

    def wrap_cos(x, y):
        try:
            res = cosine_similarity(x, y)
        except Exception:
            res = 0
        return res

    filtered_df['similarity'] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    results = filtered_df.sort_values('similarity', ascending=False).head(nb_programs_to_display)
    return results


def get_relevant_documents(df, query, nb_programs_to_display=15):
    # The augmented query is the raw model output; it is expected to be a parseable dict literal.
    query = eval(query)
    all_retrieved = []
    for timestamp in query["timestamps"]:
        all_retrieved.append({
            "timestamp": timestamp,
            "tweets": semantic_search(df, query["query"], timestamp, nb_programs_to_display=nb_programs_to_display)
        })
    return all_retrieved


def get_final_answer(relevant_documents, query):
    context = ""
    for relevant_timestamp in relevant_documents:
        list_tweets = relevant_timestamp["tweets"]
        context += "\nTimestamp: " + relevant_timestamp["timestamp"] + "\nList of tweets:\n" + str((list_tweets["text"] + " --- Tweeted by: @" + list_tweets["source"] + " \n").to_list()) + "\n---"

    USER_PROMPT = f"""
    We have provided context information below.
    ---------------------
    {context}
    ---------------------
    Given this information, please answer the question: {query}
    """
    response = openai.chat.completions.create(
        model=GPT_MODEL_ANSWER,
        messages=[
            {
                "role": "user",
                "content": USER_PROMPT
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def get_answer(query, df, api_key):
    """This traditional RAG approach is implemented without dedicated libraries and includes several steps.
    It starts by augmenting the query, then performs a semantic search on the augmented query, and finally
    combines the augmented query and the retrieved documents into an answer.

    Args:
        query (String): Query given by the user
        df (pd.DataFrame): corpus with embeddings
        api_key (String): OpenAI API key

    Returns:
        String: Answer to the original query
    """
    openai.api_key = api_key
    augmented_query = augment_query(query)
    relevant_documents = get_relevant_documents(df, augmented_query, nb_programs_to_display=10)
    response = get_final_answer(relevant_documents, augmented_query)
    return response
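
For reference, get_relevant_documents expects augment_query to hand back a parseable dict literal, which it then consumes with eval. A hypothetical example of that payload (the dates and wording are invented):

# Illustrative only: the shape of the augmented query produced by augment_query.
augmented_query = """{
    "timestamps": ["2018-07-18", "2018-07-19"],
    "query": "How is the market reacting to Netflix earnings?"
}"""
# get_relevant_documents then runs one per-day semantic search for each listed
# timestamp and returns a list of {"timestamp": ..., "tweets": ...} entries.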
rag_benchmark.py ADDED
@@ -0,0 +1,21 @@
import pandas as pd

from temporal_augmented_retrival import get_answer as get_temporal_answer
from rag import get_answer as get_rag_answer
from naive_rag import get_answer as get_naive_answer

path_to_csv = "../contenu_embedded_august2023_1.csv"
path_to_raw = "stockerbot-export.csv"

# Load the embedded corpus: parse the stored embedding strings back into lists and
# normalise timestamps to YYYY-MM-DD so they match the values allowed in the prompts.
df = pd.read_csv(path_to_csv, on_bad_lines='skip').reset_index(drop=True).drop(columns=['Unnamed: 0'])
df["embedding"] = df.embedding.apply(lambda x: eval(x)).to_list()
df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.strftime('%Y-%m-%d')


def get_benchmark(text_query, api_key):
    global df
    tempo = get_temporal_answer(text_query, df, api_key)
    rag = get_rag_answer(text_query, df, api_key)
    naive = get_naive_answer(text_query, df, api_key)
    return tempo, rag, naive
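
A quick way to smoke-test the pipeline without launching app.py; the question and key below are hypothetical, and the CSV paths above must resolve:

# Illustrative only: exercising the benchmark outside the Gradio UI.
if __name__ == "__main__":
    tempo, rag, naive = get_benchmark("What moved bank stocks this week?", "sk-...")
    print("Temporal:", tempo)
    print("Classic RAG:", rag)
    print("Naive:", naive)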
requirements.txt ADDED
@@ -0,0 +1,4 @@
openai
pandas
numpy
gradio
temporal_augmented_retrival.py ADDED
@@ -0,0 +1,176 @@
import time

import numpy as np
import openai
import pandas as pd

MODEL_AUGMENT = "gpt-3.5-turbo-16k"
MODEL_ANSWER = "gpt-3.5-turbo-16k"


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def get_embedding(text, model="text-embedding-ada-002"):
    try:
        text = text.replace("\n", " ")
    except AttributeError:
        pass  # non-string input is embedded as-is
    try:
        # v1-style client call, consistent with the chat.completions calls below
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception:
        # On failure, back off; a missing embedding is treated as zero similarity downstream.
        time.sleep(2)


def augment_query(query):
    SYS_PROMPT = """
    On [current date: 19 July], you'll receive a finance-related question from a sales manager, without direct interaction. Generate a JSON response with the following structure, considering the temporal aspect:

    {
    "timestamps": # Relevant timestamps to study corresponding tweets for a temporal dynamic aspect (e.g., topic drift). ALWAYS USE THE MINIMAL NUMBER OF TIMESTAMPS POSSIBLE!,
    "query": # Repeat the user's query,
    "similarity_boilerplate": # Boilerplate of relevant documents for cosine similarity search after embedding (it could look like example tweets that might help answer the query),
    }

    Allowed historical timestamps:
    ['2018-07-18', '2018-07-19', '2018-07-08', '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16', '2018-07-17']

    Ensure the output is always in JSON format and never provide any other response.
    """
    response = openai.chat.completions.create(
        model=MODEL_AUGMENT,
        messages=[
            {
                "role": "system",
                "content": SYS_PROMPT
            },
            {
                "role": "user",
                "content": query
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def semantic_search(df_loc, query, timestamp, nb_elements_to_consider=15):
    timestamp = str(timestamp).strip()
    embedding = get_embedding(query, model='text-embedding-ada-002')
    filtered_df = df_loc[df_loc["timestamp"] == timestamp].drop(columns=["url"])

    def wrap_cos(x, y):
        try:
            res = cosine_similarity(x, y)
        except Exception:
            res = 0
        return res

    filtered_df['similarity'] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    results = filtered_df.sort_values('similarity', ascending=False).head(nb_elements_to_consider)
    return results


def condition_check(tweet, query):
    response = openai.chat.completions.create(
        model=MODEL_AUGMENT,
        messages=[
            {
                "role": "system",
                "content": "Only answer with True or False no matter what"
            },
            {
                "role": "user",
                "content": f"Consider this tweet:\n\n{tweet}\n\nIs it relevant to the following query:\n\n{query}"
            }
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    ).choices[0].message.content
    # bool() on a non-empty string is always True, so compare against the literal answer instead.
    return response.strip().lower().startswith("true")


def get_number_relevant_tweets(df, timestamp, query):
    # Rank the whole day's tweets by similarity, then binary-search for the last rank the LLM
    # still judges relevant; that index approximates the volume of relevant tweets.
    sorted_df = semantic_search(df, str(str(query["query"]) + "\n" + str(query["similarity_boilerplate"])), timestamp, nb_elements_to_consider=len(df))
    left, right = 0, len(sorted_df) - 1
    while left <= right:
        mid = (left + right) // 2
        print(f"Currently searching with max range at {mid}")
        if condition_check(sorted_df['text'].iloc[mid], query):
            left = mid + 1
        else:
            right = mid - 1
    print(f"Dichotomy done, found relevant tweets: {left}")
    return left


def get_relevant_documents(df, query, nb_elements_to_consider=10):
    # The augmented query is the raw model output; it is expected to be a parseable dict literal.
    query = eval(query)
    all_retrieved = []
    for timestamp in query["timestamps"]:
        number_of_relevant_tweets = get_number_relevant_tweets(df, timestamp, query)
        all_retrieved.append({
            "timestamp": timestamp,
            "number_of_relevant_tweets": str(number_of_relevant_tweets),
            "tweets": semantic_search(df, str(str(query["query"]) + "\n" + str(query["similarity_boilerplate"])), timestamp, nb_elements_to_consider=min(nb_elements_to_consider, number_of_relevant_tweets))
        })
    return all_retrieved


def get_final_answer(relevant_documents, query):
    context = ""
    for document in relevant_documents:
        print("TIMESTAMP: ", document["timestamp"])
        tweet_entry = document["tweets"]
        context += "\nTimestamp: " + document["timestamp"] + " - Number of relevant tweets in database (EXACT VOLUME OF TWEETS): " + document["number_of_relevant_tweets"] + "\nList of tweets:\n" + str((tweet_entry["text"] + " --- Tweeted by: @" + tweet_entry["source"] + " \n").to_list()) + "\n---"

    SYS_PROMPT = f"""
    You will be fed a list of tweets, each at a specific timestamp, together with the number of relevant tweets. You need to take into account (if needed) the number of tweets relevant to the query and how this number evolved. Your task is to use those tweets to answer, to the best of your knowledge, the following question:

    QUESTION: {query}

    SPECIFIC INSTRUCTIONS AND SYSTEM WARNINGS: You write a properly structured markdown string containing a professional report.
    You ALWAYS specify your sources by citing them (no urls though). Those tweets are samples from the data and are the closest to the query; you should also take into account the volume of tweets obtained.
    Otherwise, it will be considered highly misleading and harmful content.
    You should however always try your best to answer, and you need to study in depth the historical relationship between the timestamps and how it answers the QUESTION.
    You never refer to yourself.
    Make it as if a real human provided a well constructed and structured report/answer extracting the best of the knowledge contained in the context.
    """
    response = openai.chat.completions.create(
        model=MODEL_ANSWER,
        messages=[
            {
                "role": "system",
                "content": SYS_PROMPT
            },
            {
                "role": "user",
                "content": str(context)
            }
        ],
        temperature=1,
        max_tokens=3000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def get_answer(query, df, api_key, nb_elements_to_consider=10):
    openai.api_key = api_key
    augmented_query = augment_query(query)

    relevant_documents = get_relevant_documents(df, augmented_query, nb_elements_to_consider=nb_elements_to_consider)

    response = get_final_answer(relevant_documents, augmented_query)
    print(response)

    return response
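
Note on the dichotomy step: because semantic_search returns tweets sorted by similarity, get_number_relevant_tweets assumes relevance is monotone in rank, so a binary search over "is the tweet at rank mid still relevant?" recovers the count with O(log n) LLM calls instead of n. A self-contained sketch of that idea with the LLM check stubbed out (toy data, not from the repository):

# Illustrative only: the counting trick behind get_number_relevant_tweets.
def count_relevant(sorted_items, is_relevant):
    left, right = 0, len(sorted_items) - 1
    while left <= right:
        mid = (left + right) // 2
        if is_relevant(sorted_items[mid]):
            left = mid + 1
        else:
            right = mid - 1
    return left  # rank of the first irrelevant item == number of relevant items

# Toy data: the first three items are "relevant".
items = ["t1", "t2", "t3", "t4", "t5"]
print(count_relevant(items, lambda t: t in {"t1", "t2", "t3"}))  # -> 3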