# Temporal-RAG-Benchmark / naive_rag.py
# Page header captured with the file: "Adr740's picture", "Upload 6 files", revision 844cee8
import openai
import time
import time
import numpy as np
# Chat model passed as `model` to the completion call in get_final_answer.
GPT_MODEL_ANSWER = "gpt-3.5-turbo-16k"
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (any array-like accepted by ``np.dot``).
        b: Second vector, shape-compatible with ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms, or ``0.0`` when that product of norms is zero.

    Raises:
        Whatever ``np.dot`` / ``np.linalg.norm`` raise for incompatible or
        non-numeric inputs; those errors are deliberately not swallowed here.
    """
    # Compute the dot product first so invalid inputs fail exactly as before.
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero vector has no direction: report "no similarity" instead of
    # dividing by zero, which would produce NaN plus a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product
def get_embedding(text, model="text-embedding-ada-002", max_attempts=3, retry_delay=2):
    """Request an embedding vector for ``text`` from the OpenAI API.

    Uses the module-level ``openai.embeddings.create`` call so that it matches
    the ``openai.chat.completions`` interface used for answer generation in
    this file.

    Args:
        text: Text to embed. Newlines are replaced by spaces when ``text``
            supports ``str.replace``; other values are sent unchanged.
        model: Name of the embedding model.
        max_attempts: Maximum number of API calls before giving up.
        retry_delay: Seconds to wait between two failed attempts.

    Returns:
        The embedding as returned by the API, or ``None`` if every attempt
        failed. Failures are logged rather than raised so callers keep their
        best-effort behaviour.
    """
    import logging  # local import keeps this function self-contained

    logger = logging.getLogger(__name__)

    try:
        text = text.replace("\n", " ")
    except AttributeError:
        # Non-string input (e.g. None): leave it untouched and let the API
        # call below report the problem.
        pass

    for attempt in range(1, max_attempts + 1):
        try:
            response = openai.embeddings.create(input=[text], model=model)
            return response.data[0].embedding
        except Exception as err:  # best-effort boundary: log, retry, never raise
            logger.warning(
                "Embedding request failed (attempt %d/%d): %s",
                attempt,
                max_attempts,
                err,
            )
            if attempt < max_attempts:
                time.sleep(retry_delay)
    return None
def format_query(query):
    """Wrap a raw query in the dictionary structure used by the pipeline.

    Args:
        query: The user's query, stored unchanged under ``"query"``.

    Returns:
        A new dict with an empty ``"timestamps"`` list and the given query.
    """
    return {"timestamps": [], "query": query}
def semantic_search(df_loc, query, nb_programs_to_display=15):
    """Rank the rows of ``df_loc`` by embedding similarity to ``query``.

    Args:
        df_loc: DataFrame with an ``embedding`` column. A ``url`` column, if
            present, is dropped from the result.
        query: Text that is embedded and compared against every row.
        nb_programs_to_display: Maximum number of rows to return.

    Returns:
        A new DataFrame (the input is not modified) with an added
        ``similarity`` column, sorted by descending similarity and truncated
        to ``nb_programs_to_display`` rows.
    """
    query_embedding = get_embedding(query, model='text-embedding-ada-002')
    # drop() returns a new frame, so the caller's DataFrame stays untouched;
    # errors="ignore" lets corpora without a "url" column be searched too.
    ranked_df = df_loc.drop(columns=["url"], errors="ignore")

    def safe_similarity(row_embedding):
        # Rows whose embedding cannot be compared (missing or malformed value,
        # or a failed query embedding) score 0 instead of aborting the search.
        try:
            return cosine_similarity(row_embedding, query_embedding)
        except Exception:
            return 0

    ranked_df["similarity"] = ranked_df["embedding"].apply(safe_similarity)
    return ranked_df.sort_values("similarity", ascending=False).head(nb_programs_to_display)
def get_relevant_documents(df, query, nb_programs_to_display=15):
    """Retrieve the documents most similar to the formatted query.

    Args:
        df: Corpus passed through to ``semantic_search``.
        query: Dict whose ``"query"`` entry holds the text to search for.
        nb_programs_to_display: Maximum number of rows to retrieve.

    Returns:
        A single-element list containing a dict with an empty ``"timestamp"``
        string and the ``semantic_search`` result under ``"tweets"``.
    """
    top_tweets = semantic_search(
        df,
        query["query"],
        nb_programs_to_display=nb_programs_to_display,
    )
    retrieval = {"timestamp": "", "tweets": top_tweets}
    return [retrieval]
def get_final_answer(relevant_documents, query):
    """Ask the chat model to answer ``query`` from the retrieved tweets.

    Args:
        relevant_documents: List whose first element is a dict with a
            ``"tweets"`` entry exposing ``"text"`` and ``"source"`` columns.
        query: Value interpolated verbatim into the prompt as the question.

    Returns:
        The message content of the first choice returned by the chat model.
    """
    tweets = relevant_documents[0]["tweets"]
    tweet_lines = tweets["text"] + " --- Tweeted by: @" + tweets["source"] + " \n"
    context = "\nList of tweets:\n" + str(tweet_lines.to_list()) + "\n---"
    # The prompt literal is kept flush-left on purpose: every character inside
    # the triple quotes, including leading whitespace, is sent to the model.
    user_prompt = f"""
"We have provided context information below.
---------------------
{context}
"\n---------------------\n"
Given the information above, please answer the question: {query}
"""
    completion = openai.chat.completions.create(
        model=GPT_MODEL_ANSWER,
        messages=[{"role": "user", "content": user_prompt}],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return completion.choices[0].message.content
def get_answer(query, df, api_key):
    """Answer a user query with the naive retrieval pipeline.

    The approach is called naive because the user query is not augmented:
    documents are retrieved directly for the query as given and then combined
    into an answer. The query is still wrapped by ``format_query`` so the LLM
    receives the same structure as in the other two approaches.

    Args:
        query (String): Query given by the user
        df (pd.DataFrame()): corpus with embeddings
        api_key (String): OpenAI API key

    Returns:
        String: Answer to the original query
    """
    # Configure the module-level OpenAI client before any API call is made.
    openai.api_key = api_key
    structured_query = format_query(query)
    retrieved = get_relevant_documents(df, structured_query, nb_programs_to_display=15)
    return get_final_answer(retrieved, structured_query)