Spaces:
Running
Running
Saiteja Solleti
committed on
Commit
·
a46269a
1
Parent(s):
748ac82
fine tuning and reranking is pushed
Browse files- app.py +7 -2
- finetuneresults.py +61 -0
- generationhelper.py +8 -0
- requirements.txt +2 -1
app.py
CHANGED
@@ -6,6 +6,7 @@ from createmilvusschema import CreateMilvusDbSchema
|
|
6 |
from insertmilvushelper import EmbedAllDocumentsAndInsert
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
from searchmilvushelper import SearchTopKDocuments
|
|
|
9 |
|
10 |
from model import generate_response
|
11 |
from huggingface_hub import login
|
@@ -15,6 +16,7 @@ from huggingface_hub import dataset_info
|
|
15 |
|
16 |
# Load embedding model
|
17 |
QUERY_EMBEDDING_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
18 |
WINDOW_SIZE = 5
|
19 |
OVERLAP = 2
|
20 |
RETRIVE_TOP_K_SIZE=10
|
@@ -38,8 +40,11 @@ EmbedAllDocumentsAndInsert(QUERY_EMBEDDING_MODEL, rag_extracted_data, db_collect
|
|
38 |
"""
|
39 |
query = "what would the net revenue have been in 2015 if there wasn't a stipulated settlement from the business combination in october 2015?"
|
40 |
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
43 |
|
44 |
|
45 |
def chatbot(prompt):
|
|
|
6 |
from insertmilvushelper import EmbedAllDocumentsAndInsert
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
from searchmilvushelper import SearchTopKDocuments
|
9 |
+
from finetuneresults import FineTuneAndRerankSearchResults
|
10 |
|
11 |
from model import generate_response
|
12 |
from huggingface_hub import login
|
|
|
16 |
|
17 |
# Load embedding model
|
18 |
QUERY_EMBEDDING_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
|
19 |
+
RERANKING_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
20 |
WINDOW_SIZE = 5
|
21 |
OVERLAP = 2
|
22 |
RETRIVE_TOP_K_SIZE=10
|
|
|
40 |
"""
|
41 |
query = "what would the net revenue have been in 2015 if there wasn't a stipulated settlement from the business combination in october 2015?"
|
42 |
|
43 |
+
results_for_top10_chunks = SearchTopKDocuments(db_collection, query, QUERY_EMBEDDING_MODEL, top_k=RETRIVE_TOP_K_SIZE)
|
44 |
+
|
45 |
+
reranked_results = FineTuneAndRerankSearchResults(results_for_top10_chunks, rag_extracted_data, query, RERANKING_MODEL)
|
46 |
+
|
47 |
+
print(reranked_results)
|
48 |
|
49 |
|
50 |
def chatbot(prompt):
|
finetuneresults.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import CrossEncoder
|
2 |
+
|
3 |
+
"""
|
4 |
+
Retrieves unique full documents based on the top-ranked document IDs.
|
5 |
+
|
6 |
+
Args:
|
7 |
+
top_documents (list): List of dictionaries containing 'doc_id'.
|
8 |
+
df (pd.DataFrame): The dataset containing document IDs and text.
|
9 |
+
|
10 |
+
Returns:
|
11 |
+
pd.DataFrame: A DataFrame with 'doc_id' and 'document'.
|
12 |
+
"""
|
13 |
+
def retrieve_full_documents(top_documents, df):
|
14 |
+
|
15 |
+
# Extract unique doc_ids
|
16 |
+
unique_doc_ids = list(set(doc["doc_id"] for doc in top_documents))
|
17 |
+
|
18 |
+
# Print for debugging
|
19 |
+
print(f"Extracted Doc IDs: {unique_doc_ids}")
|
20 |
+
|
21 |
+
# Filter DataFrame where 'id' matches any of the unique_doc_ids
|
22 |
+
filtered_df = df[df["id"].isin(unique_doc_ids)][["id", "documents"]].drop_duplicates(subset="id")
|
23 |
+
|
24 |
+
# Rename columns for clarity
|
25 |
+
filtered_df = filtered_df.rename(columns={"id": "doc_id", "documents": "document"})
|
26 |
+
|
27 |
+
return filtered_df
|
28 |
+
|
29 |
+
"""
|
30 |
+
Reranks the retrieved documents based on their relevance to the query using a Cross-Encoder model.
|
31 |
+
Args:
|
32 |
+
query (str): The search query.
|
33 |
+
retrieved_docs (pd.DataFrame): DataFrame with 'doc_id' and 'document'.
|
34 |
+
model_name (str): Name of the Cross-Encoder model.
|
35 |
+
Returns:
|
36 |
+
pd.DataFrame: A sorted DataFrame with doc_id, document, and reranking score.
|
37 |
+
"""
|
38 |
+
|
39 |
+
def rerank_documents(query, retrieved_docs_df, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
|
40 |
+
|
41 |
+
# Load Cross-Encoder model
|
42 |
+
model = CrossEncoder(model_name)
|
43 |
+
|
44 |
+
# Prepare query-document pairs
|
45 |
+
query_doc_pairs = [(query, " ".join(doc)) for doc in retrieved_docs_df["document"]]
|
46 |
+
|
47 |
+
# Compute relevance scores
|
48 |
+
scores = model.predict(query_doc_pairs)
|
49 |
+
|
50 |
+
# Add scores to the DataFrame
|
51 |
+
retrieved_docs_df["relevance_score"] = scores
|
52 |
+
|
53 |
+
# Sort by score in descending order (higher score = more relevant)
|
54 |
+
reranked_docs_df = retrieved_docs_df.sort_values(by="relevance_score", ascending=False).reset_index(drop=True)
|
55 |
+
|
56 |
+
return reranked_docs_df
|
57 |
+
|
58 |
+
def FineTuneAndRerankSearchResults(top_10_chunk_results, rag_extarcted_data, question, reranking_model):
    """Resolve chunk hits to full documents and rerank them for *question*.

    Args:
        top_10_chunk_results (list[dict]): Chunk-level hits with 'doc_id' keys.
        rag_extarcted_data (pd.DataFrame): Dataset with 'id'/'documents' columns.
        question (str): User query to rank the documents against.
        reranking_model (str): Cross-Encoder model name passed to the reranker.

    Returns:
        pd.DataFrame: Documents sorted by 'relevance_score', descending.
    """
    unique_docs = retrieve_full_documents(top_10_chunk_results, rag_extarcted_data)
    reranked_results = rerank_documents(question, unique_docs, reranking_model)
    # BUG FIX: the original returned the rerank_documents function object
    # instead of the computed results DataFrame.
    return reranked_results
|
generationhelper.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Helper module: shared Groq API client for LLM answer generation.
import os
from groq import Groq

# Credentials come from the environment — never hard-code the API key.
groq_token = os.getenv("GROQ_TOKEN")

# Module-level client, created once at import time and shared by importers.
# NOTE(review): if GROQ_TOKEN is unset, groq_token is None and API calls
# will fail at request time — confirm the deployment sets this variable.
groq_client = Groq(
    api_key = groq_token
)
|
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ torch
|
|
4 |
huggingface_hub
|
5 |
pymilvus
|
6 |
nltk
|
7 |
-
sentence-transformers
|
|
|
|
4 |
huggingface_hub
|
5 |
pymilvus
|
6 |
nltk
|
7 |
+
sentence-transformers
|
8 |
+
Groq
|