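"""Book recommendation demo: SBERT retrieval + cross-encoder re-ranking behind a Gradio UI.

Pipeline: fuzzy-match the input title against the dataset, embed its summary
(or a summary fetched from DuckDuckGo if the book is unknown), retrieve the
top-N nearest books by cosine similarity, then re-rank them with a cross-encoder.
"""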
import gradio as gr
import pandas as pd
import pickle
import torch
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from duckduckgo_search import DDGS
from fuzzywuzzy import process
# Load book dataset
df = pd.read_csv("data/books_summary_cleaned.csv")

# Load precomputed SBERT summary embeddings
with open("model/sbert_embeddings2.pkl", "rb") as f:
    book_embeddings = pickle.load(f)

# Load models
retriever_model = SentenceTransformer("all-mpnet-base-v2") # More accurate than MiniLM
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # More accurate ranking
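
# NOTE: the pickled embeddings are assumed to have been produced offline with the
# same retriever model over the "summaries" column, roughly (hypothetical sketch):
#     embs = retriever_model.encode(df["summaries"].tolist(), convert_to_tensor=True)
#     with open("model/sbert_embeddings2.pkl", "wb") as f:
#         pickle.dump(embs, f)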

def fetch_summary_duckduckgo(book_title, log):
    """Fetch a book summary from the web via DuckDuckGo search."""
    log.append(f"Searching the internet for '{book_title}' summary...")
    with DDGS() as ddgs:
        search_results = list(ddgs.text(f"{book_title} book summary", max_results=3))
    # Skip the first two search results and use the first remaining one
    # that has a text body.
    for result in search_results[2:]:
        if "body" in result:
            log.append("Summary found from the web.")
            return result["body"], log
    log.append("No summary found on the web.")
    return None, log

def get_best_match(book_title, book_list, log):
    """Find the closest matching book title in the dataset using fuzzy matching."""
    best_match, score = process.extractOne(book_title, book_list)
    if score > 90:
        if book_title != best_match:
            log.append(f"Typo detected! Corrected '{book_title}' to '{best_match}'.")
        return best_match, log
    log.append(f"No correction needed for '{book_title}'.")
    return book_title, log

def retrieve_candidates(book_title, top_n=10):
    """Retrieve the top-N most similar books using SBERT embeddings."""
    log = ["Starting book recommendation process..."]
    book_title, log = get_best_match(book_title, df["book_name"].values.tolist(), log)
    if book_title in df["book_name"].values:
        # Known book: reuse its precomputed embedding and stored summary.
        book_idx = df[df["book_name"] == book_title].index[0]
        query_embedding = book_embeddings[book_idx]
        summary = df[df["book_name"] == book_title]["summaries"].values[0]
        log.append(f"Book '{book_title}' found in the dataset.")
    else:
        # Unknown book: fetch a summary from the web and embed it on the fly.
        log.append(f"Book '{book_title}' not found in the dataset.")
        summary, log = fetch_summary_duckduckgo(book_title, log)
        if summary is None:
            log.append("No summary found. Cannot proceed with recommendation.")
            return None, None, None, log
        query_embedding = retriever_model.encode(summary, convert_to_tensor=True)
    scores = util.cos_sim(query_embedding, book_embeddings)[0]
    # Skip the top hit, which is the query book itself when it comes from the dataset.
    top_indices = torch.argsort(scores, descending=True)[1:top_n + 1].tolist()
    log.append(f"Top {top_n} similar books retrieved from the dataset.")
    return book_title, summary, df.iloc[top_indices][["book_name", "summaries"]].values.tolist(), log

def rerank_books(query_title, query_summary, candidates, log):
    """Re-rank candidate books with a cross-encoder and return the top 5 titles."""
    pairs = [(query_summary, cand_summary) for _, cand_summary in candidates]
    scores = reranker_model.predict(pairs)
    ranked_books = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    log.append("Books re-ranked with the cross-encoder; returning the top 5 books.")
    return [candidate[0] for candidate, _ in ranked_books[:5]], log  # candidate[0] is the book name

def recommend_books(book_title):
    """Complete recommendation pipeline with logging."""
    book_title, summary, candidates, log = retrieve_candidates(book_title, top_n=10)
    if book_title is None:
        log.append("Book not found. Exiting recommendation process.")
        return "Book not found", "\n".join(log)
    recommendations, log = rerank_books(book_title, summary, candidates, log)
    log.append("Recommendation process complete.")
    return ", ".join(recommendations), "\n".join(log)

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Book Recommendation System")
    gr.Markdown("Enter a book title to find similar books based on summaries.")
    with gr.Row():
        book_input = gr.Textbox(label="Enter Book Title")
        submit_btn = gr.Button("Recommend")
    output = gr.Textbox(label="Recommended Books", interactive=False)
    log_output = gr.Textbox(label="Logs", interactive=False, lines=10)  # Log display
    submit_btn.click(recommend_books, inputs=book_input, outputs=[output, log_output])

# Run the app
if __name__ == "__main__":
    demo.launch()
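    # Usage (assuming this file is saved as app.py): run `python app.py`;
    # Gradio serves the UI at http://127.0.0.1:7860 by default.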