import gradio as gr import pandas as pd import pickle import torch from sentence_transformers import SentenceTransformer, util, CrossEncoder from duckduckgo_search import DDGS from fuzzywuzzy import process # Load book dataset df = pd.read_csv("data/books_summary_cleaned.csv") # Load precomputed BERT embeddings with open("model/sbert_embeddings2.pkl", "rb") as f: book_embeddings = pickle.load(f) # Load models retriever_model = SentenceTransformer("all-mpnet-base-v2") # More accurate than MiniLM reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # More accurate ranking def fetch_summary_duckduckgo(book_title, log): """Fetch book summary from DuckDuckGo API (search engine).""" log.append(f"Searching the internet for '{book_title}' summary...") with DDGS() as ddgs: search_results = list(ddgs.text(f"{book_title} book summary", max_results=3)) itr = 0 for result in search_results: if itr<=1: itr+=1 continue if "body" in result: log.append("Summary found from the web.") return result["body"], log log.append("No summary found on the web.") return None, log def get_best_match(book_title, book_list, log): """Find the closest matching book title in the dataset using fuzzy matching.""" best_match, score = process.extractOne(book_title, book_list) if score > 90: if(book_title!=best_match): log.append(f"Typo detected! Corrected '{book_title}' to '{best_match}'.") return best_match, log log.append(f"No correction needed for '{book_title}'.") return book_title, log def retrieve_candidates(book_title, top_n=10): """Retrieve top-N similar books using SBERT embeddings""" log = ["Starting book recommendation process..."] book_title, log = get_best_match(book_title, df["book_name"].values.tolist(), log) if book_title in df["book_name"].values: book_idx = df[df["book_name"] == book_title].index[0] query_embedding = book_embeddings[book_idx] summary = df[df["book_name"] == book_title]["summaries"].values[0] log.append(f"Book '{book_title}' found in the dataset.") else: log.append(f"Book '{book_title}' not found in the dataset.") summary, log = fetch_summary_duckduckgo(book_title, log) if summary is None: log.append("No summary found. Cannot proceed with recommendation.") return None, None, None, log query_embedding = retriever_model.encode(summary, convert_to_tensor=True) scores = util.cos_sim(query_embedding, book_embeddings)[0] top_indices = torch.argsort(scores, descending=True)[1:top_n+1] log.append(f"Top {top_n} similar books retrieved from the dataset.") return book_title, summary, df.iloc[top_indices][["book_name", "summaries"]].values.tolist(), log def rerank_books(query_title, query_summary, candidates, log): """Re-rank books using a cross-encoder""" # query_summary = df[df["book_name"] == query_title]["summaries"].values[0] pairs = [(query_summary, cand_summary) for _, cand_summary in candidates] scores = reranker_model.predict(pairs) ranked_books = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True) log.append("Books re-ranked based on cross-encoder model and returning top 5 books") return [book[0][0] for book in ranked_books[:5]], log def recommend_books(book_title): """Complete recommendation pipeline with logging""" book_title, summary, candidates, log = retrieve_candidates(book_title, top_n=10) if book_title is None: log.append("Book not found. Exiting recommendation process.") return "Book not found", "\n".join(log) recommendations, log = rerank_books(book_title, summary, candidates, log) log.append("Recommendation process complete.") return ", ".join(recommendations), "\n".join(log) # Gradio Interface with gr.Blocks() as demo: gr.Markdown("# Book Recommendation System") gr.Markdown("Enter a book title to find similar books based on summaries.") with gr.Row(): book_input = gr.Textbox(label="Enter Book Title") submit_btn = gr.Button("Recommend") output = gr.Textbox(label="Recommended Books", interactive=False) log_output = gr.Textbox(label="Logs", interactive=False, lines=10) # Log display submit_btn.click(recommend_books, inputs=book_input, outputs=[output, log_output]) # Run the app if __name__ == "__main__": demo.launch()