|
import gradio as gr |
|
import pandas as pd |
|
import pickle |
|
import torch |
|
from sentence_transformers import SentenceTransformer, util, CrossEncoder |
|
from duckduckgo_search import DDGS |
|
from fuzzywuzzy import process |
|
|
|
|
|
# Book catalogue; the rest of the file reads the "book_name" and "summaries" columns.
df = pd.read_csv("data/books_summary_cleaned.csv")

# Precomputed embeddings, indexed in parallel with the rows of `df`.
# NOTE: pickle.load can execute arbitrary code, so this file must come from a
# trusted source.
with open("model/sbert_embeddings2.pkl", mode="rb") as embeddings_file:
    book_embeddings = pickle.load(embeddings_file)

# Bi-encoder used to embed summaries that are not already in the dataset.
retriever_model = SentenceTransformer("all-mpnet-base-v2")

# Cross-encoder used to re-score (query summary, candidate summary) pairs.
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
|
|
|
def fetch_summary_duckduckgo(book_title, log):
    """Look up a summary for ``book_title`` through a DuckDuckGo text search.

    Up to three results are requested for "<title> book summary". The first
    two results are skipped; the body of the first remaining result that has
    a "body" field is used as the summary.

    Args:
        book_title: Title to search for.
        log: List of progress messages; appended to in place.

    Returns:
        A ``(summary, log)`` tuple. ``summary`` is ``None`` when no usable
        result was found.
    """
    log.append(f"Searching the internet for '{book_title}' summary...")

    search_query = f"{book_title} book summary"
    with DDGS() as ddgs:
        hits = list(ddgs.text(search_query, max_results=3))
        # NOTE(review): the first two hits are deliberately ignored here;
        # confirm this is intended rather than a leftover workaround.
        for hit in hits[2:]:
            if "body" in hit:
                log.append("Summary found from the web.")
                return hit["body"], log

    log.append("No summary found on the web.")
    return None, log
|
|
|
def get_best_match(book_title, book_list, log):
    """Find the closest matching book title in the dataset using fuzzy matching.

    Args:
        book_title: Title as typed by the user.
        book_list: Known titles to match against.
        log: List of progress messages; appended to in place.

    Returns:
        A ``(title, log)`` tuple. ``title`` is the best match from
        ``book_list`` when its fuzzy score is above 90, otherwise the
        unchanged ``book_title``.
    """
    # extractOne returns None when there is nothing to match against, so the
    # result must not be unpacked unconditionally.
    match = process.extractOne(book_title, book_list)

    if match is not None:
        best_match, score = match
        if score > 90:
            # Only report a correction when the title actually changed.
            if book_title != best_match:
                log.append(f"Typo detected! Corrected '{book_title}' to '{best_match}'.")
            return best_match, log

    log.append(f"No correction needed for '{book_title}'.")
    return book_title, log
|
|
|
def retrieve_candidates(book_title, top_n=10):
    """Retrieve the top-N most similar books using SBERT embeddings.

    The title is first fuzzy-corrected against the dataset. If the book is in
    the dataset its precomputed embedding is used; otherwise a summary is
    fetched from the web and embedded with the retriever model.

    Args:
        book_title: Title entered by the user.
        top_n: Number of candidate books to return.

    Returns:
        ``(book_title, summary, candidates, log)`` where ``candidates`` is a
        list of ``[book_name, summary]`` pairs. When no summary can be
        obtained, returns ``(None, None, None, log)``.
    """
    log = ["Starting book recommendation process..."]
    book_title, log = get_best_match(book_title, df["book_name"].values.tolist(), log)

    if book_title in df["book_name"].values:
        # NOTE(review): the index label is used as a position into
        # book_embeddings; this assumes df has the default 0..n-1 index.
        book_idx = df[df["book_name"] == book_title].index[0]
        query_embedding = book_embeddings[book_idx]
        summary = df[df["book_name"] == book_title]["summaries"].values[0]
        log.append(f"Book '{book_title}' found in the dataset.")
    else:
        book_idx = None
        log.append(f"Book '{book_title}' not found in the dataset.")
        summary, log = fetch_summary_duckduckgo(book_title, log)

        if summary is None:
            log.append("No summary found. Cannot proceed with recommendation.")
            return None, None, None, log

        query_embedding = retriever_model.encode(summary, convert_to_tensor=True)

    scores = util.cos_sim(query_embedding, book_embeddings)[0]
    ranked_indices = torch.argsort(scores, descending=True).tolist()

    # Drop the query book itself only when it is actually part of the dataset.
    # For a web-sourced summary the best-scoring row is a genuine candidate
    # and must be kept.
    if book_idx is not None:
        ranked_indices = [idx for idx in ranked_indices if idx != book_idx]
    top_indices = ranked_indices[:top_n]

    log.append(f"Top {top_n} similar books retrieved from the dataset.")
    candidates = df.iloc[top_indices][["book_name", "summaries"]].values.tolist()
    return book_title, summary, candidates, log
|
|
|
def rerank_books(query_title, query_summary, candidates, log):
    """Re-order candidate books with the cross-encoder and keep the best five.

    Args:
        query_title: Title of the query book (not used in scoring).
        query_summary: Summary text of the query book.
        candidates: Sequence of ``[book_name, summary]`` pairs.
        log: List of progress messages; appended to in place.

    Returns:
        ``(titles, log)`` where ``titles`` holds the names of up to five
        candidates, highest cross-encoder score first.
    """
    # Pair the query summary with every candidate summary for scoring.
    summary_pairs = []
    for _name, candidate_summary in candidates:
        summary_pairs.append((query_summary, candidate_summary))

    relevance = reranker_model.predict(summary_pairs)

    scored = list(zip(candidates, relevance))
    scored.sort(key=lambda item: item[1], reverse=True)

    log.append("Books re-ranked based on cross-encoder model and returning top 5 books")
    top_titles = [candidate[0] for candidate, _score in scored[:5]]
    return top_titles, log
|
|
|
def recommend_books(book_title):
    """Run the complete recommendation pipeline with logging.

    Args:
        book_title: Title entered by the user. Surrounding whitespace is
            ignored.

    Returns:
        ``(recommendations, log_text)``: a comma-separated string of
        recommended titles (or a short message when nothing can be
        recommended) and the newline-joined progress log.
    """
    # Reject blank input up front so no fuzzy match or web search is run
    # against an empty query.
    if book_title is None or not book_title.strip():
        return (
            "Please enter a book title.",
            "No book title provided. Exiting recommendation process.",
        )

    book_title, summary, candidates, log = retrieve_candidates(book_title.strip(), top_n=10)

    if book_title is None:
        log.append("Book not found. Exiting recommendation process.")
        return "Book not found", "\n".join(log)

    recommendations, log = rerank_books(book_title, summary, candidates, log)
    log.append("Recommendation process complete.")

    return ", ".join(recommendations), "\n".join(log)
|
|
|
|
|
# Gradio front end: one text input, a button, and two read-only text outputs.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# that had lost its indentation; confirm that only the textbox and the button
# belong inside the Row.
with gr.Blocks() as demo:
    gr.Markdown("# Book Recommendation System")
    gr.Markdown("Enter a book title to find similar books based on summaries.")

    with gr.Row():
        book_input = gr.Textbox(label="Enter Book Title")
        submit_btn = gr.Button("Recommend")

    output = gr.Textbox(label="Recommended Books", interactive=False)
    log_output = gr.Textbox(label="Logs", interactive=False, lines=10)

    # recommend_books returns (recommendations, log text), which map onto the
    # two output boxes in order.
    submit_btn.click(recommend_books, inputs=book_input, outputs=[output, log_output])


# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|