Spaces:

areebbashir13
/

books_recommendation

Running

qsaheeb

Final changes

54a9816 1 day ago

4.58 kB

	import gradio as gr
	import pandas as pd
	import pickle
	import torch
	from sentence_transformers import SentenceTransformer, util, CrossEncoder
	from duckduckgo_search import DDGS
	from fuzzywuzzy import process

	# Locations of the pre-built artifacts this app reads at import time.
	BOOKS_CSV_PATH = "data/books_summary_cleaned.csv"
	EMBEDDINGS_PATH = "model/sbert_embeddings2.pkl"

	# Book catalogue; the functions in this file read its "book_name" and
	# "summaries" columns.
	df = pd.read_csv(BOOKS_CSV_PATH)

	# Precomputed BERT embeddings, looked up elsewhere in this file by the
	# DataFrame row index of a book.
	# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
	# artifact must come from a trusted source.
	with open(EMBEDDINGS_PATH, "rb") as embeddings_file:
	    book_embeddings = pickle.load(embeddings_file)

	# Models: a sentence encoder for first-stage retrieval and a cross-encoder
	# for re-ranking the retrieved candidates.
	retriever_model = SentenceTransformer("all-mpnet-base-v2")  # More accurate than MiniLM
	reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # More accurate ranking

	def fetch_summary_duckduckgo(book_title, log):
	    """Fetch a book summary through a DuckDuckGo text search.

	    Args:
	        book_title: Title to look up; the query sent is
	            "<book_title> book summary".
	        log: List of progress messages. It is appended to in place and also
	            returned.

	    Returns:
	        A ``(summary, log)`` tuple. ``summary`` is the ``"body"`` text of the
	        selected search result, or ``None`` when the search fails or no
	        usable result is available.
	    """
	    log.append(f"Searching the internet for '{book_title}' summary...")

	    try:
	        with DDGS() as ddgs:
	            search_results = list(ddgs.text(f"{book_title} book summary", max_results=3))
	    except Exception as exc:
	        # The search is a best-effort fallback: a network error or rate limit
	        # should surface as "no summary" in the log, not crash the request.
	        log.append(f"Web search failed: {exc}")
	        log.append("No summary found on the web.")
	        return None, log

	    # The first two hits are skipped on purpose, so with max_results=3 only
	    # the third result can be used.
	    # NOTE(review): confirm that skipping two results is really intended.
	    for result in search_results[2:]:
	        # Ignore hits without any body text; an empty summary is useless.
	        if "body" in result and result["body"]:
	            log.append("Summary found from the web.")
	            return result["body"], log

	    log.append("No summary found on the web.")
	    return None, log

	def get_best_match(book_title, book_list, log):
	    """Find the closest matching book title in the dataset using fuzzy matching.

	    Args:
	        book_title: Title as typed by the user.
	        book_list: Known titles to match against.
	        log: List of progress messages. It is appended to in place and also
	            returned.

	    Returns:
	        A ``(title, log)`` tuple. ``title`` is the best fuzzy match when its
	        score is above 90, otherwise the unchanged ``book_title``.
	    """
	    match = process.extractOne(book_title, book_list)

	    # extractOne yields None when there is nothing to choose from (for
	    # example an empty title list); treat that like "no close match".
	    if match is not None:
	        best_match, score = match[0], match[1]
	        if score > 90:
	            if book_title != best_match:
	                log.append(f"Typo detected! Corrected '{book_title}' to '{best_match}'.")
	            return best_match, log

	    log.append(f"No correction needed for '{book_title}'.")
	    return book_title, log

	def retrieve_candidates(book_title, top_n=10):
	    """Retrieve the top-N books most similar to ``book_title`` using SBERT embeddings.

	    The title is first passed through fuzzy matching. If the resulting title
	    is in the dataset, its precomputed embedding is the query; otherwise a
	    summary is fetched from the web and encoded with the retriever model.

	    Args:
	        book_title: Title entered by the user.
	        top_n: Number of similar books to return.

	    Returns:
	        ``(book_title, summary, candidates, log)``. ``candidates`` is a list
	        of ``[book_name, summary]`` pairs ordered by decreasing cosine
	        similarity. When no summary can be obtained the result is
	        ``(None, None, None, log)``.
	    """
	    log = ["Starting book recommendation process..."]
	    book_title, log = get_best_match(book_title, df["book_name"].values.tolist(), log)

	    # Filter the catalogue once and reuse the result for index and summary.
	    matching_rows = df[df["book_name"] == book_title]

	    if not matching_rows.empty:
	        book_idx = matching_rows.index[0]
	        query_embedding = book_embeddings[book_idx]
	        summary = matching_rows["summaries"].values[0]
	        log.append(f"Book '{book_title}' found in the dataset.")
	    else:
	        book_idx = None
	        log.append(f"Book '{book_title}' not found in the dataset.")
	        summary, log = fetch_summary_duckduckgo(book_title, log)

	        if summary is None:
	            log.append("No summary found. Cannot proceed with recommendation.")
	            return None, None, None, log

	        query_embedding = retriever_model.encode(summary, convert_to_tensor=True)

	    scores = util.cos_sim(query_embedding, book_embeddings)[0]
	    # Plain Python ints are safe for DataFrame.iloc wherever the tensor lives.
	    ranked_indices = torch.argsort(scores, descending=True).tolist()

	    # Exclude the query book itself, and only when it is in the dataset.
	    # Blindly dropping the top hit would discard the best candidate for
	    # titles whose summary came from the web.
	    if book_idx is not None:
	        ranked_indices = [idx for idx in ranked_indices if idx != book_idx]
	    top_indices = ranked_indices[:top_n]

	    log.append(f"Top {top_n} similar books retrieved from the dataset.")
	    candidates = df.iloc[top_indices][["book_name", "summaries"]].values.tolist()
	    return book_title, summary, candidates, log

	def rerank_books(query_title, query_summary, candidates, log, top_k=5):
	    """Re-rank candidate books with the cross-encoder and keep the best ones.

	    Args:
	        query_title: Title of the query book. Currently unused; kept so
	            existing callers keep working.
	        query_summary: Summary text of the query book.
	        candidates: Sequence of ``(book_name, summary)`` pairs to score.
	        log: List of progress messages. It is appended to in place and also
	            returned.
	        top_k: Number of book names to return (defaults to 5).

	    Returns:
	        A ``(book_names, log)`` tuple with at most ``top_k`` names, ordered by
	        decreasing cross-encoder score.
	    """
	    # Nothing to score: skip the model call, which cannot handle an empty batch.
	    if not candidates:
	        log.append("No candidate books to re-rank.")
	        return [], log

	    pairs = [(query_summary, cand_summary) for _, cand_summary in candidates]
	    scores = reranker_model.predict(pairs)

	    # Sort on the score only, so candidates themselves are never compared.
	    ranked_books = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)

	    log.append(f"Books re-ranked based on cross-encoder model and returning top {top_k} books")
	    return [candidate[0] for candidate, _ in ranked_books[:top_k]], log

	def recommend_books(book_title):
	    """Run the complete recommendation pipeline and collect its log.

	    Args:
	        book_title: Title entered by the user.

	    Returns:
	        A ``(recommendations, log_text)`` tuple of strings: the recommended
	        book names joined with ", " (or a short message when nothing can be
	        recommended) and the newline-joined progress log.
	    """
	    # An empty title would otherwise trigger fuzzy matching and a web search
	    # for " book summary", producing meaningless recommendations.
	    if book_title is None or not book_title.strip():
	        return "Please enter a book title.", "No book title provided."

	    book_title, summary, candidates, log = retrieve_candidates(book_title.strip(), top_n=10)

	    # retrieve_candidates signals "no summary available" with a None title.
	    if book_title is None:
	        log.append("Book not found. Exiting recommendation process.")
	        return "Book not found", "\n".join(log)

	    recommendations, log = rerank_books(book_title, summary, candidates, log)
	    log.append("Recommendation process complete.")

	    return ", ".join(recommendations), "\n".join(log)

	# Gradio Interface
	with gr.Blocks() as demo:
	    # Page heading and a one-line usage hint.
	    gr.Markdown("# Book Recommendation System")
	    gr.Markdown("Enter a book title to find similar books based on summaries.")

	    # The title box and the trigger button sit side by side.
	    with gr.Row():
	        book_input = gr.Textbox(label="Enter Book Title")
	        submit_btn = gr.Button("Recommend")

	    # Read-only panes for the recommendations and the pipeline log.
	    output = gr.Textbox(label="Recommended Books", interactive=False)
	    log_output = gr.Textbox(label="Logs", interactive=False, lines=10)  # Log display

	    # recommend_books returns (recommendations, log text), matching the two outputs.
	    submit_btn.click(
	        fn=recommend_books,
	        inputs=book_input,
	        outputs=[output, log_output],
	    )

	# Run the app
	if __name__ == "__main__":
	    demo.launch()