qsaheeb
committed on
Commit
·
3c564f4
1
Parent(s):
d1f6b4c
Add some changes 6
Browse files
app.py
CHANGED
@@ -3,9 +3,9 @@ import pandas as pd
|
|
3 |
import pickle
|
4 |
import torch
|
5 |
from sentence_transformers import SentenceTransformer, util, CrossEncoder
|
6 |
-
from recommender import BookRecommender
|
7 |
from duckduckgo_search import DDGS
|
8 |
from fuzzywuzzy import process
|
|
|
9 |
# Load book dataset
|
10 |
df = pd.read_csv("data/books_summary_cleaned.csv")
|
11 |
|
@@ -17,61 +17,57 @@ with open("model/sbert_embeddings2.pkl", "rb") as f:
|
|
17 |
retriever_model = SentenceTransformer("all-mpnet-base-v2") # More accurate than MiniLM
|
18 |
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # More accurate ranking
|
19 |
|
20 |
-
def fetch_summary_duckduckgo(book_title):
|
21 |
"""Fetch book summary from DuckDuckGo API (search engine)."""
|
|
|
|
|
22 |
with DDGS() as ddgs:
|
23 |
search_results = list(ddgs.text(f"{book_title} book summary", max_results=3))
|
24 |
-
|
25 |
-
itr = 0
|
26 |
for result in search_results:
|
27 |
-
if itr<=1:
|
28 |
-
itr+=1
|
29 |
-
continue
|
30 |
if "body" in result:
|
31 |
-
|
32 |
-
|
33 |
-
return None
|
34 |
-
|
35 |
-
def get_best_match(book_title, book_list):
|
36 |
-
"""
|
37 |
-
Finds the closest matching book title in the dataset.
|
38 |
|
39 |
-
|
40 |
-
|
41 |
-
- book_list (list): List of book titles in the dataset.
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
"""
|
46 |
best_match, score = process.extractOne(book_title, book_list)
|
47 |
-
|
|
|
|
|
|
|
|
|
48 |
|
49 |
def retrieve_candidates(book_title, top_n=10):
|
50 |
"""Retrieve top-N similar books using SBERT embeddings"""
|
51 |
-
|
|
|
52 |
|
53 |
if book_title in df["book_name"].values:
|
54 |
book_idx = df[df["book_name"] == book_title].index[0]
|
55 |
query_embedding = book_embeddings[book_idx]
|
56 |
-
summary = df[df["
|
57 |
-
|
58 |
else:
|
59 |
-
|
60 |
-
summary = fetch_summary_duckduckgo(book_title)
|
61 |
|
62 |
if summary is None:
|
63 |
-
|
64 |
-
|
|
|
65 |
query_embedding = retriever_model.encode(summary, convert_to_tensor=True)
|
66 |
|
67 |
scores = util.cos_sim(query_embedding, book_embeddings)[0]
|
68 |
-
|
69 |
-
# Get top-N similar books (excluding the book itself)
|
70 |
top_indices = torch.argsort(scores, descending=True)[1:top_n+1]
|
71 |
|
72 |
-
|
|
|
73 |
|
74 |
-
def rerank_books(query_title, query_embedding, candidates):
|
75 |
"""Re-rank books using a cross-encoder"""
|
76 |
query_summary = df[df["book_name"] == query_title]["summaries"].values[0]
|
77 |
pairs = [(query_summary, cand_summary) for _, cand_summary in candidates]
|
@@ -79,17 +75,21 @@ def rerank_books(query_title, query_embedding, candidates):
|
|
79 |
scores = reranker_model.predict(pairs)
|
80 |
ranked_books = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
|
81 |
|
82 |
-
|
|
|
83 |
|
84 |
def recommend_books(book_title):
|
85 |
-
"""Complete recommendation pipeline"""
|
86 |
-
book_title, query_embedding, candidates = retrieve_candidates(book_title, top_n=10)
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
return
|
|
|
|
|
|
|
91 |
|
92 |
-
return
|
93 |
|
94 |
# Gradio Interface
|
95 |
with gr.Blocks() as demo:
|
@@ -100,7 +100,7 @@ with gr.Blocks() as demo:
|
|
100 |
book_input = gr.Textbox(label="Enter Book Title")
|
101 |
submit_btn = gr.Button("Recommend")
|
102 |
|
103 |
-
output = gr.Textbox(label="Recommended Books")
|
104 |
log_output = gr.Textbox(label="Logs", interactive=False, lines=10) # Log display
|
105 |
|
106 |
submit_btn.click(recommend_books, inputs=book_input, outputs=[output, log_output])
|
|
|
3 |
import pickle
|
4 |
import torch
|
5 |
from sentence_transformers import SentenceTransformer, util, CrossEncoder
|
|
|
6 |
from duckduckgo_search import DDGS
|
7 |
from fuzzywuzzy import process
|
8 |
+
|
9 |
# Load book dataset
|
10 |
df = pd.read_csv("data/books_summary_cleaned.csv")
|
11 |
|
|
|
17 |
retriever_model = SentenceTransformer("all-mpnet-base-v2") # More accurate than MiniLM
|
18 |
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # More accurate ranking
|
19 |
|
20 |
+
def fetch_summary_duckduckgo(book_title, log):
    """Fetch a short book summary from DuckDuckGo text search.

    Args:
        book_title: Title to look up; the query sent is
            ``"<book_title> book summary"``.
        log: List of log messages. Progress messages are appended in place.

    Returns:
        A ``(summary, log)`` tuple. ``summary`` is the ``"body"`` text of the
        first search result that carries non-blank text, or ``None`` when the
        search fails or no result has usable text.
    """
    log.append(f"Searching the internet for '{book_title}' summary...")

    try:
        with DDGS() as ddgs:
            search_results = list(ddgs.text(f"{book_title} book summary", max_results=3))
    except Exception as exc:
        # The web lookup is a best-effort fallback at the UI boundary: a
        # network error or rate limit should surface in the log rather than
        # crash the whole recommendation request.
        log.append(f"Web search failed: {exc}")
        log.append("No summary found on the web.")
        return None, log

    for result in search_results:
        body = result["body"] if "body" in result else None
        # A blank body would be embedded as meaningless text, so skip it.
        if body and body.strip():
            log.append("Summary found from the web.")
            return body, log

    log.append("No summary found on the web.")
    return None, log
|
|
|
34 |
|
35 |
+
def get_best_match(book_title, book_list, log):
    """Resolve a possibly misspelled title to the closest title in ``book_list``.

    Args:
        book_title: Title as entered by the user.
        book_list: Candidate titles to match against.
        log: List of log messages. Exactly one message is appended in place.

    Returns:
        A ``(title, log)`` tuple. ``title`` is the fuzzy-matched entry from
        ``book_list`` when its score is above 90; otherwise ``book_title`` is
        returned unchanged.
    """
    # An exact hit needs no fuzzy search and must not be reported as a typo.
    if book_title in book_list:
        log.append(f"No correction needed for '{book_title}'.")
        return book_title, log

    # extractOne returns None when there is nothing to compare against, so
    # the result cannot be unpacked unconditionally.
    match = process.extractOne(book_title, book_list) if book_list else None
    if match is not None:
        best_match, score = match[0], match[1]
        if score > 90:
            log.append(f"Typo detected! Corrected '{book_title}' to '{best_match}'.")
            return best_match, log

    log.append(f"No correction needed for '{book_title}'.")
    return book_title, log
|
43 |
|
44 |
def retrieve_candidates(book_title, top_n=10):
    """Retrieve the top-N most similar dataset books using SBERT embeddings.

    The title is first passed through ``get_best_match``. If the resulting
    title is in the dataset its stored embedding is used as the query;
    otherwise a summary is fetched from the web and encoded.

    Args:
        book_title: Title entered by the user.
        top_n: Maximum number of candidates to return.

    Returns:
        ``(book_title, query_embedding, candidates, log)`` where ``candidates``
        is a list of ``[book_name, summary]`` pairs ordered by descending
        cosine similarity, or ``(None, None, None, log)`` when no summary
        could be obtained for a title that is not in the dataset.
    """
    log = ["Starting book recommendation process..."]
    book_title, log = get_best_match(book_title, df["book_name"].values.tolist(), log)

    matches = df.index[df["book_name"] == book_title]
    if len(matches) > 0:
        # NOTE(review): assumes df has the default positional index so the
        # label doubles as a row position in book_embeddings, and that the
        # stored embeddings come from the same model/summaries — confirm.
        book_idx = int(matches[0])
        query_embedding = book_embeddings[book_idx]
        log.append(f"Book '{book_title}' found in the dataset.")
    else:
        book_idx = None
        log.append(f"Book '{book_title}' not found in the dataset.")
        summary, log = fetch_summary_duckduckgo(book_title, log)

        if summary is None:
            log.append("No summary found. Cannot proceed with recommendation.")
            return None, None, None, log

        query_embedding = retriever_model.encode(summary, convert_to_tensor=True)

    scores = util.cos_sim(query_embedding, book_embeddings)[0]
    ranked = torch.argsort(scores, descending=True)

    # Exclude the query book by its index rather than by blindly dropping the
    # first hit: a web-sourced query has no entry of its own in the dataset,
    # so its best match must be kept.
    if book_idx is not None:
        ranked = ranked[ranked != book_idx]
    top_indices = ranked[:top_n].tolist()

    log.append(f"Top {top_n} similar books retrieved from the dataset.")
    candidates = df.iloc[top_indices][["book_name", "summaries"]].values.tolist()
    return book_title, query_embedding, candidates, log
|
69 |
|
70 |
+
def rerank_books(query_title, query_embedding, candidates, log):
    """Re-rank retrieved candidates with the cross-encoder and keep the best five.

    Args:
        query_title: Title of the query book.
        query_embedding: Not used by this function; kept so the signature
            stays compatible with existing callers.
        candidates: List of ``[book_name, summary]`` pairs to re-rank.
        log: List of log messages. One message is appended in place.

    Returns:
        A ``(titles, log)`` tuple with at most five book titles, best first.
    """
    if not candidates:
        log.append("No candidates to re-rank.")
        return [], log

    query_rows = df.loc[df["book_name"] == query_title, "summaries"]
    if query_rows.empty:
        # The query came from a web search, so there is no dataset summary to
        # pair with the candidates. Keep the embedding-based order instead of
        # failing on an empty lookup.
        log.append(f"No dataset summary for '{query_title}'; keeping embedding-based ranking.")
        return [name for name, _ in candidates[:5]], log

    query_summary = query_rows.values[0]
    pairs = [(query_summary, cand_summary) for _, cand_summary in candidates]

    scores = reranker_model.predict(pairs)
    ranked_books = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)

    log.append("Books re-ranked based on cross-encoder model.")
    return [book[0][0] for book in ranked_books[:5]], log
|
80 |
|
81 |
def recommend_books(book_title):
    """Run the complete recommendation pipeline and collect its log.

    Args:
        book_title: Title entered by the user.

    Returns:
        A ``(recommendations, log_text)`` tuple of strings: a comma-separated
        list of recommended titles (or a short message when nothing can be
        recommended) and the newline-joined log messages.
    """
    # Blank input would otherwise trigger a fuzzy match on an empty string
    # and a web search for " book summary"; reject it up front.
    if book_title is None or not book_title.strip():
        return "Please enter a book title.", "No book title provided."

    book_title, query_embedding, candidates, log = retrieve_candidates(
        book_title.strip(), top_n=10
    )

    if book_title is None:
        log.append("Book not found. Exiting recommendation process.")
        return "Book not found", "\n".join(log)

    recommendations, log = rerank_books(book_title, query_embedding, candidates, log)
    log.append("Recommendation process complete.")

    return ", ".join(recommendations), "\n".join(log)
|
93 |
|
94 |
# Gradio Interface
|
95 |
with gr.Blocks() as demo:
|
|
|
100 |
book_input = gr.Textbox(label="Enter Book Title")
|
101 |
submit_btn = gr.Button("Recommend")
|
102 |
|
103 |
+
output = gr.Textbox(label="Recommended Books", interactive=False)
|
104 |
log_output = gr.Textbox(label="Logs", interactive=False, lines=10) # Log display
|
105 |
|
106 |
submit_btn.click(recommend_books, inputs=book_input, outputs=[output, log_output])
|