qsaheeb committed on
Commit
3c564f4
·
1 Parent(s): d1f6b4c

Add some changes 6

Browse files
Files changed (1) hide show
  1. app.py +41 -41
app.py CHANGED
@@ -3,9 +3,9 @@ import pandas as pd
3
  import pickle
4
  import torch
5
  from sentence_transformers import SentenceTransformer, util, CrossEncoder
6
- from recommender import BookRecommender
7
  from duckduckgo_search import DDGS
8
  from fuzzywuzzy import process
 
9
  # Load book dataset
10
  df = pd.read_csv("data/books_summary_cleaned.csv")
11
 
@@ -17,61 +17,57 @@ with open("model/sbert_embeddings2.pkl", "rb") as f:
17
  retriever_model = SentenceTransformer("all-mpnet-base-v2") # More accurate than MiniLM
18
  reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # More accurate ranking
19
 
20
- def fetch_summary_duckduckgo(book_title):
21
  """Fetch book summary from DuckDuckGo API (search engine)."""
 
 
22
  with DDGS() as ddgs:
23
  search_results = list(ddgs.text(f"{book_title} book summary", max_results=3))
24
- print(search_results)
25
- itr = 0
26
  for result in search_results:
27
- if itr<=1:
28
- itr+=1
29
- continue
30
  if "body" in result:
31
- return result["body"]
32
-
33
- return None
34
-
35
- def get_best_match(book_title, book_list):
36
- """
37
- Finds the closest matching book title in the dataset.
38
 
39
- Parameters:
40
- - book_title (str): User-inputted book title (possibly with typos).
41
- - book_list (list): List of book titles in the dataset.
42
 
43
- Returns:
44
- - str: Best-matching book title.
45
- """
46
  best_match, score = process.extractOne(book_title, book_list)
47
- return best_match if score > 90 else book_title
 
 
 
 
48
 
49
  def retrieve_candidates(book_title, top_n=10):
50
  """Retrieve top-N similar books using SBERT embeddings"""
51
- book_title = get_best_match(book_title, df["book_name"].values.tolist())
 
52
 
53
  if book_title in df["book_name"].values:
54
  book_idx = df[df["book_name"] == book_title].index[0]
55
  query_embedding = book_embeddings[book_idx]
56
- summary = df[df["summaries"] == book_title]
57
- # If book is NOT in dataset
58
  else:
59
- print(f"Book '{book_title}' not found! scraping the web for summaries")
60
- summary = fetch_summary_duckduckgo(book_title)
61
 
62
  if summary is None:
63
- return None
64
- # Compute embedding for new book summary
 
65
  query_embedding = retriever_model.encode(summary, convert_to_tensor=True)
66
 
67
  scores = util.cos_sim(query_embedding, book_embeddings)[0]
68
-
69
- # Get top-N similar books (excluding the book itself)
70
  top_indices = torch.argsort(scores, descending=True)[1:top_n+1]
71
 
72
- return book_title, summary ,df.iloc[top_indices][["book_name", "summaries"]].values.tolist()
 
73
 
74
- def rerank_books(query_title, query_embedding, candidates):
75
  """Re-rank books using a cross-encoder"""
76
  query_summary = df[df["book_name"] == query_title]["summaries"].values[0]
77
  pairs = [(query_summary, cand_summary) for _, cand_summary in candidates]
@@ -79,17 +75,21 @@ def rerank_books(query_title, query_embedding, candidates):
79
  scores = reranker_model.predict(pairs)
80
  ranked_books = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
81
 
82
- return [book[0][0] for book in ranked_books[:5]]
 
83
 
84
  def recommend_books(book_title):
85
- """Complete recommendation pipeline"""
86
- book_title, query_embedding, candidates = retrieve_candidates(book_title, top_n=10)
87
- if book_title==None:
88
- return "Book not found"
89
- if isinstance(candidates, list) and "Error" in candidates[0]:
90
- return candidates[0]
 
 
 
91
 
92
- return rerank_books(book_title, query_embedding, candidates)
93
 
94
  # Gradio Interface
95
  with gr.Blocks() as demo:
@@ -100,7 +100,7 @@ with gr.Blocks() as demo:
100
  book_input = gr.Textbox(label="Enter Book Title")
101
  submit_btn = gr.Button("Recommend")
102
 
103
- output = gr.Textbox(label="Recommended Books")
104
  log_output = gr.Textbox(label="Logs", interactive=False, lines=10) # Log display
105
 
106
  submit_btn.click(recommend_books, inputs=book_input, outputs=[output, log_output])
 
3
  import pickle
4
  import torch
5
  from sentence_transformers import SentenceTransformer, util, CrossEncoder
 
6
  from duckduckgo_search import DDGS
7
  from fuzzywuzzy import process
8
+
9
  # Load book dataset
10
  df = pd.read_csv("data/books_summary_cleaned.csv")
11
 
 
17
  retriever_model = SentenceTransformer("all-mpnet-base-v2") # More accurate than MiniLM
18
  reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") # More accurate ranking
19
 
def fetch_summary_duckduckgo(book_title, log):
    """Fetch a short book summary from DuckDuckGo text search.

    Args:
        book_title: Title to search for; the query sent is
            "<book_title> book summary".
        log: List of progress messages. It is appended to in place and
            also returned for the caller's convenience.

    Returns:
        A ``(summary, log)`` tuple. ``summary`` is the ``body`` text of the
        first search result that carries a non-empty body, or ``None`` when
        no result has one or the search itself fails.
    """
    log.append(f"Searching the internet for '{book_title}' summary...")

    try:
        with DDGS() as ddgs:
            search_results = list(ddgs.text(f"{book_title} book summary", max_results=3))
    except Exception as exc:
        # A network or rate-limit failure must not crash the UI callback;
        # report it in the log and fall through to the "not found" result.
        log.append(f"Web search failed: {exc}")
        search_results = []

    for result in search_results:
        body = result.get("body")
        # Skip results whose body is missing or empty: an empty string
        # cannot be embedded into a meaningful query vector.
        if body:
            log.append("Summary found from the web.")
            return body, log

    log.append("No summary found on the web.")
    return None, log
 
34
 
def get_best_match(book_title, book_list, log, threshold=90):
    """Find the closest matching book title in the dataset using fuzzy matching.

    Args:
        book_title: User-supplied title, possibly containing typos.
        book_list: List of known titles to match against.
        log: List of progress messages. It is appended to in place and
            also returned for the caller's convenience.
        threshold: A fuzzy match is accepted only when its score is
            strictly greater than this value.

    Returns:
        A ``(title, log)`` tuple. ``title`` is the corrected title when a
        sufficiently close, different title exists in ``book_list``;
        otherwise it is ``book_title`` unchanged.
    """
    # An exact hit needs no fuzzy search, and must not be reported as a typo.
    needs_fuzzy_search = bool(book_list) and book_title not in book_list

    if needs_fuzzy_search:
        # extractOne returns None when nothing can be scored.
        match = process.extractOne(book_title, book_list)
        if match is not None:
            best_match, score = match[0], match[1]
            if score > threshold and best_match != book_title:
                log.append(f"Typo detected! Corrected '{book_title}' to '{best_match}'.")
                return best_match, log

    log.append(f"No correction needed for '{book_title}'.")
    return book_title, log
43
 
def retrieve_candidates(book_title, top_n=10):
    """Retrieve the top-N most similar books using SBERT embeddings.

    The title is first passed through ``get_best_match``. If the resulting
    title is in the dataset its stored embedding is used as the query;
    otherwise a summary is fetched from the web and encoded with
    ``retriever_model``.

    Args:
        book_title: Title entered by the user.
        top_n: Maximum number of candidate books to return.

    Returns:
        A ``(book_title, query_embedding, candidates, log)`` tuple, where
        ``candidates`` is a list of ``[book_name, summary]`` pairs ordered by
        descending cosine similarity and ``log`` is a list of progress
        messages. When no summary can be obtained for an unknown book the
        result is ``(None, None, None, log)``.
    """
    log = ["Starting book recommendation process..."]
    book_title, log = get_best_match(book_title, df["book_name"].values.tolist(), log)

    matching_rows = df.index[df["book_name"] == book_title]
    if len(matching_rows) > 0:
        # NOTE(review): the index label is used as a position into
        # book_embeddings, which assumes df has a default 0..n-1 index.
        book_idx = int(matching_rows[0])
        query_embedding = book_embeddings[book_idx]
        log.append(f"Book '{book_title}' found in the dataset.")
    else:
        book_idx = None
        log.append(f"Book '{book_title}' not found in the dataset.")
        summary, log = fetch_summary_duckduckgo(book_title, log)

        if summary is None:
            log.append("No summary found. Cannot proceed with recommendation.")
            return None, None, None, log

        query_embedding = retriever_model.encode(summary, convert_to_tensor=True)

    scores = util.cos_sim(query_embedding, book_embeddings)[0]
    ranked = torch.argsort(scores, descending=True)

    # Only a book that is itself in the dataset needs to be excluded from its
    # own recommendations. For a web-sourced query the best hit is a genuine
    # candidate and must be kept.
    if book_idx is not None:
        ranked = ranked[ranked != book_idx]

    # Plain Python ints are safe for DataFrame.iloc regardless of the device
    # the score tensor lives on.
    top_indices = ranked[:top_n].tolist()

    log.append(f"Top {top_n} similar books retrieved from the dataset.")
    candidates = df.iloc[top_indices][["book_name", "summaries"]].values.tolist()
    return book_title, query_embedding, candidates, log
69
 
70
+ def rerank_books(query_title, query_embedding, candidates, log):
71
  """Re-rank books using a cross-encoder"""
72
  query_summary = df[df["book_name"] == query_title]["summaries"].values[0]
73
  pairs = [(query_summary, cand_summary) for _, cand_summary in candidates]
 
75
  scores = reranker_model.predict(pairs)
76
  ranked_books = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
77
 
78
+ log.append("Books re-ranked based on cross-encoder model.")
79
+ return [book[0][0] for book in ranked_books[:5]], log
80
 
def recommend_books(book_title):
    """Run the complete recommendation pipeline with logging.

    Args:
        book_title: Title entered by the user in the UI.

    Returns:
        A ``(recommendations, log_text)`` tuple of strings:
        ``recommendations`` is a comma-separated list of up to five titles
        (or a short message when nothing can be recommended) and
        ``log_text`` is the newline-joined progress log.
    """
    # Reject empty or whitespace-only input before touching the models.
    if not book_title or not book_title.strip():
        return "Please enter a book title.", "No book title provided."

    book_title, query_embedding, candidates, log = retrieve_candidates(
        book_title.strip(), top_n=10
    )

    if book_title is None:
        log.append("Book not found. Exiting recommendation process.")
        return "Book not found", "\n".join(log)

    if book_title in df["book_name"].values:
        recommendations, log = rerank_books(book_title, query_embedding, candidates, log)
    else:
        # rerank_books reads the query summary from the dataset by title, so
        # it cannot handle a book whose summary came from the web. Fall back
        # to the embedding-similarity order instead of crashing.
        log.append("Query book is not in the dataset; using retrieval order without re-ranking.")
        recommendations = [name for name, _ in candidates[:5]]

    log.append("Recommendation process complete.")

    return ", ".join(str(name) for name in recommendations), "\n".join(log)
93
 
94
  # Gradio Interface
95
  with gr.Blocks() as demo:
 
100
  book_input = gr.Textbox(label="Enter Book Title")
101
  submit_btn = gr.Button("Recommend")
102
 
103
+ output = gr.Textbox(label="Recommended Books", interactive=False)
104
  log_output = gr.Textbox(label="Logs", interactive=False, lines=10) # Log display
105
 
106
  submit_btn.click(recommend_books, inputs=book_input, outputs=[output, log_output])