qsaheeb committed
Commit 245e9ef · 1 Parent(s): 5613170

Add full app
app.py ADDED
@@ -0,0 +1,48 @@
+ import gradio as gr
+ import pandas as pd
+ from sentence_transformers import CrossEncoder
+ from recommender import BookRecommender
+
+ # Load the cleaned book dataset (the recommender loads its own copy plus the embeddings)
+ df = pd.read_csv("data/books_summary_cleaned.csv")
+
+ # Load the cross-encoder used for re-ranking
+ reranker_model = CrossEncoder("cross-encoder/stsb-roberta-large")  # More accurate pairwise scoring
+
+ recommender = BookRecommender()
+
+ def rerank_books(query_title, candidates):
+     """Re-rank candidate books with the cross-encoder."""
+     query_summary = df[df["book_name"] == query_title]["summaries"].values[0]
+     pairs = [(query_summary, cand_summary) for _, cand_summary in candidates]
+
+     scores = reranker_model.predict(pairs)
+     ranked_books = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
+
+     return [book[0][0] for book in ranked_books[:5]]  # Return the top-5 ranked titles
+
+ def recommend_books(book_title):
+     """Complete recommendation pipeline: retrieve candidates, then re-rank."""
+     try:
+         candidates = recommender.recommend(book_title, top_n=10)
+     except ValueError as e:
+         return str(e)
+
+     return "\n".join(rerank_books(book_title, candidates))
+
+ # Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# 📚 Content-Based Book Recommendation")
+     gr.Markdown("Enter a book title to find similar books based on summaries.")
+
+     with gr.Row():
+         book_input = gr.Textbox(label="Enter Book Title")
+         submit_btn = gr.Button("Recommend")
+
+     output = gr.Textbox(label="Recommended Books")
+
+     submit_btn.click(recommend_books, inputs=book_input, outputs=output)
+
+ # Run the app
+ if __name__ == "__main__":
+     demo.launch()
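The two-stage design here (fast bi-encoder retrieval over precomputed embeddings, then a slower cross-encoder over just ten candidates) keeps latency down while improving ranking quality. For a quick check without the UI, the pipeline can be exercised directly — a minimal smoke-test sketch, assuming the data/ and model/ artifacts from this commit are present locally and using "The End Of Stress", the title from the commented-out test in recommender.py below:

```python
# Smoke test for the retrieve-then-rerank pipeline, bypassing Gradio.
# Importing app runs its module-level setup (dataset, models) but not demo.launch().
from app import recommend_books

print(recommend_books("The End Of Stress"))  # newline-separated top-5 titles
print(recommend_books("No Such Book"))       # error message via the ValueError path
```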
data/books_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/books_summary_cleaned.csv ADDED
The diff for this file is too large to render. See raw diff
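Neither CSV renders inline, but preprocess.py below fixes their schema: the raw file carries book_name, summaries, and categories, and the cleaned file adds combined_text. A short inspection sketch, assuming those columns:

```python
import pandas as pd

# Columns expected per preprocess.py: book_name, summaries, categories, combined_text
df = pd.read_csv("data/books_summary_cleaned.csv")
print(df.columns.tolist())
print(df[["book_name", "categories"]].head(3))
```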
 
embeddings.py ADDED
@@ -0,0 +1,26 @@
+ from sentence_transformers import SentenceTransformer
+ from preprocess import preprocess_books
+ import pandas as pd
+ import pickle
+
+ def extract_sbert_embeddings(df, save_path="model/sbert_embeddings2.pkl"):
+     """Extracts SBERT embeddings from the combined book text."""
+     model = SentenceTransformer("all-mpnet-base-v2")  # High-quality general-purpose embedding model
+
+     # Generate embeddings for the combined summary/category text
+     embeddings = model.encode(df["combined_text"].fillna("").tolist(), show_progress_bar=True)
+
+     with open(save_path, "wb") as f:
+         pickle.dump(embeddings, f)
+
+     return embeddings
+
+ def load_book_data(filepath="data/books_summary_cleaned.csv"):
+     """Loads the cleaned book dataset."""
+     df = pd.read_csv(filepath)
+     return df
+
+ # Guarded so that importing load_book_data elsewhere doesn't re-run the pipeline
+ if __name__ == "__main__":
+     preprocess_books()
+     embeddings = extract_sbert_embeddings(load_book_data())
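A subtle failure mode here is the pickle drifting out of sync with the CSV, since the recommender indexes embeddings by dataframe row. A quick alignment check — a sketch assuming the relative paths used above:

```python
import pickle
import pandas as pd

df = pd.read_csv("data/books_summary_cleaned.csv")
with open("model/sbert_embeddings2.pkl", "rb") as f:
    emb = pickle.load(f)

# Row i of the dataframe must line up with embedding i,
# otherwise BookRecommender.recommend() returns the wrong books.
assert len(emb) == len(df), f"{len(emb)} embeddings vs {len(df)} rows"
print("embedding dim:", emb.shape[1])  # 768 for all-mpnet-base-v2
```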
model/sbert_embeddings2.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbeb697fafedf7670c7aa4a75c1fffbee52481497552f3accc05f913837e8147
+ size 3781795
preprocess.py ADDED
@@ -0,0 +1,41 @@
+ import pandas as pd
+ import re
+
+ def clean_text(text):
+     """Cleans text by removing special characters and extra spaces."""
+     if pd.isna(text):  # Handle missing values
+         return ""
+     text = re.sub(r"\s+", " ", text)  # Collapse extra whitespace
+     text = re.sub(r"[^a-zA-Z0-9.,!?;:()'\" ]", "", text)  # Keep only relevant characters
+     return text.strip()
+
+ def preprocess_books(input_path="data/books_summary.csv", output_path="data/books_summary_cleaned.csv"):
+     """Preprocesses the book dataset: duplicates, missing values, and text cleaning."""
+
+     # Load dataset
+     df = pd.read_csv(input_path)
+
+     # Ensure required columns exist
+     required_cols = {"book_name", "summaries", "categories"}
+     if not required_cols.issubset(df.columns):
+         raise ValueError(f"Dataset must contain columns: {required_cols}")
+
+     # Fill missing values and clean the summaries
+     df["summaries"] = df["summaries"].fillna("").apply(clean_text)
+     df["categories"] = df["categories"].fillna("Unknown")
+
+     # 🔹 Merge duplicate titles while keeping distinct categories
+     df = df.groupby("book_name", as_index=False).agg({
+         "summaries": "first",  # Keep the first summary
+         "categories": lambda x: "; ".join(set(x))  # Combine unique categories
+     })
+
+     # 🔹 Create a new feature combining summary and categories
+     df["combined_text"] = df["summaries"] + " " + df["categories"]
+
+     # Save cleaned dataset
+     df.to_csv(output_path, index=False)
+     print("✅ Dataset cleaned and saved!")
+
+ if __name__ == "__main__":
+     preprocess_books()
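The groupby merge is the subtle step: duplicate titles collapse to a single row whose categories are joined together. A tiny worked example with made-up rows:

```python
import pandas as pd

df = pd.DataFrame({
    "book_name": ["Dune", "Dune", "Emma"],
    "summaries": ["Desert planet saga.", "Desert planet saga.", "Regency matchmaking."],
    "categories": ["Sci-Fi", "Classics", "Classics"],
})

merged = df.groupby("book_name", as_index=False).agg({
    "summaries": "first",
    "categories": lambda x: "; ".join(set(x)),
})
print(merged)
# "Dune" becomes one row with categories like "Sci-Fi; Classics"
# (set() makes the join order non-deterministic).
```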
recommender.py ADDED
@@ -0,0 +1,34 @@
+ import numpy as np
+ import pickle
+ from sentence_transformers import util
+ from embeddings import load_book_data
+
+ class BookRecommender:
+     def __init__(self, data_path="data/books_summary_cleaned.csv", emb_path="model/sbert_embeddings2.pkl"):
+         """Loads the cleaned book dataset and precomputed embeddings."""
+         self.df = load_book_data(data_path)
+
+         with open(emb_path, "rb") as f:
+             self.embeddings = pickle.load(f)
+
+     def recommend(self, book_title, top_n=5):
+         """Finds the top-N most similar books by cosine similarity."""
+         if book_title not in self.df["book_name"].values:
+             raise ValueError("Book title not found in dataset!")
+
+         # Get the book index
+         book_idx = self.df[self.df["book_name"] == book_title].index[0]
+
+         # Compute cosine similarity with all books
+         query_embedding = self.embeddings[book_idx]
+         scores = util.cos_sim(query_embedding, self.embeddings)[0]
+
+         # Sort by similarity, skipping rank 0 (the query book itself)
+         top_indices = np.argsort(scores.numpy())[::-1][1:top_n + 1]
+
+         # Return (title, summary) pairs for the cross-encoder re-ranker in app.py
+         return self.df.iloc[top_indices][["book_name", "summaries"]].values.tolist()
+
+ # Example:
+ # recommender = BookRecommender()
+ # print(recommender.recommend("The End Of Stress"))
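For completeness: after the fix above, recommend() returns a list of (title, summary) pairs, which is exactly the shape rerank_books() in app.py unpacks. A short usage sketch (output values invented for illustration):

```python
from recommender import BookRecommender

recommender = BookRecommender()
candidates = recommender.recommend("The End Of Stress", top_n=10)

# candidates looks like:
# [["Book A", "Summary of book A ..."],
#  ["Book B", "Summary of book B ..."], ...]
for title, summary in candidates:
    print(title, "->", summary[:60])
```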