qsaheeb committed
Commit 245e9ef
Parent(s): 5613170
Add full app
Files changed:
- app.py +52 -0
- data/books_summary.csv +0 -0
- data/books_summary_cleaned.csv +0 -0
- embeddings.py +26 -0
- model/sbert_embeddings2.pkl +3 -0
- preprocess.py +44 -0
- recommender.py +35 -0
app.py
ADDED
@@ -0,0 +1,52 @@
+import gradio as gr
+import pandas as pd
+import pickle
+from sentence_transformers import CrossEncoder
+from recommender import BookRecommender
+
+# Load the cleaned book dataset committed under data/
+df = pd.read_csv("data/books_summary_cleaned.csv")
+
+# Load precomputed SBERT embeddings
+with open("model/sbert_embeddings2.pkl", "rb") as f:
+    book_embeddings = pickle.load(f)
+
+# Load the cross-encoder used for re-ranking (more accurate than bi-encoder retrieval)
+reranker_model = CrossEncoder("cross-encoder/stsb-roberta-large")
+recommender = BookRecommender()
+
+def rerank_books(query_title, candidates):
+    """Re-rank candidate (title, summary) pairs with the cross-encoder."""
+    query_summary = df[df["book_name"] == query_title]["summaries"].values[0]
+    pairs = [(query_summary, cand_summary) for _, cand_summary in candidates]
+
+    scores = reranker_model.predict(pairs)
+    ranked_books = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
+
+    return [book[0][0] for book in ranked_books[:5]]  # Return the top-5 titles
+
+def recommend_books(book_title):
+    """Complete recommendation pipeline: retrieve candidates, then re-rank."""
+    try:
+        candidates = recommender.recommend(book_title, top_n=10)
+    except ValueError as e:
+        return str(e)
+
+    return rerank_books(book_title, candidates)
+
+# Gradio Interface
+with gr.Blocks() as demo:
+    gr.Markdown("# 📚 Content-Based Book Recommendation")
+    gr.Markdown("Enter a book title to find similar books based on summaries.")
+
+    with gr.Row():
+        book_input = gr.Textbox(label="Enter Book Title")
+        submit_btn = gr.Button("Recommend")
+
+    output = gr.Textbox(label="Recommended Books")
+
+    submit_btn.click(recommend_books, inputs=book_input, outputs=output)
+
+# Run the app
+if __name__ == "__main__":
+    demo.launch()
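For reference, a minimal sketch of what the cross-encoder step in app.py does in isolation. The sentence pairs below are made up for illustration; predict returns one similarity score per pair, higher meaning more similar:

    from sentence_transformers import CrossEncoder

    model = CrossEncoder("cross-encoder/stsb-roberta-large")
    pairs = [
        ("A story about a desert planet.", "An epic set on an arid world."),
        ("A story about a desert planet.", "A cookbook of pasta recipes."),
    ]
    scores = model.predict(pairs)  # one float score per input pair
    print(scores)  # the first pair should score noticeably higher than the second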
data/books_summary.csv
ADDED
The diff for this file is too large to render. See raw diff.
data/books_summary_cleaned.csv
ADDED
The diff for this file is too large to render. See raw diff.
embeddings.py
ADDED
@@ -0,0 +1,26 @@
+import pickle
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from preprocess import preprocess_books
+
+def extract_sbert_embeddings(df, save_path="model/sbert_embeddings2.pkl"):
+    """Extracts SBERT embeddings from the combined book text and saves them to disk."""
+    model = SentenceTransformer("all-mpnet-base-v2")  # Strong general-purpose sentence encoder
+
+    # Generate embeddings for the combined title/summary/category text
+    embeddings = model.encode(df["combined_text"].fillna("").tolist(), show_progress_bar=True)
+
+    with open(save_path, "wb") as f:
+        pickle.dump(embeddings, f)
+
+    return embeddings
+
+def load_book_data(filepath="data/books_summary_cleaned.csv"):
+    """Loads the cleaned book dataset."""
+    return pd.read_csv(filepath)
+
+# Guarded so that importing load_book_data does not trigger a full re-embedding
+if __name__ == "__main__":
+    preprocess_books()
+    df = load_book_data()
+    embeddings = extract_sbert_embeddings(df)
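For context, a minimal sketch of what the saved pickle is expected to hold, assuming all-mpnet-base-v2 (which produces 768-dimensional vectors) and row order matching the cleaned dataset; recommender.py relies on that alignment when it indexes embeddings by dataframe position:

    import pickle
    import pandas as pd

    with open("model/sbert_embeddings2.pkl", "rb") as f:
        emb = pickle.load(f)

    df = pd.read_csv("data/books_summary_cleaned.csv")
    # One 768-dimensional vector per book row, in dataframe order
    assert emb.shape == (len(df), 768)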
model/sbert_embeddings2.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbeb697fafedf7670c7aa4a75c1fffbee52481497552f3accc05f913837e8147
+size 3781795
preprocess.py
ADDED
@@ -0,0 +1,44 @@
+import re
+
+import pandas as pd
+
+def clean_text(text):
+    """Cleans text by removing special characters and extra spaces."""
+    if pd.isna(text):  # Handle missing values
+        return ""
+    text = re.sub(r"\s+", " ", text)  # Collapse extra whitespace
+    text = re.sub(r"[^a-zA-Z0-9.,!?;:()'\" ]", "", text)  # Keep only relevant characters
+    return text.strip()
+
+def preprocess_books(input_path="data/books_summary.csv", output_path="data/books_summary_cleaned.csv"):
+    """Preprocesses the book dataset: deduplicates titles, fills missing values, cleans text."""
+    # Load dataset
+    df = pd.read_csv(input_path)
+
+    # Ensure required columns exist
+    required_cols = {"book_name", "summaries", "categories"}
+    if not required_cols.issubset(df.columns):
+        raise ValueError(f"Dataset must contain columns: {required_cols}")
+
+    # Fill missing values
+    df["summaries"] = df["summaries"].fillna("")
+    df["categories"] = df["categories"].fillna("Unknown")
+
+    # 🔹 Merge duplicate titles while keeping distinct categories
+    df = df.groupby("book_name", as_index=False).agg({
+        "summaries": "first",  # Keep the first summary per title
+        "categories": lambda x: "; ".join(set(x))  # Combine unique categories
+    })
+
+    # Clean the summary text
+    df["summaries"] = df["summaries"].apply(clean_text)
+
+    # 🔹 Create a combined feature from summary and categories
+    df["combined_text"] = df["summaries"] + " " + df["categories"]
+
+    # Save cleaned dataset
+    df.to_csv(output_path, index=False)
+    print("✅ Dataset cleaned and saved!")
+
+if __name__ == "__main__":
+    preprocess_books()
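To make the deduplication step concrete, a small illustration with hypothetical rows of how the groupby/agg call merges duplicate titles while combining their categories:

    import pandas as pd

    df = pd.DataFrame({
        "book_name": ["Dune", "Dune", "Hyperion"],
        "summaries": ["Desert planet saga.", "Desert planet saga.", "Pilgrims' tales."],
        "categories": ["Sci-Fi", "Classics", "Sci-Fi"],
    })
    merged = df.groupby("book_name", as_index=False).agg({
        "summaries": "first",
        "categories": lambda x: "; ".join(set(x)),
    })
    # "Dune" appears once, with categories like "Sci-Fi; Classics" (set order may vary)
    print(merged)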
recommender.py
ADDED
@@ -0,0 +1,35 @@
+import numpy as np
+import pickle
+from sentence_transformers import util
+from embeddings import load_book_data
+
+class BookRecommender:
+    def __init__(self, data_path="data/books_summary_cleaned.csv", emb_path="model/sbert_embeddings2.pkl"):
+        """Loads the cleaned book dataset and its precomputed embeddings."""
+        # Embeddings were computed over the cleaned dataset, so the same file
+        # must be loaded here to keep row indices aligned with the vectors.
+        self.df = load_book_data(data_path)
+
+        with open(emb_path, "rb") as f:
+            self.embeddings = pickle.load(f)
+
+    def recommend(self, book_title, top_n=5):
+        """Finds the top-N most similar books by cosine similarity."""
+        if book_title not in self.df["book_name"].values:
+            raise ValueError("Book title not found in dataset!")
+
+        # Get the book index
+        book_idx = self.df[self.df["book_name"] == book_title].index[0]
+
+        # Compute cosine similarity with all books
+        query_embedding = self.embeddings[book_idx]
+        scores = util.cos_sim(query_embedding, self.embeddings)[0]
+
+        # Sort by similarity, excluding the query book itself
+        top_indices = np.argsort(scores.numpy())[::-1][1:top_n + 1]
+
+        # Return (title, summary) pairs so app.py's re-ranker can score the summaries
+        return self.df.iloc[top_indices][["book_name", "summaries"]].values.tolist()
+
+# recommender = BookRecommender()
+# print(recommender.recommend("The End Of Stress"))  # Test with a known book title
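To illustrate the hand-off between the two stages, a minimal usage sketch (assumes the data files and embeddings are in place; the title comes from the commented test above):

    from recommender import BookRecommender

    rec = BookRecommender()
    # Stage 1: bi-encoder retrieval returns (title, summary) pairs
    candidates = rec.recommend("The End Of Stress", top_n=10)
    for title, summary in candidates:
        print(title, "->", summary[:60])
    # Stage 2: app.rerank_books scores each (query_summary, candidate_summary)
    # pair with the cross-encoder and keeps the five highest-scoring titles.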