"""Clean and de-duplicate a book-summary CSV for downstream text processing."""

import re

import pandas as pd

# Compiled once at import time; clean_text is applied to every row.
_WHITESPACE_RE = re.compile(r"\s+")
_DISALLOWED_RE = re.compile(r"[^a-zA-Z0-9.,!?;:()'\" ]")


def clean_text(text: object) -> str:
    """Return *text* with special characters and extra spaces removed.

    Missing values (``None``, ``NaN``, ``pd.NA``) become ``""``. Other
    non-string scalars are converted with ``str()`` before cleaning. Only
    ASCII letters, digits, spaces and ``.,!?;:()'"`` are kept.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # Turn every whitespace run (newlines, tabs, ...) into one space *before*
    # stripping, so words separated only by "\n" or "\t" are not glued together.
    text = _WHITESPACE_RE.sub(" ", text)
    text = _DISALLOWED_RE.sub("", text)
    # Removing a character that stood between two spaces leaves a double
    # space behind, so collapse once more before trimming.
    return _WHITESPACE_RE.sub(" ", text).strip()


def _first_non_empty(values) -> str:
    """Return the first non-empty string in *values*, or ``""`` if none."""
    return next((value for value in values if value), "")


def _join_unique(values) -> str:
    """Join the distinct items of *values* with ``"; "`` in first-seen order."""
    # dict.fromkeys de-duplicates while keeping insertion order, which makes
    # the output reproducible (set iteration order varies between runs).
    return "; ".join(dict.fromkeys(map(str, values)))


def preprocess_books(
    input_path: str = "data/books_summary.csv",
    output_path: str = "data/books_summary_cleaned.csv",
) -> None:
    """Preprocess the book dataset and write the cleaned CSV.

    Steps:
      1. Clean every summary with :func:`clean_text` (missing -> ``""``).
      2. Replace missing categories with ``"Unknown"``.
      3. Merge rows sharing a ``book_name``: keep the first non-empty summary
         and join the distinct categories with ``"; "`` in first-seen order.
      4. Add ``combined_text`` = title + summary + categories.

    The input CSV must have ``book_name``, ``summaries`` and ``categories``
    columns. Rows whose ``book_name`` is missing are dropped by the group-by
    (pandas' default ``dropna=True``).

    Args:
        input_path: Path of the raw CSV to read.
        output_path: Path the cleaned CSV is written to (without the index).
    """
    # Load dataset
    df = pd.read_csv(input_path)

    # clean_text maps missing values to "", so no separate fillna is needed.
    df["summaries"] = df["summaries"].map(clean_text)
    df["categories"] = df["categories"].fillna("Unknown")

    # Merge duplicate titles while keeping distinct categories. A plain
    # "first" would return "" whenever the first duplicate lacks a summary,
    # because missing summaries are already empty strings at this point.
    df = df.groupby("book_name", as_index=False).agg(
        {"summaries": _first_non_empty, "categories": _join_unique}
    )

    # Create a new feature combining title, summary, and categories
    df["combined_text"] = (
        df["book_name"].astype(str)
        + " "
        + df["summaries"]
        + " "
        + df["categories"]
    )

    df.to_csv(output_path, index=False)
    print("Dataset cleaned and saved!")


if __name__ == "__main__":
    preprocess_books()