import pandas as pd
import re

def clean_text(text):
    """Normalize a text value to a restricted ASCII character set.

    Processing steps, in order:

    1. Missing values (``None``, ``NaN``, ``pd.NA``, ``NaT``) become ``""``.
    2. Any other value is converted with ``str()`` so non-string cells
       (e.g. numbers) do not make the regex calls raise ``TypeError``.
    3. Every run of whitespace (spaces, tabs, newlines) becomes one space.
    4. Every character other than ASCII letters, digits, the punctuation
       ``. , ! ? ; : ( ) ' "`` and the space is deleted.
    5. Runs of spaces left behind by step 4 are collapsed again, and the
       result is stripped of leading/trailing spaces.

    Args:
        text: A scalar value, normally a string or a missing-value marker.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    text = str(text)
    # Turn tabs/newlines into spaces *before* filtering: the filter below
    # only keeps the plain space, so other whitespace would otherwise be
    # deleted outright and glue neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z0-9.,!?;:()'\" ]", "", text)
    # Deleting a symbol that sat between two spaces ("a @ b") leaves a
    # double space behind, so collapse once more after filtering.
    text = re.sub(r" {2,}", " ", text)
    return text.strip()

def preprocess_books(input_path="data/books_summary.csv", output_path="data/books_summary_cleaned.csv"):
    """Deduplicate, fill, and clean the book dataset, then write it to CSV.

    The input CSV must contain the columns ``book_name``, ``summaries`` and
    ``categories``. Rows sharing a ``book_name`` are merged into one row:

    * ``summaries``: the first non-missing summary of the group, passed
      through ``clean_text``; ``""`` when the group has no summary at all.
    * ``categories``: the distinct non-missing categories of the group in
      first-seen order, joined with ``"; "``; ``"Unknown"`` when the group
      has none.
    * ``combined_text``: the cleaned summary and the merged categories
      separated by a space, with surrounding whitespace stripped.

    Rows whose ``book_name`` is missing are not kept, because the group-by
    uses pandas' default of dropping NA group keys.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write (without the index).

    Returns:
        None. The result is written to ``output_path`` and a confirmation
        message is printed.
    """
    df = pd.read_csv(input_path)

    def _join_categories(values):
        # dict.fromkeys de-duplicates while preserving first-seen order, so
        # the output is identical on every run (joining a set is not, since
        # string hashing is randomized per process).
        distinct = dict.fromkeys(str(value) for value in values.dropna())
        # Fall back to the placeholder only when no real category exists,
        # so it never gets mixed in with genuine categories.
        return "; ".join(distinct) if distinct else "Unknown"

    # Merge duplicate titles. Missing values are deliberately NOT filled
    # before this step: "first" skips NA, so a duplicate row with a missing
    # summary cannot shadow a later row that actually has one.
    df = df.groupby("book_name", as_index=False).agg({
        "summaries": "first",
        "categories": _join_categories,
    })

    # Fill what is still missing after the merge, then normalize the text.
    df["summaries"] = df["summaries"].fillna("").astype(str).map(clean_text)

    # Create a new feature combining summary and categories; strip so an
    # empty summary does not leave a leading space.
    df["combined_text"] = (df["summaries"] + " " + df["categories"]).str.strip()

    df.to_csv(output_path, index=False)
    print("Dataset cleaned and saved!")

# Script entry point: run the preprocessing with its default paths when this
# file is executed directly; importing the module does not trigger it.
if __name__ == "__main__":
    preprocess_books()