from pathlib import Path

import pandas as pd


def format_docs(docs):
    """Print the text of each document, numbered and separated by a rule.

    Each entry is rendered as a "Document N:" header (N starting at 1),
    a blank line, and the document's ``page_content``. Consecutive entries
    are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content``
            attribute (e.g. Langchain Documents).
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(separator.join(sections))


def clean_and_format_text(text):
    """Normalise apostrophes and title-case a string, keeping acronyms.

    Curly apostrophes (U+2019) become straight ones. The text is split on
    whitespace, each word is capitalised (first letter upper, rest lower)
    unless it is an all-uppercase word longer than one character, and the
    words are re-joined with single spaces. Because of the split/join,
    leading, trailing and repeated whitespace is collapsed.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    normalised = text.replace("\u2019", "'")
    formatted = []
    for token in normalised.split():
        # Multi-letter all-caps tokens are treated as acronyms and kept.
        is_acronym = len(token) > 1 and token.isupper()
        formatted.append(token if is_acronym else token.capitalize())
    return " ".join(formatted)


def categorize_location(location):
    """Map a location to the city it is grouped under.

    Locations whose name contains "cordova bay" or "james bay"
    (case-insensitive) are reported as "Victoria"; every other location is
    returned as given.

    Args:
        location: Location name. Non-string values (e.g. ``None`` or NaN)
            are returned unchanged instead of raising, mirroring how
            ``clean_and_format_text`` treats non-string input.

    Returns:
        "Victoria" for the recognised neighbourhoods, otherwise ``location``.
    """
    if not isinstance(location, str):
        return location

    # Lower-case once rather than once per candidate place.
    lowered = location.lower()
    if any(place in lowered for place in ("cordova bay", "james bay")):
        return "Victoria"
    return location


def excel_to_dataframe(data_directory: "str | Path") -> pd.DataFrame:
    """Load the single Excel file in a directory into a cleaned DataFrame.

    The directory must contain exactly one ``.xlsx`` file. After loading,
    column names are title-cased, text columns other than "Booking Link" are
    cleaned with ``clean_and_format_text``, missing values are replaced with
    "Information Not Available", and a "City" column is derived from the
    "Location" column via ``categorize_location``.

    Args:
        data_directory (str | Path): Path to the directory where the Excel
            file is located.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified
            directory.
        ValueError: If more than one Excel file is found in the specified
            directory.
        KeyError: If the loaded sheet has no "Location" column.

    Returns:
        pd.DataFrame: The cleaned data with the added "City" column.

    """
    # Accept plain strings as well as Path objects: str has no iterdir().
    directory = Path(data_directory)

    # Get the xlsx file name (one excel worksheet)
    excel_files = [file for file in directory.iterdir() if file.suffix == ".xlsx"]

    if not excel_files:
        raise FileNotFoundError("No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError("More than one Excel file found in the specified directory.")

    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine="openpyxl")

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean data
    for col in df.columns:
        if col.lower() == "booking link":
            # Links are left untouched so their casing is not altered.
            continue
        dtype = df[col].dtype
        # Text may be stored as object or as pandas' dedicated string dtype.
        if dtype == "object" or pd.api.types.is_string_dtype(dtype):
            # Apply the cleaner element-wise instead of going through
            # ``.str.strip()`` first: the ``.str`` accessor turns non-string
            # cells of a mixed column into NaN (losing their value) and
            # raises on object columns that hold no strings at all. The
            # cleaner already strips surrounding whitespace via split/join
            # and passes non-string values through unchanged.
            df[col] = df[col].apply(clean_and_format_text)

    # Handle missing values
    df.fillna("Information Not Available", inplace=True)

    # Add city column
    df["City"] = df["Location"].apply(categorize_location)

    return df