Spaces:

tall-tree
/

ai-virtual-assistant

Running on CPU Upgrade

File size: 2,190 Bytes

9e95b48
 
 
 
 
 
 
 
 
 
93d3140
9e95b48
 
 
 
fcae4fc
 
 
 
 
 
93d3140
 
 
 
 
fcae4fc
 
 
 
 
93d3140
 
fcae4fc
 
 
9e95b48
 
 
 
 
 
 
 
 
 
 
 
 
 
93d3140
9e95b48
 
93d3140
9e95b48
93d3140
9e95b48
 
 
 
93d3140
9e95b48
 
 
 
 
 
93d3140
fcae4fc
9e95b48
 
93d3140
9e95b48
fcae4fc
93d3140
fcae4fc
9e95b48

from pathlib import Path

import pandas as pd


def format_docs(docs):
    """Print the text of each Langchain Document, one after another.

    Every document is introduced by a ``Document <n>:`` heading (numbered
    from 1) and consecutive documents are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            attribute (e.g. Langchain ``Document`` instances).
    """
    separator = "\n" + "-" * 100 + "\n"

    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)

    print(separator.join(sections))


def clean_and_format_text(text):
    """Normalise a text value: straight apostrophes, single spaces, capitalised words.

    Words that are entirely upper case and longer than one character are
    treated as acronyms and kept as they are; every other word has its first
    letter upper-cased and the remainder lower-cased. Runs of whitespace are
    collapsed to single spaces and leading/trailing whitespace is dropped.

    Args:
        text: The value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The cleaned string, or the original value if it was not a string.
    """
    if not isinstance(text, str):
        return text

    # Curly apostrophes (U+2019) become plain ASCII apostrophes.
    normalized = text.replace("\u2019", "'")

    formatted = []
    for token in normalized.split():
        is_acronym = len(token) > 1 and token.isupper()
        formatted.append(token if is_acronym else token.capitalize())

    return " ".join(formatted)


def categorize_location(location):
    """Collapse known neighbourhood names into the city "Victoria".

    Args:
        location: Location text to classify. Matching is case-insensitive
            and by substring, so "123 James Bay Rd" also matches.

    Returns:
        ``"Victoria"`` when the location mentions "cordova bay" or
        "james bay"; otherwise the value that was passed in, unchanged.
        Non-string values (``None``, NaN, numbers) are returned as-is.
    """
    # Non-text cells cannot name a neighbourhood; pass them through instead
    # of failing on the missing ``.lower()`` method.
    if not isinstance(location, str):
        return location

    lowered = location.lower()  # lower-case once, not once per candidate
    if any(place in lowered for place in ("cordova bay", "james bay")):
        return "Victoria"
    return location


def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
    """Load the single Excel workbook in a directory and return it cleaned.

    The directory must contain exactly one ``.xlsx`` file. After loading,
    column names are title-cased, text cells (except in the "Booking Link"
    column) are normalised with ``clean_and_format_text``, missing values are
    replaced with "Information Not Available", and a "City" column is derived
    from the "Location" column with ``categorize_location``.

    Args:
        data_directory (str): Path to the directory where the Excel file is
            located. A ``pathlib.Path`` is accepted as well.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified directory.
        ValueError: If more than one Excel file is found in the specified directory.

    Returns:
        pd.DataFrame: The cleaned data, including the added "City" column.

    """
    # Accept both plain strings and Path objects; a bare str has no iterdir().
    directory = Path(data_directory)

    # Find the .xlsx workbook (exactly one is expected)
    excel_files = [file for file in directory.iterdir() if file.suffix == ".xlsx"]

    if not excel_files:
        raise FileNotFoundError("No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError("More than one Excel file found in the specified directory.")

    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine="openpyxl")

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean text columns. Booking links are left untouched because changing
    # their letter case could alter the URL.
    for col in df.columns:
        if col.lower() == "booking link":
            continue
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(
            column
        ):
            # Apply the cleaner cell by cell rather than going through the
            # ``.str`` accessor: ``.str.strip()`` replaces every non-string
            # cell of a mixed column with NaN (losing the value) and raises on
            # object columns that hold no strings. clean_and_format_text
            # already trims surrounding whitespace and returns non-strings
            # unchanged.
            df[col] = column.apply(clean_and_format_text)

    # Handle missing values
    df = df.fillna("Information Not Available")

    # Add city column
    df["City"] = df["Location"].apply(categorize_location)

    return df