File size: 2,240 Bytes
9e95b48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fcae4fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e95b48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fcae4fc
9e95b48
 
 
 
fcae4fc
 
 
9e95b48
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from pathlib import Path
from typing import Union

import pandas as pd


def format_docs(docs):
    """Print the text of each document, numbered and separated by dashed rules.

    Each entry is rendered as ``Document <n>:`` (numbering starts at 1),
    a blank line, then the document's ``page_content``. Entries are separated
    by a line of 100 dashes. Nothing is returned; the result goes to stdout.

    Args:
        docs: Iterable of objects exposing a string ``page_content``
            attribute (e.g. Langchain Documents).
    """
    separator = f"\n{'-' * 100}\n"
    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)
    print(separator.join(sections))


def clean_and_format_text(text):
    """Normalise a string to title case while keeping acronyms intact.

    Curly apostrophes (U+2019) become straight ones, runs of whitespace
    collapse to single spaces (leading/trailing whitespace is dropped), and
    every word is capitalised. Words longer than one character that are
    already fully upper case are treated as acronyms and left unchanged.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned
            untouched.

    Returns:
        The cleaned string, or the original value when it is not a string.
    """
    if not isinstance(text, str):
        return text

    straightened = text.replace("\u2019", "'")
    formatted = []
    for token in straightened.split():
        # Multi-character all-caps tokens are acronyms; keep them verbatim
        is_acronym = len(token) > 1 and token.isupper()
        formatted.append(token if is_acronym else token.capitalize())
    return ' '.join(formatted)


def categorize_location(location):
    """Map a location to its city, folding known neighbourhoods into 'Victoria'.

    Args:
        location: Location text. Non-string values (e.g. ``None`` or NaN from
            a blank cell) are returned unchanged.

    Returns:
        'Victoria' when the text contains 'cordova bay' or 'james bay'
        (case-insensitive substring match); otherwise the input itself.
    """
    # Non-strings have no .lower(); pass them through instead of raising
    if not isinstance(location, str):
        return location

    # Lower-case once rather than once per candidate place
    lowered = location.lower()
    if any(place in lowered for place in ('cordova bay', 'james bay')):
        return 'Victoria'
    return location


def excel_to_dataframe(data_directory: Union[str, Path]) -> pd.DataFrame:
    """Load the single Excel workbook in a directory into a cleaned DataFrame.

    Column names are title-cased, text cells in every object-dtype column
    except 'Booking Link' are normalised with ``clean_and_format_text``,
    missing values are replaced with 'Information Not Available', and a
    'City' column is derived from 'Location' via ``categorize_location``.

    Args:
        data_directory: Directory containing exactly one ``.xlsx`` file.
            Accepts either a string path or a ``pathlib.Path``.

    Raises:
        FileNotFoundError: If no ``.xlsx`` file is found in the directory
            (also raised by the directory listing when the path is missing).
        ValueError: If more than one ``.xlsx`` file is found.
        KeyError: If there is no 'Location' column after title-casing.

    Returns:
        pd.DataFrame: The cleaned data with the added 'City' column.
    """
    # Coerce to Path so plain string paths work; str has no .iterdir()
    directory = Path(data_directory)

    # Find the .xlsx workbook (exactly one is expected)
    excel_files = [file for file in directory.iterdir()
                   if file.suffix == '.xlsx']

    if not excel_files:
        raise FileNotFoundError(
            "No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError(
            "More than one Excel file found in the specified directory.")

    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine='openpyxl')

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean data. Booking links are left verbatim: title-casing would
    # corrupt a URL.
    for col in df.columns:
        if col.lower() != 'booking link' and df[col].dtype == 'object':
            # Apply element-wise instead of going through `.str.strip()`:
            # the `.str` accessor turns non-string cells of a mixed column
            # into NaN (silently discarding them). clean_and_format_text
            # already drops surrounding whitespace and returns non-strings
            # unchanged.
            df[col] = df[col].apply(clean_and_format_text)

    # Handle missing values
    df.fillna('Information Not Available', inplace=True)

    # Add city column
    df['City'] = df['Location'].apply(categorize_location)

    return df