File size: 1,954 Bytes
9e95b48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd


def format_docs(docs):
    """Print the contents of a list of Langchain Documents.
    Args:
        docs (str):
    """
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" +
                d.page_content for i, d in enumerate(docs)]
        )
    )


def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
    """Load an Excel file, clean its contents, and generate a pd.Dataframe.

    Args:
        data_directory (str): File path to the directory where the Excel file is located.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified directory.

    Returns:
        pd.Dataframe:

    """
    # Get the xls file name (one excel worksheet)
    excel_files = [file for file in data_directory.iterdir()
                   if file.suffix == '.xlsx']

    if not excel_files:
        raise FileNotFoundError(
            "No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError(
            "More than one Excel file found in the specified directory.")

    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine='openpyxl')

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Function to replace curly apostrophes with straight ones
    def replace_apostrophes(text):
        if isinstance(text, str):
            return text.replace("\u2019", "'")
        return text

    # Clean data
    # Trim strings, standardize text (convert to title case), and replace apostrophes
    for col in df.columns:
        # If the column is text-based
        if col.lower() != 'booking link' and df[col].dtype == 'object':
            # Trim, standardize case, and replace apostrophes
            df[col] = df[col].str.strip().str.title().apply(replace_apostrophes)

    # Handle missing values
    df.fillna('Information Not Available', inplace=True)

    return df