Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
from pathlib import Path
from typing import Union

import pandas as pd
def format_docs(docs):
    """Print the text of each document, numbered from 1.

    Consecutive documents are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content``
            attribute (e.g. Langchain Documents).
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        # Plain concatenation (not interpolation) so that a non-string
        # page_content still fails loudly instead of being stringified.
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))
def clean_and_format_text(text):
    """Normalise a text value to title-style capitalisation.

    Curly apostrophes (U+2019) become straight ones, runs of whitespace
    collapse to single spaces, and each word is capitalised. Words of more
    than one character that are already fully upper-case are treated as
    acronyms and kept as they are.

    Args:
        text: The value to clean. Anything that is not a ``str`` is
            returned untouched.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    def _format_word(token):
        # Multi-letter all-caps tokens are assumed to be acronyms.
        is_acronym = len(token) > 1 and token.isupper()
        return token if is_acronym else token.capitalize()

    normalised = text.replace("\u2019", "'")
    return " ".join(map(_format_word, normalised.split()))
def categorize_location(location):
    """Collapse known Victoria neighbourhoods into the city name.

    Args:
        location: A location value. Strings are matched case-insensitively
            against the neighbourhood names "cordova bay" and "james bay".

    Returns:
        "Victoria" when ``location`` is a string containing one of those
        neighbourhood names; otherwise ``location`` unchanged. Non-string
        values (``None``, NaN, numbers) are returned as-is rather than
        raising, mirroring how ``clean_and_format_text`` treats them.
    """
    if not isinstance(location, str):
        return location
    # Lower-case once instead of once per candidate neighbourhood.
    lowered = location.lower()
    if any(place in lowered for place in ("cordova bay", "james bay")):
        return "Victoria"
    return location
def excel_to_dataframe(data_directory: Union[str, Path]) -> pd.DataFrame:
    """Load the single Excel workbook in a directory and return it cleaned.

    Column headers are title-cased, every text column except "Booking Link"
    is normalised with ``clean_and_format_text``, missing values are replaced
    by "Information Not Available", and a "City" column is derived from
    "Location" via ``categorize_location``.

    Args:
        data_directory: Directory containing exactly one ``.xlsx`` file.
            Accepts either a string path or a ``pathlib.Path``.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified
            directory.
        ValueError: If more than one Excel file is found in the specified
            directory.
        KeyError: If the workbook has no "Location" column once the headers
            have been title-cased.

    Returns:
        pd.DataFrame: The cleaned data, including the added "City" column.
    """
    # Coerce so that the documented ``str`` argument works; a bare string
    # has no ``iterdir``.
    directory = Path(data_directory)

    # Excel writes "~$<name>.xlsx" owner/lock files next to an open workbook;
    # they are not readable workbooks and must not count as a second file.
    excel_files = [
        file
        for file in directory.iterdir()
        if file.suffix == ".xlsx" and not file.name.startswith("~$")
    ]
    if not excel_files:
        raise FileNotFoundError("No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError("More than one Excel file found in the specified directory.")
    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine="openpyxl")

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean data
    for col in df.columns:
        if col.lower() == "booking link":
            # Links are case-sensitive; leave them exactly as entered.
            continue
        dtype = df[col].dtype
        # Text may be stored as generic object dtype or as pandas' dedicated
        # string dtype, depending on the pandas version/configuration.
        if dtype == "object" or isinstance(dtype, pd.StringDtype):
            # No ``.str.strip()`` here: the ``.str`` accessor turns every
            # non-string cell of a mixed column into NaN, silently dropping
            # data. ``clean_and_format_text`` already trims surrounding
            # whitespace and passes non-strings through untouched.
            df[col] = df[col].apply(clean_and_format_text)

    # Handle missing values
    df.fillna("Information Not Available", inplace=True)

    # Add city column
    df["City"] = df["Location"].apply(categorize_location)
    return df