# Spaces: Running on CPU Upgrade
from pathlib import Path

import pandas as pd
def format_docs(docs):
    """Print the contents of a list of Langchain Documents.

    Each document is rendered as a ``Document N:`` header (numbered from 1),
    a blank line, and the document's ``page_content``. Consecutive documents
    are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))
def clean_and_format_text(text):
    """Title-case a string while leaving multi-letter acronyms untouched.

    Curly apostrophes (U+2019) are replaced with straight ones, the text is
    split on whitespace, and the words are re-joined with single spaces.
    A word that is entirely upper case and longer than one character is kept
    as-is; every other word is capitalized (first letter upper, rest lower).

    Args:
        text: Value to format. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The formatted string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    normalised = text.replace("\u2019", "'")
    formatted = []
    for token in normalised.split():
        # Single upper-case letters (e.g. "A") are not treated as acronyms.
        is_acronym = len(token) > 1 and token.isupper()
        formatted.append(token if is_acronym else token.capitalize())
    return ' '.join(formatted)
def categorize_location(location):
    """Map a location to its city, folding known neighbourhoods into Victoria.

    Args:
        location: Location text. Matching is case-insensitive and by
            substring, so 'James Bay Square' also matches.

    Returns:
        'Victoria' when ``location`` contains 'cordova bay' or 'james bay';
        otherwise ``location`` unchanged. Non-string values are returned
        unchanged.
    """
    # Non-string values (e.g. None or NaN from an empty cell) have no
    # .lower(); pass them through the same way clean_and_format_text does.
    if not isinstance(location, str):
        return location

    lowered = location.lower()
    if any(place in lowered for place in ('cordova bay', 'james bay')):
        return 'Victoria'
    return location
def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
    """Load an Excel file, clean its contents, and generate a pd.DataFrame.

    The directory must contain exactly one ``.xlsx`` file. Column names are
    converted to title case, string cells in every object-dtype column except
    'Booking Link' are passed through ``clean_and_format_text``, missing
    values are replaced with 'Information Not Available', and a 'City' column
    is derived from 'Location' via ``categorize_location``.

    Args:
        data_directory (str): File path to the directory where the Excel file
            is located. A ``pathlib.Path`` is accepted as well.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified
            directory.
        ValueError: If more than one Excel file is found in the specified
            directory.

    Returns:
        pd.DataFrame: The cleaned data with the added 'City' column.
    """
    # The parameter is documented as str, but str has no iterdir(); wrapping
    # in Path makes both str and Path arguments work.
    directory = Path(data_directory)

    # Get the xlsx file name (one excel worksheet)
    excel_files = [file for file in directory.iterdir()
                   if file.suffix == '.xlsx']
    if not excel_files:
        raise FileNotFoundError(
            "No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError(
            "More than one Excel file found in the specified directory.")
    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine='openpyxl')

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean data
    for col in df.columns:
        if col.lower() != 'booking link' and df[col].dtype == 'object':
            # Apply element-wise instead of going through the .str accessor:
            # .str.strip() turns non-string cells into NaN (and raises on
            # object columns holding no strings at all). The helper already
            # trims surrounding whitespace via split/join and returns
            # non-string values unchanged.
            df[col] = df[col].apply(clean_and_format_text)

    # Handle missing values
    df.fillna('Information Not Available', inplace=True)

    # Add city column
    df['City'] = df['Location'].apply(categorize_location)
    return df