Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import pandas as pd | |
def format_docs(docs): | |
"""Print the contents of a list of Langchain Documents. | |
Args: | |
docs (str): | |
""" | |
print( | |
f"\n{'-' * 100}\n".join( | |
[f"Document {i+1}:\n\n" + | |
d.page_content for i, d in enumerate(docs)] | |
) | |
) | |
def excel_to_dataframe(data_directory: str) -> pd.DataFrame: | |
"""Load an Excel file, clean its contents, and generate a pd.Dataframe. | |
Args: | |
data_directory (str): File path to the directory where the Excel file is located. | |
Raises: | |
FileNotFoundError: If no Excel files are found in the specified directory. | |
Returns: | |
pd.Dataframe: | |
""" | |
# Get the xls file name (one excel worksheet) | |
excel_files = [file for file in data_directory.iterdir() | |
if file.suffix == '.xlsx'] | |
if not excel_files: | |
raise FileNotFoundError( | |
"No Excel files found in the specified directory.") | |
if len(excel_files) > 1: | |
raise ValueError( | |
"More than one Excel file found in the specified directory.") | |
path = excel_files[0] | |
# Load Excel file | |
df = pd.read_excel(path, engine='openpyxl') | |
# Change column names to title case | |
df.columns = df.columns.str.title() | |
# Function to replace curly apostrophes with straight ones | |
def replace_apostrophes(text): | |
if isinstance(text, str): | |
return text.replace("\u2019", "'") | |
return text | |
# Clean data | |
# Trim strings, standardize text (convert to title case), and replace apostrophes | |
for col in df.columns: | |
# If the column is text-based | |
if col.lower() != 'booking link' and df[col].dtype == 'object': | |
# Trim, standardize case, and replace apostrophes | |
df[col] = df[col].str.strip().str.title().apply(replace_apostrophes) | |
# Handle missing values | |
df.fillna('Information Not Available', inplace=True) | |
return df | |