ai-virtual-assistant / utils /data_processing.py
talltree's picture
Upload 3 files
9e95b48 verified
raw
history blame
1.95 kB
from pathlib import Path

import pandas as pd
def format_docs(docs):
    """Print the text of each document, numbered from 1, to standard output.

    Consecutive documents are separated by a rule of 100 hyphens with a
    newline on either side. Nothing is returned.

    Args:
        docs: Iterable of Langchain Documents (any object exposing a string
            ``page_content`` attribute works).
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = [
        f"Document {number}:\n\n" + doc.page_content
        for number, doc in enumerate(docs, start=1)
    ]
    print(separator.join(sections))
def excel_to_dataframe(data_directory: "str | Path") -> pd.DataFrame:
    """Load the single Excel workbook in a directory into a cleaned DataFrame.

    Cleaning steps, in order:
      1. String column names are converted to title case.
      2. In every text column except "Booking Link", string cells are
         stripped of surrounding whitespace, title-cased, and have curly
         apostrophes (U+2019) replaced with straight ones. Non-string cells
         (numbers, dates, missing values) are left untouched.
      3. All remaining missing values are replaced with the text
         "Information Not Available".

    Args:
        data_directory: Directory expected to contain exactly one ``.xlsx``
            file. May be given as a string or a ``pathlib.Path``.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified
            directory.
        ValueError: If more than one Excel file is found in the specified
            directory.

    Returns:
        pd.DataFrame: The cleaned contents of the workbook, as read by
        ``pd.read_excel`` with its default sheet selection.
    """
    # Coerce to Path so that plain strings work too: the signature has always
    # advertised ``str``, but ``str`` has no ``iterdir``.
    directory = Path(data_directory)

    # Skip "~$*.xlsx" lock files that Excel drops next to an open workbook;
    # they are not readable workbooks and would trip the "more than one" check.
    excel_files = [
        entry for entry in directory.iterdir()
        if entry.suffix == '.xlsx' and not entry.name.startswith('~$')
    ]
    if not excel_files:
        raise FileNotFoundError(
            "No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError(
            "More than one Excel file found in the specified directory.")

    df = pd.read_excel(excel_files[0], engine='openpyxl')

    # Title-case string headers; leave any non-string header as it is.
    df.columns = [
        name.title() if isinstance(name, str) else name for name in df.columns
    ]

    def clean_text(value):
        """Trim, title-case and de-curl one cell; non-strings pass through."""
        if isinstance(value, str):
            # NOTE: str.title() also capitalizes the letter after an
            # apostrophe ("women's" -> "Women'S"). Kept as-is so existing
            # output does not change.
            return value.strip().title().replace("\u2019", "'")
        return value

    for col in df.columns:
        # Booking links are kept exactly as read (presumably URLs, which
        # title-casing would corrupt -- confirm against the data).
        if isinstance(col, str) and col.lower() == 'booking link':
            continue
        column = df[col]
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if is_text:
            # Clean cell by cell: the vectorized ``.str`` accessor turns every
            # non-string cell of a mixed column into NaN, silently losing data.
            df[col] = column.map(clean_text)

    # Handle missing values
    return df.fillna('Information Not Available')