from pathlib import Path

import pandas as pd


def format_docs(docs):
    """Print the contents of a list of Langchain Documents.

    Each document's ``page_content`` is printed under a ``Document N:``
    heading (numbered from 1); consecutive documents are separated by a
    line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            attribute.
    """
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )


def clean_and_format_text(text):
    """Normalize a text cell: straighten apostrophes and title-case words.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        For strings, the words joined by single spaces (leading, trailing
        and repeated whitespace is dropped by ``split``), each word
        capitalized unless it is an all-uppercase token longer than one
        character, which is kept as-is. For anything else, the input
        itself.
    """
    if not isinstance(text, str):
        return text

    # Replace curly apostrophes with straight ones
    text = text.replace("\u2019", "'")

    # Title case words, preserving acronyms (all-caps tokens of length > 1)
    title_words = [
        word if word.isupper() and len(word) > 1 else word.capitalize()
        for word in text.split()
    ]
    return " ".join(title_words)


def categorize_location(location):
    """Map known neighbourhood names to their city.

    Args:
        location: Location value. Matching is a case-insensitive substring
            search for "cordova bay" or "james bay".

    Returns:
        ``"Victoria"`` when the location mentions one of the known
        neighbourhoods; otherwise the input unchanged. Non-string values
        (e.g. NaN or numbers) are returned unchanged instead of raising.
    """
    if not isinstance(location, str):
        return location
    lowered = location.lower()
    if any(place in lowered for place in ("cordova bay", "james bay")):
        return "Victoria"
    return location


def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
    """Load an Excel file, clean its contents, and generate a pd.DataFrame.

    The directory must contain exactly one ``.xlsx`` file. Column names are
    title-cased, string cells in every object column except "Booking Link"
    are cleaned with ``clean_and_format_text``, missing values are replaced
    with "Information Not Available", and a "City" column is derived from
    the "Location" column.

    Args:
        data_directory (str): File path to the directory where the Excel
            file is located. A ``pathlib.Path`` is accepted as well.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified
            directory.
        ValueError: If more than one Excel file is found in the specified
            directory.

    Returns:
        pd.DataFrame: The cleaned data with the additional "City" column.
    """
    # Coerce to Path so plain string paths (as documented) support iterdir()
    directory = Path(data_directory)

    # Get the xlsx file name (one excel worksheet)
    excel_files = [file for file in directory.iterdir() if file.suffix == ".xlsx"]
    if not excel_files:
        raise FileNotFoundError("No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError("More than one Excel file found in the specified directory.")
    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine="openpyxl")

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean data. clean_and_format_text is applied directly rather than after
    # `.str.strip()`: the `.str` accessor turns non-string cells of a mixed
    # column into NaN (losing their values), while the helper leaves
    # non-strings untouched and already trims surrounding whitespace.
    for col in df.columns:
        if col.lower() != "booking link" and df[col].dtype == "object":
            df[col] = df[col].apply(clean_and_format_text)

    # Handle missing values
    df = df.fillna("Information Not Available")

    # Add city column
    df["City"] = df["Location"].apply(categorize_location)

    return df