Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 2,190 Bytes
9e95b48 93d3140 9e95b48 fcae4fc 93d3140 fcae4fc 93d3140 fcae4fc 9e95b48 93d3140 9e95b48 93d3140 9e95b48 93d3140 9e95b48 93d3140 9e95b48 93d3140 fcae4fc 9e95b48 93d3140 9e95b48 fcae4fc 93d3140 fcae4fc 9e95b48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
from pathlib import Path
from typing import Union

import pandas as pd
def format_docs(docs):
    """Print every document's text, numbered and separated by dashed rules.

    Each entry is written as ``Document <n>:`` (numbering starts at 1),
    a blank line, and then the document's ``page_content``. Consecutive
    entries are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content``
            attribute, e.g. Langchain Documents.

    Returns:
        None. The formatted text is written to standard output.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))
def clean_and_format_text(text):
    """Normalise a text value: straighten apostrophes and title-case words.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        For strings, the whitespace-separated words re-joined with single
        spaces. Words of two or more characters that are entirely upper
        case (acronyms) are kept as they are; every other word is
        capitalised (first character upper, the rest lower).
    """
    if not isinstance(text, str):
        return text

    # Curly right apostrophe (U+2019) becomes a plain ASCII apostrophe.
    straightened = text.replace("\u2019", "'")

    formatted = []
    for token in straightened.split():
        if len(token) > 1 and token.isupper():
            # Preserve acronyms.
            formatted.append(token)
        else:
            formatted.append(token.capitalize())
    return " ".join(formatted)
def categorize_location(location):
    """Map a neighbourhood-level location onto its city.

    Args:
        location: Location text. Non-string values (``None``, NaN,
            numbers) are returned unchanged instead of raising.

    Returns:
        ``"Victoria"`` when the text contains "cordova bay" or
        "james bay" (case-insensitive); otherwise ``location`` itself.
    """
    # Non-string cells have no ``lower`` method; pass them through the same
    # way clean_and_format_text does rather than raising AttributeError.
    if not isinstance(location, str):
        return location

    lowered = location.lower()
    if any(place in lowered for place in ("cordova bay", "james bay")):
        return "Victoria"
    return location
def excel_to_dataframe(data_directory: Union[str, Path]) -> pd.DataFrame:
    """Load the single Excel workbook in a directory into a cleaned DataFrame.

    Column names are title-cased, text columns other than "Booking Link"
    are normalised with ``clean_and_format_text``, missing values are
    replaced by "Information Not Available", and a "City" column is
    derived from the "Location" column with ``categorize_location``.

    Args:
        data_directory: Directory that contains exactly one ``.xlsx``
            file, given either as a string or as a ``pathlib.Path``.

    Raises:
        FileNotFoundError: If no Excel files are found in the specified
            directory.
        ValueError: If more than one Excel file is found in the
            specified directory.
        KeyError: If the workbook has no "Location" column (after
            title-casing the column names).

    Returns:
        pd.DataFrame: The cleaned data with the added "City" column.
    """
    # The signature has always advertised ``str``, but ``str`` has no
    # ``iterdir``; coerce so both strings and Path objects work.
    directory = Path(data_directory)

    # Skip the "~$<name>.xlsx" lock files Excel creates while a workbook is
    # open; they are not readable workbooks and would trip the
    # "more than one file" check below.
    excel_files = [
        file
        for file in directory.iterdir()
        if file.suffix == ".xlsx" and not file.name.startswith("~$")
    ]
    if not excel_files:
        raise FileNotFoundError("No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError("More than one Excel file found in the specified directory.")
    path = excel_files[0]

    # Load Excel file
    df = pd.read_excel(path, engine="openpyxl")

    # Change column names to title case
    df.columns = df.columns.str.title()

    # Clean text columns; booking links are left untouched so they stay valid.
    for col in df.columns:
        if col.lower() == "booking link":
            continue
        dtype = df[col].dtype
        # Text may arrive as ``object`` or as a dedicated string dtype,
        # depending on the pandas version/configuration.
        if dtype == "object" or pd.api.types.is_string_dtype(dtype):
            # Do not chain ``.str.strip()`` here: on a mixed column it turns
            # every non-string cell into NaN, which would then be overwritten
            # by the placeholder below. clean_and_format_text already trims
            # surrounding whitespace and passes non-strings through.
            df[col] = df[col].apply(clean_and_format_text)

    # Handle missing values
    df.fillna("Information Not Available", inplace=True)

    # Add city column
    df["City"] = df["Location"].apply(categorize_location)
    return df
|