Spaces:

tall-tree
/

ai-virtual-assistant

Running on CPU Upgrade

App Files Files

ai-virtual-assistant / utils /data_processing.py

talltree

Upload 2 files

fcae4fc verified 11 months ago

raw

history blame

2.24 kB

	import pandas as pd


	def format_docs(docs):
	    """Print the text of each document, numbered and separated by a rule.

	    Args:
	        docs: Iterable of objects exposing a string ``page_content``
	            attribute (e.g. Langchain Documents).
	    """
	    # A 100-dash line, padded by newlines, goes between consecutive documents.
	    divider = "\n" + "-" * 100 + "\n"
	    sections = []
	    for number, doc in enumerate(docs, start=1):
	        sections.append(f"Document {number}:\n\n" + doc.page_content)
	    print(divider.join(sections))


	def clean_and_format_text(text):
	    """Straighten apostrophes and title-case a string, keeping acronyms.

	    Args:
	        text: Value to format. Anything that is not a ``str`` is returned
	            unchanged.

	    Returns:
	        For strings: the whitespace-separated words joined by single spaces,
	        with right single quotation marks (U+2019) replaced by a plain
	        apostrophe. Words longer than one character that are entirely
	        upper-case are kept as-is; every other word is capitalised.
	    """
	    if not isinstance(text, str):
	        return text

	    straightened = text.replace("\u2019", "'")
	    formatted = []
	    for token in straightened.split():
	        # Multi-letter all-caps tokens are treated as acronyms and preserved.
	        is_acronym = len(token) > 1 and token.isupper()
	        formatted.append(token if is_acronym else token.capitalize())
	    return ' '.join(formatted)


	def categorize_location(location):
	    """Map a location to its city, folding known neighbourhoods into Victoria.

	    Args:
	        location: Location name. Non-string values (for example ``None`` or
	            NaN from an empty spreadsheet cell) are returned unchanged.

	    Returns:
	        ``'Victoria'`` when the location contains ``'cordova bay'`` or
	        ``'james bay'`` (case-insensitive substring match); otherwise the
	        input value unchanged.
	    """
	    # Non-text cells have no .lower(); pass them through instead of raising,
	    # mirroring how clean_and_format_text treats non-strings.
	    if not isinstance(location, str):
	        return location

	    lowered = location.lower()  # lower once rather than once per place
	    if any(place in lowered for place in ('cordova bay', 'james bay')):
	        return 'Victoria'
	    return location


	def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
	    """Load the single Excel workbook in a directory into a cleaned DataFrame.

	    Column names are title-cased, text cells (except in the 'Booking Link'
	    column) are normalised with ``clean_and_format_text``, missing values are
	    replaced by 'Information Not Available', and a 'City' column is derived
	    from the 'Location' column.

	    Args:
	        data_directory (str): Path to the directory where the Excel file is
	            located. A ``pathlib.Path`` is accepted as well.

	    Raises:
	        FileNotFoundError: If no Excel files are found in the specified
	            directory.
	        ValueError: If more than one Excel file is found in the directory.
	        KeyError: If the workbook has no 'Location' column after the
	            headers are title-cased.

	    Returns:
	        pd.DataFrame: The cleaned worksheet with the added 'City' column.
	    """
	    # Local import so this function works regardless of the module's imports.
	    from pathlib import Path

	    # Accept plain strings (as the signature promises) as well as Path objects.
	    directory = Path(data_directory)

	    # Find the .xlsx workbook; exactly one is expected. The suffix check is
	    # case-insensitive, and Excel's '~$' lock files (created while a workbook
	    # is open) are ignored so they do not count as a second workbook.
	    excel_files = [
	        file for file in directory.iterdir()
	        if file.suffix.lower() == '.xlsx' and not file.name.startswith('~$')
	    ]

	    if not excel_files:
	        raise FileNotFoundError(
	            "No Excel files found in the specified directory.")
	    if len(excel_files) > 1:
	        raise ValueError(
	            "More than one Excel file found in the specified directory.")

	    path = excel_files[0]

	    # Load Excel file
	    df = pd.read_excel(path, engine='openpyxl')

	    # Change column names to title case
	    df.columns = df.columns.str.title()

	    # Clean text columns. The function is applied element-wise rather than
	    # through `.str.strip()`, because the `.str` accessor turns every
	    # non-string value of a mixed column into NaN. clean_and_format_text
	    # already trims surrounding whitespace and passes non-strings through.
	    for col in df.columns:
	        # Also covers pandas' dedicated string dtypes, which do not compare
	        # equal to 'object'.
	        if col.lower() != 'booking link' and (
	                pd.api.types.is_object_dtype(df[col])
	                or pd.api.types.is_string_dtype(df[col])):
	            df[col] = df[col].apply(clean_and_format_text)

	    # Handle missing values
	    df = df.fillna('Information Not Available')

	    # Add city column. Only text locations are categorised; any other value
	    # is copied across unchanged because it cannot be lower-cased.
	    df['City'] = df['Location'].apply(
	        lambda loc: categorize_location(loc) if isinstance(loc, str) else loc
	    )

	    return df