Spaces:

tall-tree
/

ai-virtual-assistant

Running on CPU Upgrade

talltree committed on Mar 15, 2024

Commit

fcae4fc

verified ·

1 Parent(s): 9593cdf

Upload 2 files

Files changed (2) hide show

utils/data_processing.py CHANGED Viewed

@@ -14,6 +14,25 @@ def format_docs(docs):
     )
 def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
     """Load an Excel file, clean its contents, and generate a pd.Dataframe.
@@ -46,21 +65,15 @@ def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
     # Change column names to title case
     df.columns = df.columns.str.title()
-    # Function to replace curly apostrophes with straight ones
-    def replace_apostrophes(text):
-        if isinstance(text, str):
-            return text.replace("\u2019", "'")
-        return text
     # Clean data
-    # Trim strings, standardize text (convert to title case), and replace apostrophes
     for col in df.columns:
-        # If the column is text-based
         if col.lower() != 'booking link' and df[col].dtype == 'object':
-            # Trim, standardize case, and replace apostrophes
-            df[col] = df[col].str.strip().str.title().apply(replace_apostrophes)
     # Handle missing values
     df.fillna('Information Not Available', inplace=True)
     return df

     )
+def clean_and_format_text(text):
+    """Normalize apostrophes and capitalize each word of a string.
+
+    Non-string values are returned unchanged. For strings, curly
+    apostrophes (U+2019) are replaced with straight ones, the text is
+    split on whitespace, and the words are re-joined with single spaces
+    (so leading/trailing whitespace and repeated spaces are collapsed).
+    Words that are entirely uppercase and longer than one character are
+    kept as-is; every other word is passed through ``str.capitalize``.
+    """
+    if isinstance(text, str):
+        # Replace curly apostrophes with straight ones
+        text = text.replace("\u2019", "'")
+        # split() with no argument splits on any run of whitespace
+        words = text.split()
+        # Title case words, preserving acronyms
+        # (capitalize() upper-cases the first character and lower-cases the
+        # rest, so mixed-case words lose their inner capitals)
+        title_words = [word if word.isupper() and len(word) > 1 else word.capitalize()
+                       for word in words]
+        return ' '.join(title_words)
+    else:
+        # Non-string values are passed through untouched
+        return text
+def categorize_location(location):
+    """Return 'Victoria' for locations mentioning Cordova Bay or James Bay.
+
+    The check is a case-insensitive substring match against
+    'cordova bay' and 'james bay'. Any other location is returned
+    unchanged.
+
+    NOTE(review): this calls ``location.lower()``, so a non-string value
+    would raise AttributeError -- confirm callers always pass strings.
+    """
+    # Substring test on the lower-cased value, so longer location strings
+    # that contain either place name also match
+    if any(place in location.lower() for place in ['cordova bay', 'james bay']):
+        return 'Victoria'
+    return location
 def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
     """Load an Excel file, clean its contents, and generate a pd.Dataframe.
     # Change column names to title case
     df.columns = df.columns.str.title()
     # Clean data
     for col in df.columns:
         if col.lower() != 'booking link' and df[col].dtype == 'object':
+            df[col] = df[col].str.strip().apply(clean_and_format_text)
     # Handle missing values
     df.fillna('Information Not Available', inplace=True)
+    # Add city column
+    df['City'] = df['Location'].apply(categorize_location)
     return df

utils/update_vector_database.py CHANGED Viewed

@@ -19,16 +19,9 @@ class DataProcessor:
     def __init__(self, data_dir: Path):
         self.data_dir = data_dir
-    @staticmethod
-    def categorize_location(location):
-        if any(place in location.lower() for place in ['cordova bay', 'james bay']):
-            return 'Victoria'
-        return location
     def load_practitioners_data(self):
         try:
             df = excel_to_dataframe(self.data_dir)
-            df['City'] = df['Location'].apply(self.categorize_location)
             practitioners_data = []
             for idx, row in df.iterrows():
                 # I am using dot as a separator for text embeddings
@@ -195,7 +188,7 @@ def main():
     tall_tree_dataset = processor.load_tall_tree_data()
     # Set OpenAI embeddings model
-    # TODO: Test new embeddings model text-embedding-3-small
     embeddings_model = "text-embedding-ada-002"
     openai_embeddings = OpenAIEmbeddings(model=embeddings_model)

     def __init__(self, data_dir: Path):
         self.data_dir = data_dir
     def load_practitioners_data(self):
         try:
             df = excel_to_dataframe(self.data_dir)
             practitioners_data = []
             for idx, row in df.iterrows():
                 # I am using dot as a separator for text embeddings
     tall_tree_dataset = processor.load_tall_tree_data()
     # Set OpenAI embeddings model
+    # TODO: Test new OpenAI text embeddings models
     embeddings_model = "text-embedding-ada-002"
     openai_embeddings = OpenAIEmbeddings(model=embeddings_model)