talltree committed on
Commit
fcae4fc
1 Parent(s): 9593cdf

Upload 2 files

Browse files
utils/data_processing.py CHANGED
@@ -14,6 +14,25 @@ def format_docs(docs):
14
  )
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
18
  """Load an Excel file, clean its contents, and generate a pd.Dataframe.
19
 
@@ -46,21 +65,15 @@ def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
46
  # Change column names to title case
47
  df.columns = df.columns.str.title()
48
 
49
- # Function to replace curly apostrophes with straight ones
50
- def replace_apostrophes(text):
51
- if isinstance(text, str):
52
- return text.replace("\u2019", "'")
53
- return text
54
-
55
  # Clean data
56
- # Trim strings, standardize text (convert to title case), and replace apostrophes
57
  for col in df.columns:
58
- # If the column is text-based
59
  if col.lower() != 'booking link' and df[col].dtype == 'object':
60
- # Trim, standardize case, and replace apostrophes
61
- df[col] = df[col].str.strip().str.title().apply(replace_apostrophes)
62
 
63
  # Handle missing values
64
  df.fillna('Information Not Available', inplace=True)
65
 
 
 
 
66
  return df
 
14
  )
15
 
16
 
17
def clean_and_format_text(text):
    """Normalize apostrophes and title-case a text value, preserving acronyms.

    Non-string values are returned unchanged so the function can be applied
    to columns that contain missing or non-text cells.

    For strings:
      * the right curly apostrophe (U+2019) is replaced with a straight one;
      * the text is split on whitespace (runs of whitespace collapse to a
        single space in the result);
      * each word is capitalized, except that all-uppercase pieces longer
        than one character are treated as acronyms and kept as they are.

    Hyphenated words are formatted piece by piece, so an acronym inside a
    hyphenated token survives ("ICBC-approved" -> "ICBC-Approved") and
    hyphenated names keep both capitals ("jean-luc" -> "Jean-Luc").

    Args:
        text: The value to clean; any type is accepted.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    def _format_part(part):
        # All-caps pieces longer than one character are assumed to be
        # acronyms and must not be lowercased by capitalize().
        if part.isupper() and len(part) > 1:
            return part
        return part.capitalize()

    # Replace curly apostrophes with straight ones
    text = text.replace("\u2019", "'")

    # Format each hyphen-separated piece independently; checking the whole
    # token with isupper() would lowercase acronyms in mixed-case compounds.
    return ' '.join(
        '-'.join(_format_part(part) for part in word.split('-'))
        for word in text.split()
    )
28
+
29
+
30
def categorize_location(location):
    """Map a location value to its city.

    Locations whose text contains 'cordova bay' or 'james bay'
    (case-insensitive) are reported as 'Victoria'. Every other value is
    returned unchanged.

    Non-string values (e.g. None or NaN from an empty cell) are returned
    as-is instead of raising AttributeError on ``.lower()``.

    Args:
        location: The location value to categorize.

    Returns:
        'Victoria' for the listed neighbourhoods, otherwise ``location``.
    """
    if not isinstance(location, str):
        return location

    # Lowercase once rather than once per candidate place.
    normalized = location.lower()
    if any(place in normalized for place in ('cordova bay', 'james bay')):
        return 'Victoria'
    return location
34
+
35
+
36
  def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
37
  """Load an Excel file, clean its contents, and generate a pd.Dataframe.
38
 
 
65
  # Change column names to title case
66
  df.columns = df.columns.str.title()
67
 
 
 
 
 
 
 
68
  # Clean data
 
69
  for col in df.columns:
 
70
  if col.lower() != 'booking link' and df[col].dtype == 'object':
71
+ df[col] = df[col].str.strip().apply(clean_and_format_text)
 
72
 
73
  # Handle missing values
74
  df.fillna('Information Not Available', inplace=True)
75
 
76
+ # Add city column
77
+ df['City'] = df['Location'].apply(categorize_location)
78
+
79
  return df
utils/update_vector_database.py CHANGED
@@ -19,16 +19,9 @@ class DataProcessor:
19
  def __init__(self, data_dir: Path):
20
  self.data_dir = data_dir
21
 
22
- @staticmethod
23
- def categorize_location(location):
24
- if any(place in location.lower() for place in ['cordova bay', 'james bay']):
25
- return 'Victoria'
26
- return location
27
-
28
  def load_practitioners_data(self):
29
  try:
30
  df = excel_to_dataframe(self.data_dir)
31
- df['City'] = df['Location'].apply(self.categorize_location)
32
  practitioners_data = []
33
  for idx, row in df.iterrows():
34
  # I am using dot as a separator for text embeddings
@@ -195,7 +188,7 @@ def main():
195
  tall_tree_dataset = processor.load_tall_tree_data()
196
 
197
  # Set OpenAI embeddings model
198
- # TODO: Test new embeddings model text-embedding-3-small
199
  embeddings_model = "text-embedding-ada-002"
200
  openai_embeddings = OpenAIEmbeddings(model=embeddings_model)
201
 
 
19
  def __init__(self, data_dir: Path):
20
  self.data_dir = data_dir
21
 
 
 
 
 
 
 
22
  def load_practitioners_data(self):
23
  try:
24
  df = excel_to_dataframe(self.data_dir)
 
25
  practitioners_data = []
26
  for idx, row in df.iterrows():
27
  # I am using dot as a separator for text embeddings
 
188
  tall_tree_dataset = processor.load_tall_tree_data()
189
 
190
  # Set OpenAI embeddings model
191
+ # TODO: Test new OpenAI text embeddings models
192
  embeddings_model = "text-embedding-ada-002"
193
  openai_embeddings = OpenAIEmbeddings(model=embeddings_model)
194