yrobel-lima committed
Commit
93d3140
1 Parent(s): f3d91b8

Upload 2 files

utils/data_processing.py CHANGED
@@ -8,8 +8,7 @@ def format_docs(docs):
     """
     print(
         f"\n{'-' * 100}\n".join(
-            [f"Document {i+1}:\n\n" +
-             d.page_content for i, d in enumerate(docs)]
+            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
         )
     )
 
@@ -20,16 +19,18 @@ def clean_and_format_text(text):
         text = text.replace("\u2019", "'")
         words = text.split()
         # Title case words, preserving acronyms
-        title_words = [word if word.isupper() and len(word) > 1 else word.capitalize()
-                       for word in words]
-        return ' '.join(title_words)
+        title_words = [
+            word if word.isupper() and len(word) > 1 else word.capitalize()
+            for word in words
+        ]
+        return " ".join(title_words)
     else:
         return text
 
 
 def categorize_location(location):
-    if any(place in location.lower() for place in ['cordova bay', 'james bay']):
-        return 'Victoria'
+    if any(place in location.lower() for place in ["cordova bay", "james bay"]):
+        return "Victoria"
     return location
 
 
@@ -47,33 +48,30 @@ def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
 
     """
     # Get the xls file name (one excel worksheet)
-    excel_files = [file for file in data_directory.iterdir()
-                   if file.suffix == '.xlsx']
+    excel_files = [file for file in data_directory.iterdir() if file.suffix == ".xlsx"]
 
     if not excel_files:
-        raise FileNotFoundError(
-            "No Excel files found in the specified directory.")
+        raise FileNotFoundError("No Excel files found in the specified directory.")
     if len(excel_files) > 1:
-        raise ValueError(
-            "More than one Excel file found in the specified directory.")
+        raise ValueError("More than one Excel file found in the specified directory.")
 
     path = excel_files[0]
 
     # Load Excel file
-    df = pd.read_excel(path, engine='openpyxl')
+    df = pd.read_excel(path, engine="openpyxl")
 
     # Change column names to title case
     df.columns = df.columns.str.title()
 
     # Clean data
     for col in df.columns:
-        if col.lower() != 'booking link' and df[col].dtype == 'object':
+        if col.lower() != "booking link" and df[col].dtype == "object":
             df[col] = df[col].str.strip().apply(clean_and_format_text)
 
     # Handle missing values
-    df.fillna('Information Not Available', inplace=True)
+    df.fillna("Information Not Available", inplace=True)
 
     # Add city column
-    df['City'] = df['Location'].apply(categorize_location)
+    df["City"] = df["Location"].apply(categorize_location)
 
     return df
 
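For reference, a minimal self-contained sketch of the reformatted title-casing helper; the isinstance guard is assumed from the else branch visible in the hunk, and the sample strings are illustrative, not from the dataset:

def clean_and_format_text(text):
    if isinstance(text, str):
        text = text.replace("\u2019", "'")
        words = text.split()
        # Title case words, preserving acronyms
        title_words = [
            word if word.isupper() and len(word) > 1 else word.capitalize()
            for word in words
        ]
        return " ".join(title_words)
    else:
        return text

print(clean_and_format_text("RMT massage therapy"))    # RMT Massage Therapy
print(clean_and_format_text("victoria\u2019s clinic"))  # Victoria's Clinic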
utils/update_vector_database.py CHANGED
@@ -25,26 +25,27 @@ class DataProcessor:
             practitioners_data = []
             for idx, row in df.iterrows():
                 # I am using dot as a separator for text embeddings
-                content = '. '.join(
-                    f"{key}: {value}" for key, value in row.items())
-                doc = Document(page_content=content, metadata={'row': idx})
+                content = ". ".join(f"{key}: {value}" for key, value in row.items())
+                doc = Document(page_content=content, metadata={"row": idx})
                 practitioners_data.append(doc)
             return practitioners_data
         except FileNotFoundError:
             sys.exit(
-                "Directory or Excel file not found. Please check the path and try again.")
+                "Directory or Excel file not found. Please check the path and try again."
+            )
 
     def load_tall_tree_data(self):
         # Check if the file has a .json extension
-        json_files = [file for file in self.data_dir.iterdir()
-                      if file.suffix == '.json']
+        json_files = [
+            file for file in self.data_dir.iterdir() if file.suffix == ".json"
+        ]
 
         if not json_files:
-            raise FileNotFoundError(
-                "No JSON files found in the specified directory.")
+            raise FileNotFoundError("No JSON files found in the specified directory.")
         if len(json_files) > 1:
             raise ValueError(
-                "More than one JSON file found in the specified directory.")
+                "More than one JSON file found in the specified directory."
+            )
 
         path = json_files[0]
         data = self.load_json_file(path)
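To make the dot-separated embedding text concrete, here is an illustrative sketch of what one DataFrame row becomes (column names and values are made up, not from the dataset):

import pandas as pd

row = pd.Series({"Name": "Jane Doe", "Location": "James Bay", "Discipline": "RMT"})
content = ". ".join(f"{key}: {value}" for key, value in row.items())
print(content)  # Name: Jane Doe. Location: James Bay. Discipline: RMT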
@@ -54,7 +55,7 @@ class DataProcessor:
 
     def load_json_file(self, path):
         try:
-            with open(path, 'r') as f:
+            with open(path, "r") as f:
                 data = json.load(f)
             return data
         except json.JSONDecodeError:
@@ -64,93 +65,129 @@ class DataProcessor:
         tall_tree_data = []
         for idx, (key, value) in enumerate(data.items()):
             content = f"{key}: {value}"
-            doc = Document(page_content=content, metadata={'row': idx})
+            doc = Document(page_content=content, metadata={"row": idx})
             tall_tree_data.append(doc)
         return tall_tree_data
 
 
-class DenseVectorStore:
-    """Store dense data in Qdrant vector database."""
+class ValidateQdrantClient:
+    """Base class for retriever clients to ensure environment variables are set."""
 
-    def __init__(self, documents: list[Document], embeddings: OpenAIEmbeddings, collection_name: str = 'practitioners_db'):
+    def __init__(self):
         self.validate_environment_variables()
-        self.qdrant_db = Qdrant.from_documents(
-            documents,
-            embeddings,
-            url=os.getenv("QDRANT_URL"),
-            prefer_grpc=True,
-            api_key=os.getenv(
-                "QDRANT_API_KEY"),
-            collection_name=collection_name,
-            force_recreate=True)
 
     def validate_environment_variables(self):
+        """Check if the Qdrant environment variables are set."""
         required_vars = ["QDRANT_API_KEY", "QDRANT_URL"]
-        for var in required_vars:
-            if not os.getenv(var):
-                raise EnvironmentError(f"Missing environment variable: {var}")
-
-    def get_db(self):
-        return self.qdrant_db
+        missing_vars = [var for var in required_vars if not os.getenv(var)]
+        if missing_vars:
+            raise EnvironmentError(
+                f"Missing environment variable(s): {', '.join(missing_vars)}"
+            )
 
 
-class SparseVectorStore:
+class DenseVectorStore(ValidateQdrantClient):
+    """Store dense data in Qdrant vector database."""
+
+    TEXT_EMBEDDING_MODELS = [
+        "text-embedding-ada-002",
+        "text-embedding-3-small",
+        "text-embedding-3-large",
+    ]
+
+    def __init__(
+        self,
+        documents: list[Document],
+        embeddings_model: str = "text-embedding-3-small",
+        collection_name: str = "practitioners_db",
+    ):
+        super().__init__()
+        if embeddings_model not in self.TEXT_EMBEDDING_MODELS:
+            raise ValueError(
+                f"Invalid embeddings model: {embeddings_model}. Valid options are {', '.join(self.TEXT_EMBEDDING_MODELS)}."
+            )
+        self.documents = documents
+        self.embeddings_model = embeddings_model
+        self.collection_name = collection_name
+        self._qdrant_db = None
+
+    @property
+    def qdrant_db(self):
+        if self._qdrant_db is None:
+            self._qdrant_db = Qdrant.from_documents(
+                self.documents,
+                OpenAIEmbeddings(model=self.embeddings_model),
+                url=os.getenv("QDRANT_URL"),
+                api_key=os.getenv("QDRANT_API_KEY"),
+                prefer_grpc=True,
+                collection_name=self.collection_name,
+                force_recreate=True,
+            )
+        return self._qdrant_db
+
+
+class SparseVectorStore(ValidateQdrantClient):
     """Store sparse vectors in Qdrant vector database using SPLADE neural retrieval model."""
 
-    def __init__(self, documents: list[Document], collection_name: str, vector_name: str, k: int = 4, splade_model_id: str = "naver/splade-cocondenser-ensembledistil"):
-        self.validate_environment_variables()
-        self.client = QdrantClient(url=os.getenv(
-            "QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))  # TODO: prefer_grpc=True is not working
+    def __init__(
+        self,
+        documents: list[Document],
+        collection_name: str,
+        vector_name: str,
+        k: int = 4,
+        splade_model_id: str = "naver/splade-cocondenser-ensembledistil",
+    ):
+
+        # Validate Qdrant client
+        super().__init__()
+        self.client = QdrantClient(
+            url=os.getenv("QDRANT_URL"),
+            api_key=os.getenv("QDRANT_API_KEY"),
+        )  # TODO: prefer_grpc=True is not working
         self.model_id = splade_model_id
-        self.tokenizer, self.model = self.set_tokenizer_config()
+        self._tokenizer = None
+        self._model = None
         self.collection_name = collection_name
         self.vector_name = vector_name
         self.k = k
         self.sparse_retriever = self.create_sparse_retriever()
         self.add_documents(documents)
 
-    def validate_environment_variables(self):
-        required_vars = ["QDRANT_API_KEY", "QDRANT_URL"]
-        for var in required_vars:
-            if not os.getenv(var):
-                raise EnvironmentError(f"Missing environment variable: {var}")
+    @property
+    @cache
+    def tokenizer(self):
+        """Initialize the tokenizer."""
+        if self._tokenizer is None:
+            self._tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        return self._tokenizer
 
+    @property
     @cache
-    def set_tokenizer_config(self):
-        """Initialize the tokenizer and the SPLADE neural retrieval model.
-        See https://huggingface.co/naver/splade-cocondenser-ensembledistil for more details.
-        """
-        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
-        model = AutoModelForMaskedLM.from_pretrained(self.model_id)
-        return tokenizer, model
+    def model(self):
+        """Initialize the SPLADE neural retrieval model."""
+        if self._model is None:
+            self._model = AutoModelForMaskedLM.from_pretrained(self.model_id)
+        return self._model
 
     def sparse_encoder(self, text: str) -> tuple[list[int], list[float]]:
-        """This function encodes the input text into a sparse vector. The sparse_encoder is required for the QdrantSparseVectorRetriever.
-        Adapted from the Qdrant documentation: Computing the Sparse Vector code.
-
-        Args:
-            text (str): Text to encode
-
-        Returns:
-            tuple[list[int], list[float]]: Indices and values of the sparse vector
-        """
-        tokens = self.tokenizer(text, return_tensors="pt",
-                                max_length=512, padding="max_length", truncation=True)
+        """Encode the input text into a sparse vector."""
+        tokens = self.tokenizer(
+            text,
+            return_tensors="pt",
+            max_length=512,
+            padding="max_length",
+            truncation=True,
+        )
 
         with torch.no_grad():
-            output = self.model(**tokens)
-
-        logits, attention_mask = output.logits, tokens.attention_mask
+            logits = self.model(**tokens).logits
 
         relu_log = torch.log1p(torch.relu(logits))
-        weighted_log = relu_log * attention_mask.unsqueeze(-1)
-
-        max_val, _ = torch.max(weighted_log, dim=1)
-        vec = max_val.squeeze()
-
-        indices = torch.nonzero(vec, as_tuple=False).squeeze().numpy()
-        values = vec[indices].numpy()
-
+        weighted_log = relu_log * tokens.attention_mask.unsqueeze(-1)
+
+        max_val = torch.max(weighted_log, dim=1).values.squeeze()
+        indices = torch.nonzero(max_val, as_tuple=False).squeeze().cpu().numpy()
+        values = max_val[indices].cpu().numpy()
         return indices.tolist(), values.tolist()
 
     def create_sparse_retriever(self):
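The sparse_encoder refactored in the hunk above implements SPLADE's log-saturated, max-pooled term weighting. Here is a self-contained sketch of the same computation outside the class, assuming the commit's default model id (the query string is illustrative):

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_id = "naver/splade-cocondenser-ensembledistil"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

tokens = tokenizer("sports injury massage", return_tensors="pt",
                   max_length=512, padding="max_length", truncation=True)
with torch.no_grad():
    logits = model(**tokens).logits  # shape: (1, seq_len, vocab_size)

# log(1 + relu) saturates large logits; the attention mask zeroes padding
# positions; max over the sequence keeps each vocab term's strongest weight.
weights = torch.log1p(torch.relu(logits)) * tokens.attention_mask.unsqueeze(-1)
vec = torch.max(weights, dim=1).values.squeeze()  # (vocab_size,)
indices = torch.nonzero(vec, as_tuple=False).squeeze()
print(indices.numel())  # number of non-zero vocabulary terms in the sparse vector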
@@ -193,18 +230,19 @@ def main():
 
     # Set OpenAI embeddings model
     # TODO: Test new OpenAI text embeddings models
-    embeddings_model = "text-embedding-ada-002"
-    openai_embeddings = OpenAIEmbeddings(model=embeddings_model)
+    # text-embedding-3-large
+    # text-embedding-3-small
+    EMBEDDINGS_MODEL = "text-embedding-3-small"
 
     # Store both datasets in Qdrant
-    print(f"Storing dense vectors in Qdrant using {embeddings_model}...")
-    practitioners_db = DenseVectorStore(practitioners_dataset,
-                                        openai_embeddings,
-                                        collection_name="practitioners_db").get_db()
-
-    tall_tree_db = DenseVectorStore(tall_tree_dataset,
-                                    openai_embeddings,
-                                    collection_name="tall_tree_db").get_db()
+    print(f"Storing dense vectors in Qdrant using {EMBEDDINGS_MODEL}...")
+    practitioners_db = DenseVectorStore(
+        practitioners_dataset, EMBEDDINGS_MODEL, collection_name="practitioners_db"
+    ).qdrant_db
+
+    tall_tree_db = DenseVectorStore(
+        tall_tree_dataset, EMBEDDINGS_MODEL, collection_name="tall_tree_db"
+    ).qdrant_db
 
     print(f"Storing sparse vectors in Qdrant using SPLADE neural retrieval model...")
     practitioners_sparse_vector_db = SparseVectorStore(
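With this change, DenseVectorStore defers the expensive Qdrant upload until qdrant_db is first accessed. A minimal sketch of that lazy-initialization pattern in isolation (LazyExample and the object() stand-in are illustrative, not part of the commit):

class LazyExample:
    def __init__(self):
        self._resource = None  # nothing expensive happens at construction

    @property
    def resource(self):
        # Built on first access, then reused; stands in for Qdrant.from_documents(...)
        if self._resource is None:
            self._resource = object()
        return self._resource

store = LazyExample()
assert store.resource is store.resource  # same object on every access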
 