yrobel-lima committed
Commit
93d3140
1 Parent(s): f3d91b8

Upload 2 files

utils/data_processing.py CHANGED
@@ -8,8 +8,7 @@ def format_docs(docs):
     """
     print(
         f"\n{'-' * 100}\n".join(
-            [f"Document {i+1}:\n\n" +
-             d.page_content for i, d in enumerate(docs)]
+            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
         )
     )
 
@@ -20,16 +19,18 @@ def clean_and_format_text(text):
         text = text.replace("\u2019", "'")
         words = text.split()
         # Title case words, preserving acronyms
-        title_words = [word if word.isupper() and len(word) > 1 else word.capitalize()
-                       for word in words]
-        return ' '.join(title_words)
+        title_words = [
+            word if word.isupper() and len(word) > 1 else word.capitalize()
+            for word in words
+        ]
+        return " ".join(title_words)
     else:
         return text
 
 
 def categorize_location(location):
-    if any(place in location.lower() for place in ['cordova bay', 'james bay']):
-        return 'Victoria'
+    if any(place in location.lower() for place in ["cordova bay", "james bay"]):
+        return "Victoria"
     return location
 
 
@@ -47,33 +48,30 @@ def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
 
     """
     # Get the xls file name (one excel worksheet)
-    excel_files = [file for file in data_directory.iterdir()
-                   if file.suffix == '.xlsx']
+    excel_files = [file for file in data_directory.iterdir() if file.suffix == ".xlsx"]
 
     if not excel_files:
-        raise FileNotFoundError(
-            "No Excel files found in the specified directory.")
+        raise FileNotFoundError("No Excel files found in the specified directory.")
     if len(excel_files) > 1:
-        raise ValueError(
-            "More than one Excel file found in the specified directory.")
+        raise ValueError("More than one Excel file found in the specified directory.")
 
     path = excel_files[0]
 
     # Load Excel file
-    df = pd.read_excel(path, engine='openpyxl')
+    df = pd.read_excel(path, engine="openpyxl")
 
     # Change column names to title case
     df.columns = df.columns.str.title()
 
     # Clean data
     for col in df.columns:
-        if col.lower() != 'booking link' and df[col].dtype == 'object':
+        if col.lower() != "booking link" and df[col].dtype == "object":
             df[col] = df[col].str.strip().apply(clean_and_format_text)
 
     # Handle missing values
-    df.fillna('Information Not Available', inplace=True)
+    df.fillna("Information Not Available", inplace=True)
 
     # Add city column
-    df['City'] = df['Location'].apply(categorize_location)
+    df["City"] = df["Location"].apply(categorize_location)
 
     return df
 
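For reference, a minimal self-contained sketch of the reformatted title-casing helper; the isinstance guard is assumed from the else branch visible in the hunk, and the sample strings are illustrative, not from the dataset:

def clean_and_format_text(text):
    if isinstance(text, str):
        text = text.replace("\u2019", "'")
        words = text.split()
        # Title case words, preserving acronyms
        title_words = [
            word if word.isupper() and len(word) > 1 else word.capitalize()
            for word in words
        ]
        return " ".join(title_words)
    else:
        return text

print(clean_and_format_text("RMT massage therapy"))    # RMT Massage Therapy
print(clean_and_format_text("victoria\u2019s clinic"))  # Victoria's Clinic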
utils/update_vector_database.py CHANGED
@@ -25,26 +25,27 @@ class DataProcessor:
             practitioners_data = []
             for idx, row in df.iterrows():
                 # I am using dot as a separator for text embeddings
-                content = '. '.join(
-                    f"{key}: {value}" for key, value in row.items())
-                doc = Document(page_content=content, metadata={'row': idx})
+                content = ". ".join(f"{key}: {value}" for key, value in row.items())
+                doc = Document(page_content=content, metadata={"row": idx})
                 practitioners_data.append(doc)
             return practitioners_data
         except FileNotFoundError:
             sys.exit(
-                "Directory or Excel file not found. Please check the path and try again.")
+                "Directory or Excel file not found. Please check the path and try again."
+            )
 
     def load_tall_tree_data(self):
         # Check if the file has a .json extension
-        json_files = [file for file in self.data_dir.iterdir()
-                      if file.suffix == '.json']
+        json_files = [
+            file for file in self.data_dir.iterdir() if file.suffix == ".json"
+        ]
 
         if not json_files:
-            raise FileNotFoundError(
-                "No JSON files found in the specified directory.")
+            raise FileNotFoundError("No JSON files found in the specified directory.")
         if len(json_files) > 1:
             raise ValueError(
-                "More than one JSON file found in the specified directory.")
+                "More than one JSON file found in the specified directory."
+            )
 
         path = json_files[0]
         data = self.load_json_file(path)
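To make the dot-separated embedding text concrete, here is an illustrative sketch of what one DataFrame row becomes (column names and values are made up, not from the dataset):

import pandas as pd

row = pd.Series({"Name": "Jane Doe", "Location": "James Bay", "Discipline": "RMT"})
content = ". ".join(f"{key}: {value}" for key, value in row.items())
print(content)  # Name: Jane Doe. Location: James Bay. Discipline: RMT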
@@ -54,7 +55,7 @@ class DataProcessor:
 
     def load_json_file(self, path):
         try:
-            with open(path, 'r') as f:
+            with open(path, "r") as f:
                 data = json.load(f)
             return data
         except json.JSONDecodeError:
@@ -64,93 +65,129 @@ class DataProcessor:
         tall_tree_data = []
         for idx, (key, value) in enumerate(data.items()):
             content = f"{key}: {value}"
-            doc = Document(page_content=content, metadata={'row': idx})
+            doc = Document(page_content=content, metadata={"row": idx})
             tall_tree_data.append(doc)
         return tall_tree_data
 
 
-class DenseVectorStore:
-    """Store dense data in Qdrant vector database."""
+class ValidateQdrantClient:
+    """Base class for retriever clients to ensure environment variables are set."""
 
-    def __init__(self, documents: list[Document], embeddings: OpenAIEmbeddings, collection_name: str = 'practitioners_db'):
+    def __init__(self):
         self.validate_environment_variables()
-        self.qdrant_db = Qdrant.from_documents(
-            documents,
-            embeddings,
-            url=os.getenv("QDRANT_URL"),
-            prefer_grpc=True,
-            api_key=os.getenv(
-                "QDRANT_API_KEY"),
-            collection_name=collection_name,
-            force_recreate=True)
 
     def validate_environment_variables(self):
+        """Check if the Qdrant environment variables are set."""
         required_vars = ["QDRANT_API_KEY", "QDRANT_URL"]
-        for var in required_vars:
-            if not os.getenv(var):
-                raise EnvironmentError(f"Missing environment variable: {var}")
-
-    def get_db(self):
-        return self.qdrant_db
+        missing_vars = [var for var in required_vars if not os.getenv(var)]
+        if missing_vars:
+            raise EnvironmentError(
+                f"Missing environment variable(s): {', '.join(missing_vars)}"
+            )
 
 
-class SparseVectorStore:
+class DenseVectorStore(ValidateQdrantClient):
+    """Store dense data in Qdrant vector database."""
+
+    TEXT_EMBEDDING_MODELS = [
+        "text-embedding-ada-002",
+        "text-embedding-3-small",
+        "text-embedding-3-large",
+    ]
+
+    def __init__(
+        self,
+        documents: list[Document],
+        embeddings_model: str = "text-embedding-3-small",
+        collection_name: str = "practitioners_db",
+    ):
+        super().__init__()
+        if embeddings_model not in self.TEXT_EMBEDDING_MODELS:
+            raise ValueError(
+                f"Invalid embeddings model: {embeddings_model}. Valid options are {', '.join(self.TEXT_EMBEDDING_MODELS)}."
+            )
+        self.documents = documents
+        self.embeddings_model = embeddings_model
+        self.collection_name = collection_name
+        self._qdrant_db = None
+
+    @property
+    def qdrant_db(self):
+        if self._qdrant_db is None:
+            self._qdrant_db = Qdrant.from_documents(
+                self.documents,
+                OpenAIEmbeddings(model=self.embeddings_model),
+                url=os.getenv("QDRANT_URL"),
+                api_key=os.getenv("QDRANT_API_KEY"),
+                prefer_grpc=True,
+                collection_name=self.collection_name,
+                force_recreate=True,
+            )
+        return self._qdrant_db
+
+
+class SparseVectorStore(ValidateQdrantClient):
     """Store sparse vectors in Qdrant vector database using SPLADE neural retrieval model."""
 
-    def __init__(self, documents: list[Document], collection_name: str, vector_name: str, k: int = 4, splade_model_id: str = "naver/splade-cocondenser-ensembledistil"):
-        self.validate_environment_variables()
-        self.client = QdrantClient(url=os.getenv(
-            "QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))  # TODO: prefer_grpc=True is not working
+    def __init__(
+        self,
+        documents: list[Document],
+        collection_name: str,
+        vector_name: str,
+        k: int = 4,
+        splade_model_id: str = "naver/splade-cocondenser-ensembledistil",
+    ):
+
+        # Validate Qdrant client
+        super().__init__()
+        self.client = QdrantClient(
+            url=os.getenv("QDRANT_URL"),
+            api_key=os.getenv("QDRANT_API_KEY"),
+        )  # TODO: prefer_grpc=True is not working
         self.model_id = splade_model_id
-        self.tokenizer, self.model = self.set_tokenizer_config()
+        self._tokenizer = None
+        self._model = None
         self.collection_name = collection_name
         self.vector_name = vector_name
         self.k = k
         self.sparse_retriever = self.create_sparse_retriever()
         self.add_documents(documents)
 
-    def validate_environment_variables(self):
-        required_vars = ["QDRANT_API_KEY", "QDRANT_URL"]
-        for var in required_vars:
-            if not os.getenv(var):
-                raise EnvironmentError(f"Missing environment variable: {var}")
+    @property
+    @cache
+    def tokenizer(self):
+        """Initialize the tokenizer."""
+        if self._tokenizer is None:
+            self._tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        return self._tokenizer
 
+    @property
     @cache
-    def set_tokenizer_config(self):
-        """Initialize the tokenizer and the SPLADE neural retrieval model.
-        See https://huggingface.co/naver/splade-cocondenser-ensembledistil for more details.
-        """
-        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
-        model = AutoModelForMaskedLM.from_pretrained(self.model_id)
-        return tokenizer, model
+    def model(self):
+        """Initialize the SPLADE neural retrieval model."""
+        if self._model is None:
+            self._model = AutoModelForMaskedLM.from_pretrained(self.model_id)
+        return self._model
 
     def sparse_encoder(self, text: str) -> tuple[list[int], list[float]]:
-        """This function encodes the input text into a sparse vector. The sparse_encoder is required for the QdrantSparseVectorRetriever.
-        Adapted from the Qdrant documentation: Computing the Sparse Vector code.
-
-        Args:
-            text (str): Text to encode
-
-        Returns:
-            tuple[list[int], list[float]]: Indices and values of the sparse vector
-        """
-        tokens = self.tokenizer(text, return_tensors="pt",
-                                max_length=512, padding="max_length", truncation=True)
+        """Encode the input text into a sparse vector."""
+        tokens = self.tokenizer(
+            text,
+            return_tensors="pt",
+            max_length=512,
+            padding="max_length",
+            truncation=True,
+        )
 
         with torch.no_grad():
-            output = self.model(**tokens)
-
-        logits, attention_mask = output.logits, tokens.attention_mask
+            logits = self.model(**tokens).logits
 
         relu_log = torch.log1p(torch.relu(logits))
-        weighted_log = relu_log * attention_mask.unsqueeze(-1)
-
-        max_val, _ = torch.max(weighted_log, dim=1)
-        vec = max_val.squeeze()
-
-        indices = torch.nonzero(vec, as_tuple=False).squeeze().numpy()
-        values = vec[indices].numpy()
-
+        weighted_log = relu_log * tokens.attention_mask.unsqueeze(-1)
+
+        max_val = torch.max(weighted_log, dim=1).values.squeeze()
+        indices = torch.nonzero(max_val, as_tuple=False).squeeze().cpu().numpy()
+        values = max_val[indices].cpu().numpy()
         return indices.tolist(), values.tolist()
 
     def create_sparse_retriever(self):
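The sparse_encoder refactored in the hunk above implements SPLADE's log-saturated, max-pooled term weighting. Here is a self-contained sketch of the same computation outside the class, assuming the commit's default model id (the query string is illustrative):

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_id = "naver/splade-cocondenser-ensembledistil"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

tokens = tokenizer("sports injury massage", return_tensors="pt",
                   max_length=512, padding="max_length", truncation=True)
with torch.no_grad():
    logits = model(**tokens).logits  # shape: (1, seq_len, vocab_size)

# log(1 + relu) saturates large logits; the attention mask zeroes padding
# positions; max over the sequence keeps each vocab term's strongest weight.
weights = torch.log1p(torch.relu(logits)) * tokens.attention_mask.unsqueeze(-1)
vec = torch.max(weights, dim=1).values.squeeze()  # (vocab_size,)
indices = torch.nonzero(vec, as_tuple=False).squeeze()
print(indices.numel())  # number of non-zero vocabulary terms in the sparse vector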
@@ -193,18 +230,19 @@ def main():
 
     # Set OpenAI embeddings model
     # TODO: Test new OpenAI text embeddings models
-    embeddings_model = "text-embedding-ada-002"
-    openai_embeddings = OpenAIEmbeddings(model=embeddings_model)
+    # text-embedding-3-large
+    # text-embedding-3-small
+    EMBEDDINGS_MODEL = "text-embedding-3-small"
 
     # Store both datasets in Qdrant
-    print(f"Storing dense vectors in Qdrant using {embeddings_model}...")
-    practitioners_db = DenseVectorStore(practitioners_dataset,
-                                        openai_embeddings,
-                                        collection_name="practitioners_db").get_db()
-
-    tall_tree_db = DenseVectorStore(tall_tree_dataset,
-                                    openai_embeddings,
-                                    collection_name="tall_tree_db").get_db()
+    print(f"Storing dense vectors in Qdrant using {EMBEDDINGS_MODEL}...")
+    practitioners_db = DenseVectorStore(
+        practitioners_dataset, EMBEDDINGS_MODEL, collection_name="practitioners_db"
+    ).qdrant_db
+
+    tall_tree_db = DenseVectorStore(
+        tall_tree_dataset, EMBEDDINGS_MODEL, collection_name="tall_tree_db"
+    ).qdrant_db
 
     print(f"Storing sparse vectors in Qdrant using SPLADE neural retrieval model...")
     practitioners_sparse_vector_db = SparseVectorStore(
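With this change, DenseVectorStore defers the expensive Qdrant upload until qdrant_db is first accessed. A minimal sketch of that lazy-initialization pattern in isolation (LazyExample and the object() stand-in are illustrative, not part of the commit):

class LazyExample:
    def __init__(self):
        self._resource = None  # nothing expensive happens at construction

    @property
    def resource(self):
        # Built on first access, then reused; stands in for Qdrant.from_documents(...)
        if self._resource is None:
            self._resource = object()
        return self._resource

store = LazyExample()
assert store.resource is store.resource  # same object on every access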
 