llm

Sleeping

App Files Files Community

Chris4K commited on Jan 13

Commit

d74e0f6

verified ·

1 Parent(s): bf3527f

Update services/data_service.py

Browse files

Files changed (1) hide show

services/data_service.py +114 -70

services/data_service.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # services/data_service.py
-from typing import List, Dict, Any, Optional
 import pandas as pd
 import faiss
 import numpy as np
@@ -8,6 +8,7 @@ from datetime import datetime
 import logging
 from config.config import settings
 from functools import lru_cache
 logger = logging.getLogger(__name__)
@@ -20,85 +21,128 @@ class DataService:
         self.data_cleaned = None
     async def fetch_csv_data(self) -> pd.DataFrame:
         async with aiohttp.ClientSession() as session:
             for attempt in range(settings.MAX_RETRIES):
                 try:
                     async with session.get(settings.CSV_URL) as response:
                         if response.status == 200:
                             content = await response.text()
-                            return pd.read_csv(pd.StringIO(content), sep='|')
                 except Exception as e:
-                    logger.error(f"Attempt {attempt + 1} failed: {e}")
                     if attempt == settings.MAX_RETRIES - 1:
                         raise
-    async def prepare_data_and_index(self) -> tuple:
-        current_time = datetime.now()
-        # Check cache validity
-        if (self.last_update and
-            (current_time - self.last_update).seconds < settings.CACHE_DURATION and
-            self.cache):
-            return self.cache['data'], self.cache['index']
-        data = await self.fetch_csv_data()
-        # Data cleaning and preparation
-        columns_to_keep = [
-            'ID', 'Name', 'Description', 'Price',
-            'ProductCategory', 'Grammage',
-            'BasePriceText', 'Rating', 'RatingCount',
-            'Ingredients', 'CreationDate', 'Keywords', 'Brand'
-        ]
-        self.data_cleaned = data[columns_to_keep].copy()
-        self.data_cleaned['Description'] = self.data_cleaned['Description'].str.replace(
-            r'[^\w\s.,;:\'/?!€$%&()\[\]{}<>|=+\\-]', ' ', regex=True
-        )
-        # Improved text combination with weights
-        self.data_cleaned['combined_text'] = self.data_cleaned.apply(
-            lambda row: (
-                f"{row['Name']} {row['Name']} "  # Double weight for name
-                f"{row['Description']} "
-                f"{row['Keywords'] if pd.notnull(row['Keywords']) else ''} "
-                f"{row['ProductCategory'] if pd.notnull(row['ProductCategory']) else ''}"
-            ).strip(),
-            axis=1
-        )
-        # Create FAISS index
-        embeddings = self.embedder.encode(
-            self.data_cleaned['combined_text'].tolist(),
-            convert_to_tensor=True,
-            show_progress_bar=True
-        ).cpu().detach().numpy()
-        d = embeddings.shape[1]
-        self.faiss_index = faiss.IndexFlatL2(d)
-        self.faiss_index.add(embeddings)
-        # Update cache
-        self.cache = {
-            'data': self.data_cleaned,
-            'index': self.faiss_index
-        }
-        self.last_update = current_time
-        return self.data_cleaned, self.faiss_index
-    async def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
-        if not self.faiss_index:
-            await self.prepare_data_and_index()
-        query_embedding = self.embedder.encode([query], convert_to_tensor=True).cpu().detach().numpy()
-        distances, indices = self.faiss_index.search(query_embedding, top_k)
-        results = []
-        for i, idx in enumerate(indices[0]):
-            product = self.data_cleaned.iloc[idx].to_dict()
-            product['score'] = float(distances[0][i])
-            results.append(product)
-        return results

 # services/data_service.py
+from typing import List, Dict, Any, Optional, Tuple
 import pandas as pd
 import faiss
 import numpy as np
 import logging
 from config.config import settings
 from functools import lru_cache
+from io import StringIO  # Add explicit StringIO import
 logger = logging.getLogger(__name__)
         self.data_cleaned = None
     async def fetch_csv_data(self) -> pd.DataFrame:
+        """Fetch CSV data from URL with retry logic"""
         async with aiohttp.ClientSession() as session:
             for attempt in range(settings.MAX_RETRIES):
                 try:
                     async with session.get(settings.CSV_URL) as response:
                         if response.status == 200:
                             content = await response.text()
+                            return pd.read_csv(StringIO(content), sep='|')
+                        else:
+                            logger.error(f"Failed to fetch data: HTTP {response.status}")
                 except Exception as e:
+                    logger.error(f"Attempt {attempt + 1} failed: {e}", exc_info=True)
                     if attempt == settings.MAX_RETRIES - 1:
                         raise
+            return pd.DataFrame()  # Return empty DataFrame if all attempts fail
+    async def prepare_data_and_index(self) -> Tuple[pd.DataFrame, Any]:
+        """Prepare data and create FAISS index with caching"""
+        try:
+            current_time = datetime.now()
+            # Check cache validity
+            if (self.last_update and
+                (current_time - self.last_update).seconds < settings.CACHE_DURATION and
+                self.cache):
+                return self.cache['data'], self.cache['index']
+            data = await self.fetch_csv_data()
+            if data.empty:
+                logger.error("Failed to fetch data")
+                return pd.DataFrame(), None
+            # Data cleaning and preparation
+            columns_to_keep = [
+                'ID', 'Name', 'Description', 'Price',
+                'ProductCategory', 'Grammage',
+                'BasePriceText', 'Rating', 'RatingCount',
+                'Ingredients', 'CreationDate', 'Keywords', 'Brand'
+            ]
+            self.data_cleaned = data[columns_to_keep].copy()
+            # Clean description text
+            self.data_cleaned['Description'] = self.data_cleaned['Description'].astype(str).str.replace(
+                r'[^\w\s.,;:\'/?!€$%&()\[\]{}<>|=+\\-]', ' ', regex=True
+            )
+            # Combine text fields with weights
+            self.data_cleaned['combined_text'] = self.data_cleaned.apply(
+                lambda row: (
+                    f"{row['Name']} {row['Name']} "  # Double weight for name
+                    f"{str(row['Description'])} "
+                    f"{str(row['Keywords']) if pd.notnull(row['Keywords']) else ''} "
+                    f"{str(row['ProductCategory']) if pd.notnull(row['ProductCategory']) else ''}"
+                ).strip(),
+                axis=1
+            )
+            # Create FAISS index
+            embeddings = self.embedder.encode(
+                self.data_cleaned['combined_text'].tolist(),
+                convert_to_tensor=True,
+                show_progress_bar=True
+            ).cpu().detach().numpy()
+            d = embeddings.shape[1]
+            self.faiss_index = faiss.IndexFlatL2(d)
+            self.faiss_index.add(embeddings)
+            # Update cache
+            self.cache = {
+                'data': self.data_cleaned,
+                'index': self.faiss_index
+            }
+            self.last_update = current_time
+            return self.data_cleaned, self.faiss_index
+        except Exception as e:
+            logger.error(f"Error in prepare_data_and_index: {e}", exc_info=True)
+            return pd.DataFrame(), None
+    async def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
+        """Search for products similar to the query"""
+        try:
+            if not self.faiss_index:
+                self.data_cleaned, self.faiss_index = await self.prepare_data_and_index()
+                if self.faiss_index is None:
+                    return []
+            # Create query embedding
+            query_embedding = self.embedder.encode([query], convert_to_tensor=True)
+            query_embedding_np = query_embedding.cpu().detach().numpy()
+            # Search in FAISS index
+            distances, indices = self.faiss_index.search(query_embedding_np, top_k)
+            # Prepare results
+            results = []
+            for i, idx in enumerate(indices[0]):
+                try:
+                    product = {}
+                    row = self.data_cleaned.iloc[idx]
+                    for column in self.data_cleaned.columns:
+                        value = row[column]
+                        # Convert numpy/pandas types to Python native types
+                        if isinstance(value, (np.integer, np.floating)):
+                            value = value.item()
+                        elif isinstance(value, pd.Timestamp):
+                            value = value.isoformat()
+                        elif isinstance(value, np.bool_):
+                            value = bool(value)
+                        product[column] = value
+                    product['score'] = float(distances[0][i])
+                    results.append(product)
+                except Exception as e:
+                    logger.error(f"Error processing search result {i}: {e}", exc_info=True)
+                    continue
+            return results
+        except Exception as e:
+            logger.error(f"Error in search: {e}", exc_info=True)
+            return []