Spaces:

latterworks
/

db

Runtime error

App Files Files Community

latterworks committed on Dec 26, 2024

Commit

3098121

verified ·

1 Parent(s): 1bf21f5

Update noaa_incidents.py

Browse files

Files changed (1) hide show

noaa_incidents.py +131 -1

noaa_incidents.py CHANGED Viewed

@@ -419,4 +419,134 @@ class NOAAIncidentDB:
             for idx, row in df.iterrows():
                 # Generate unique ID
-                unique_string = f"{

             for idx, row in df.iterrows():
                 # Generate unique ID
+                # Continue from the previous code...
+                # Generate unique ID using title, date, location and index
+                unique_string = str(row.get('title', '')) + '_' + str(row.get('date', '')) + '_' + str(row.get('location', '')) + '_' + str(idx)
+                incident_id = "incident_" + hashlib.md5(unique_string.encode()).hexdigest()[:8]
+                # Create searchable document content
+                doc_content = "\n".join([
+                    "Incident: " + str(row.get('title', 'N/A')),
+                    "Location: " + str(row.get('location', 'N/A')),
+                    "Date: " + str(row.get('date', 'N/A')),
+                    "Details: " + str(row.get('initial_notification', ''))
+                ])
+                # Create metadata
+                metadata = {
+                    'title': str(row.get('title', 'N/A')),
+                    'date': str(row.get('date', 'N/A')),
+                    'location': str(row.get('location', 'N/A'))
+                }
+                # Add any additional fields present
+                for col in df.columns:
+                    if col not in ['title', 'date', 'location'] and pd.notna(row[col]):
+                        metadata[col.lower().replace(' ', '_')] = str(row[col])
+                documents.append(doc_content.strip())
+                metadatas.append(metadata)
+                ids.append(incident_id)
+            # Add to database in batches
+            total_documents = len(documents)
+            for i in range(0, total_documents, BATCH_SIZE):
+                batch_end = min(i + BATCH_SIZE, total_documents)
+                self.collection.add(
+                    documents=documents[i:batch_end],
+                    metadatas=metadatas[i:batch_end],
+                    ids=ids[i:batch_end]
+                )
+                logger.info(f"Added batch {i // BATCH_SIZE + 1} with {batch_end - i} incidents")
+            logger.info(f"Successfully loaded {total_documents} incidents into ChromaDB")
+            return total_documents
+        except Exception as e:
+            logger.error(f"Error loading incidents from CSV: {e}")
+            return 0
+    def search(self, query: str, n_results: int = 5) -> List[Dict]:
+        """
+        Search for incidents matching the query.
+        Args:
+            query (str): Search query
+            n_results (int): Number of results to return
+        Returns:
+            List[Dict]: List of matching incidents
+        """
+        try:
+            results = self.collection.query(
+                query_texts=[query],
+                n_results=n_results,
+                include=['metadatas', 'documents', 'ids']
+            )
+            formatted_results = []
+            for doc, metadata, incident_id in zip(
+                results['documents'][0],
+                results['metadatas'][0],
+                results['ids'][0]
+            ):
+                result = {
+                    'id': incident_id,
+                    'title': metadata.get('title', 'N/A'),
+                    'date': metadata.get('date', 'N/A'),
+                    'location': metadata.get('location', 'N/A'),
+                    'details': doc,
+                    'metadata': metadata
+                }
+                formatted_results.append(result)
+            return formatted_results
+        except Exception as e:
+            logger.error(f"Error during search: {e}")
+            return []
+    def delete_collection(self):
+        """Delete the current collection."""
+        try:
+            self.client.delete_collection("noaa_incidents")
+            logger.info("Collection deleted successfully")
+        except Exception as e:
+            logger.error(f"Error deleting collection: {e}")
+    def get_collection_stats(self) -> Dict:
+        """
+        Get statistics about the current collection.
+        Returns:
+            Dict: Collection statistics
+        """
+        try:
+            count = self.collection.count()
+            return {
+                "total_documents": count,
+                "collection_name": "noaa_incidents",
+                "embedding_model": self.embedding_function.model_name
+            }
+        except Exception as e:
+            logger.error(f"Error getting collection stats: {e}")
+            return {}
+if __name__ == "__main__":
+    # Example usage
+    scraper = NOAAIncidentScraper(max_workers=5)
+    csv_file, json_file = scraper.run(validate_first=True)
+    if csv_file:
+        db = NOAAIncidentDB()
+        num_loaded = db.load_incidents(csv_file)
+        logger.info(f"Loaded {num_loaded} incidents into database")
+        # Example search
+        results = db.search("oil spill near coral reefs", n_results=5)
+        for i, result in enumerate(results, 1):
+            print(f"\nResult {i}:")
+            print(f"Title: {result['title']}")
+            print(f"Date: {result['date']}")
+            print(f"Location: {result['location']}")
+            print(f"Details: {result['details']}\n")