latterworks committed on
Commit
3098121
·
verified ·
1 Parent(s): 1bf21f5

Update noaa_incidents.py

Browse files
Files changed (1) hide show
  1. noaa_incidents.py +131 -1
noaa_incidents.py CHANGED
@@ -419,4 +419,134 @@ class NOAAIncidentDB:
419
 
420
  for idx, row in df.iterrows():
421
  # Generate unique ID
422
- unique_string = f"{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
 
420
  for idx, row in df.iterrows():
421
  # Generate unique ID
422
+ # Continue from the previous code...
423
+ # Generate unique ID using title, date, location and index
424
+ unique_string = str(row.get('title', '')) + '_' + str(row.get('date', '')) + '_' + str(row.get('location', '')) + '_' + str(idx)
425
+ incident_id = "incident_" + hashlib.md5(unique_string.encode()).hexdigest()[:8]
426
+
427
+ # Create searchable document content
428
+ doc_content = "\n".join([
429
+ "Incident: " + str(row.get('title', 'N/A')),
430
+ "Location: " + str(row.get('location', 'N/A')),
431
+ "Date: " + str(row.get('date', 'N/A')),
432
+ "Details: " + str(row.get('initial_notification', ''))
433
+ ])
434
+
435
+ # Create metadata
436
+ metadata = {
437
+ 'title': str(row.get('title', 'N/A')),
438
+ 'date': str(row.get('date', 'N/A')),
439
+ 'location': str(row.get('location', 'N/A'))
440
+ }
441
+
442
+ # Add any additional fields present
443
+ for col in df.columns:
444
+ if col not in ['title', 'date', 'location'] and pd.notna(row[col]):
445
+ metadata[col.lower().replace(' ', '_')] = str(row[col])
446
+
447
+ documents.append(doc_content.strip())
448
+ metadatas.append(metadata)
449
+ ids.append(incident_id)
450
+
451
+ # Add to database in batches
452
+ total_documents = len(documents)
453
+ for i in range(0, total_documents, BATCH_SIZE):
454
+ batch_end = min(i + BATCH_SIZE, total_documents)
455
+ self.collection.add(
456
+ documents=documents[i:batch_end],
457
+ metadatas=metadatas[i:batch_end],
458
+ ids=ids[i:batch_end]
459
+ )
460
+ logger.info(f"Added batch {i // BATCH_SIZE + 1} with {batch_end - i} incidents")
461
+
462
+ logger.info(f"Successfully loaded {total_documents} incidents into ChromaDB")
463
+ return total_documents
464
+
465
+ except Exception as e:
466
+ logger.error(f"Error loading incidents from CSV: {e}")
467
+ return 0
468
+
469
def search(self, query: str, n_results: int = 5) -> List[Dict]:
    """
    Search for incidents matching the query.

    Args:
        query (str): Search query
        n_results (int): Number of results to return

    Returns:
        List[Dict]: One dict per match with the keys 'id', 'title', 'date',
            'location', 'details' (the stored document text) and 'metadata'
            (the full metadata dict). An empty list is returned when the
            query fails; the error is logged rather than raised.
    """
    try:
        # 'ids' must not be listed in `include`: ChromaDB always returns the
        # ids and rejects 'ids' as an include value, which previously sent
        # every search into the except branch and returned [].
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
            include=['metadatas', 'documents'],
        )

        # Result fields are nested one list per query text; a single query
        # text was sent, so index 0 holds all matches.
        formatted_results = []
        for doc, metadata, incident_id in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['ids'][0],
        ):
            # A record stored without metadata may come back as None;
            # fall back to an empty dict so the .get() lookups still work.
            metadata = metadata or {}
            formatted_results.append({
                'id': incident_id,
                'title': metadata.get('title', 'N/A'),
                'date': metadata.get('date', 'N/A'),
                'location': metadata.get('location', 'N/A'),
                'details': doc,
                'metadata': metadata,
            })

        return formatted_results

    except Exception as e:
        # Best-effort API: callers get an empty result list on any failure.
        logger.error(f"Error during search: {e}")
        return []
508
+
509
def delete_collection(self):
    """
    Remove the "noaa_incidents" collection via the client.

    Any failure is logged and swallowed, so this method never raises
    for errors derived from Exception.
    """
    try:
        client = self.client
        client.delete_collection("noaa_incidents")
        logger.info("Collection deleted successfully")
    except Exception as exc:
        logger.error(f"Error deleting collection: {exc}")
516
+
517
def get_collection_stats(self) -> Dict:
    """
    Report basic statistics for the current collection.

    Returns:
        Dict: 'total_documents' (the collection's count()),
            'collection_name' (always "noaa_incidents") and
            'embedding_model' (model_name of the embedding function).
            An empty dict is returned if any lookup fails; the error
            is logged rather than raised.
    """
    try:
        stats = {"total_documents": self.collection.count()}
        stats["collection_name"] = "noaa_incidents"
        stats["embedding_model"] = self.embedding_function.model_name
        return stats
    except Exception as exc:
        logger.error(f"Error getting collection stats: {exc}")
        return {}
534
+
535
if __name__ == "__main__":
    # Demo pipeline: scrape incidents, load the CSV export into the
    # vector database, then run one sample query and print the matches.
    incident_scraper = NOAAIncidentScraper(max_workers=5)
    csv_file, json_file = incident_scraper.run(validate_first=True)

    # Only continue when the scraper returned a CSV path.
    if csv_file:
        incident_db = NOAAIncidentDB()
        num_loaded = incident_db.load_incidents(csv_file)
        logger.info(f"Loaded {num_loaded} incidents into database")

        # Sample search; results are numbered from 1 for display.
        matches = incident_db.search("oil spill near coral reefs", n_results=5)
        for i, result in enumerate(matches, start=1):
            print(f"\nResult {i}:")
            print(f"Title: {result['title']}")
            print(f"Date: {result['date']}")
            print(f"Location: {result['location']}")
            print(f"Details: {result['details']}\n")