Spaces:
Runtime error
Runtime error
Update noaa_incidents.py
Browse files- noaa_incidents.py +131 -1
noaa_incidents.py
CHANGED
@@ -419,4 +419,134 @@ class NOAAIncidentDB:
|
|
419 |
|
420 |
for idx, row in df.iterrows():
|
421 |
# Generate unique ID
|
422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
419 |
|
420 |
for idx, row in df.iterrows():
|
421 |
# Generate unique ID
|
422 |
+
# Continue from the previous code...
|
423 |
+
# Generate unique ID using title, date, location and index
|
424 |
+
unique_string = str(row.get('title', '')) + '_' + str(row.get('date', '')) + '_' + str(row.get('location', '')) + '_' + str(idx)
|
425 |
+
incident_id = "incident_" + hashlib.md5(unique_string.encode()).hexdigest()[:8]
|
426 |
+
|
427 |
+
# Create searchable document content
|
428 |
+
doc_content = "\n".join([
|
429 |
+
"Incident: " + str(row.get('title', 'N/A')),
|
430 |
+
"Location: " + str(row.get('location', 'N/A')),
|
431 |
+
"Date: " + str(row.get('date', 'N/A')),
|
432 |
+
"Details: " + str(row.get('initial_notification', ''))
|
433 |
+
])
|
434 |
+
|
435 |
+
# Create metadata
|
436 |
+
metadata = {
|
437 |
+
'title': str(row.get('title', 'N/A')),
|
438 |
+
'date': str(row.get('date', 'N/A')),
|
439 |
+
'location': str(row.get('location', 'N/A'))
|
440 |
+
}
|
441 |
+
|
442 |
+
# Add any additional fields present
|
443 |
+
for col in df.columns:
|
444 |
+
if col not in ['title', 'date', 'location'] and pd.notna(row[col]):
|
445 |
+
metadata[col.lower().replace(' ', '_')] = str(row[col])
|
446 |
+
|
447 |
+
documents.append(doc_content.strip())
|
448 |
+
metadatas.append(metadata)
|
449 |
+
ids.append(incident_id)
|
450 |
+
|
451 |
+
# Add to database in batches
|
452 |
+
total_documents = len(documents)
|
453 |
+
for i in range(0, total_documents, BATCH_SIZE):
|
454 |
+
batch_end = min(i + BATCH_SIZE, total_documents)
|
455 |
+
self.collection.add(
|
456 |
+
documents=documents[i:batch_end],
|
457 |
+
metadatas=metadatas[i:batch_end],
|
458 |
+
ids=ids[i:batch_end]
|
459 |
+
)
|
460 |
+
logger.info(f"Added batch {i // BATCH_SIZE + 1} with {batch_end - i} incidents")
|
461 |
+
|
462 |
+
logger.info(f"Successfully loaded {total_documents} incidents into ChromaDB")
|
463 |
+
return total_documents
|
464 |
+
|
465 |
+
except Exception as e:
|
466 |
+
logger.error(f"Error loading incidents from CSV: {e}")
|
467 |
+
return 0
|
468 |
+
|
469 |
+
def search(self, query: str, n_results: int = 5) -> List[Dict]:
    """
    Search the collection for incidents semantically matching a query.

    Args:
        query (str): Free-text search query.
        n_results (int): Maximum number of results to return.

    Returns:
        List[Dict]: Matching incidents, each with keys 'id', 'title',
        'date', 'location', 'details' (the full document text) and
        'metadata' (the raw stored metadata). Empty list on error.
    """
    try:
        # BUG FIX: 'ids' must not be passed in `include` -- ChromaDB always
        # returns ids and rejects 'ids' as an include value (ValueError),
        # which previously made every search fail into the except branch.
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
            include=['metadatas', 'documents']
        )

        formatted_results = []
        # Results are nested one level per query; we issued a single query,
        # so index [0] selects its result lists.
        for doc, metadata, incident_id in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['ids'][0]
        ):
            formatted_results.append({
                'id': incident_id,
                'title': metadata.get('title', 'N/A'),
                'date': metadata.get('date', 'N/A'),
                'location': metadata.get('location', 'N/A'),
                'details': doc,
                'metadata': metadata
            })

        return formatted_results

    except Exception as e:
        logger.error(f"Error during search: {e}")
        return []
def delete_collection(self):
    """Remove the 'noaa_incidents' collection from the Chroma client.

    Failures are logged and swallowed; the method never raises.
    """
    try:
        self.client.delete_collection("noaa_incidents")
    except Exception as e:
        logger.error(f"Error deleting collection: {e}")
    else:
        logger.info("Collection deleted successfully")
|
516 |
+
|
517 |
+
def get_collection_stats(self) -> Dict:
    """
    Report basic statistics for the current collection.

    Returns:
        Dict: keys 'total_documents' (document count),
        'collection_name' (always 'noaa_incidents') and
        'embedding_model' (name of the embedding model in use);
        an empty dict if the lookup fails.
    """
    stats = {}
    try:
        stats = {
            "total_documents": self.collection.count(),
            "collection_name": "noaa_incidents",
            "embedding_model": self.embedding_function.model_name,
        }
    except Exception as e:
        logger.error(f"Error getting collection stats: {e}")
    return stats
|
534 |
+
|
535 |
+
if __name__ == "__main__":
    # Example end-to-end usage: scrape NOAA incident reports, index them
    # into the vector database, then run a demonstration search.
    incident_scraper = NOAAIncidentScraper(max_workers=5)
    csv_path, _json_path = incident_scraper.run(validate_first=True)

    if csv_path:
        incident_db = NOAAIncidentDB()
        loaded_count = incident_db.load_incidents(csv_path)
        logger.info(f"Loaded {loaded_count} incidents into database")

        # Demo query against the freshly loaded collection.
        hits = incident_db.search("oil spill near coral reefs", n_results=5)
        for rank, hit in enumerate(hits, 1):
            print(f"\nResult {rank}:")
            print(f"Title: {hit['title']}")
            print(f"Date: {hit['date']}")
            print(f"Location: {hit['location']}")
            print(f"Details: {hit['details']}\n")
|