# src/analyzer.py
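"""Event analyzer: extracts entities, temporal expressions, and locations
from free text, scores extraction confidence, and persists high-confidence
events via the RelationshipEngine."""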
from typing import Dict, List, Any
import asyncio
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline
from datetime import datetime
from .ontology import OntologyRegistry
from .relationships import RelationshipEngine
class EventAnalyzer:
"""Main analyzer class for event processing."""
def __init__(self) -> None:
"""Initialize the event analyzer with required components."""
self.ontology = OntologyRegistry()
self.relationship_engine = RelationshipEngine()
self.executor = ThreadPoolExecutor(max_workers=3)
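        # Model weights are fetched from the Hugging Face Hub on first use,
        # so the first construction of EventAnalyzer can be slow.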
        # Initialize NLP pipelines; aggregation_strategy="simple" merges
        # subword tokens into whole entity spans with a single label.
        self.ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english",
                                     aggregation_strategy="simple")
        self.classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
async def extract_entities(self, text: str) -> Dict[str, List[str]]:
"""Extract entities from text using NER pipeline."""
def _extract():
return self.ner_pipeline(text)
        # transformers pipelines are synchronous; run the model call in the
        # thread pool so the event loop stays responsive.
        ner_results = await asyncio.get_running_loop().run_in_executor(
            self.executor, _extract
        )
entities = {
"people": [],
"organizations": [],
"locations": [],
"hashtags": [word for word in text.split() if word.startswith('#')]
}
        for item in ner_results:
            # With aggregation enabled, the pipeline reports one record per
            # entity span, labelled via "entity_group" (PER/ORG/LOC/MISC).
            label = item["entity_group"]
            if label == "PER":
                entities["people"].append(item["word"])
            elif label == "ORG":
                entities["organizations"].append(item["word"])
            elif label == "LOC":
                entities["locations"].append(item["word"])
return entities
def extract_temporal(self, text: str) -> List[str]:
"""Extract temporal expressions from text."""
return self.ontology.validate_pattern(text, 'temporal')
async def extract_locations(self, text: str) -> List[str]:
"""Extract locations using both NER and pattern matching."""
entities = await self.extract_entities(text)
ml_locations = entities.get('locations', [])
pattern_locations = self.ontology.validate_pattern(text, 'location')
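        # Union of model- and pattern-derived locations; set() removes
        # duplicates but does not preserve order.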
return list(set(ml_locations + pattern_locations))
def calculate_confidence(self,
entities: Dict[str, List[str]],
temporal_data: List[str],
related_events: List[Any]) -> float:
"""Calculate confidence score for extracted information."""
# Base confidence from entity presence
base_confidence = min(1.0, (
0.2 * bool(entities["people"]) +
0.2 * bool(entities["organizations"]) +
0.3 * bool(entities["locations"]) +
0.3 * bool(temporal_data)
))
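        # The four weights sum to 1.0; locations and temporal cues are
        # weighted slightly higher than people and organizations.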
# Get entity parameters for frequency calculation
entity_params = [
*entities["people"],
*entities["organizations"],
*entities["locations"]
]
if not entity_params:
return base_confidence
# Calculate entity frequency boost
query = f'''
SELECT AVG(frequency) as avg_freq
FROM entities
WHERE entity_text IN ({','.join(['?']*len(entity_params))})
'''
cursor = self.relationship_engine.conn.execute(query, entity_params)
avg_frequency = cursor.fetchone()[0] or 1
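        # +0.05 per unit of average frequency above 1, capped at +0.2.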
frequency_boost = min(0.2, (avg_frequency - 1) * 0.05)
# Calculate relationship confidence boost
relationship_confidence = 0
if related_events:
relationship_scores = []
            for event in related_events:
                # Count how many of the current text's entities also appear
                # on this related event. Binding the same event id to both
                # sides of a self-join would only count that event's own
                # entities, so match on entity text instead (this assumes
                # event_entities.entity_id references entities.id).
                cursor = self.relationship_engine.conn.execute(f'''
                    SELECT COUNT(*) as shared_entities
                    FROM event_entities ee
                    JOIN entities e ON ee.entity_id = e.id
                    WHERE ee.event_id = ?
                      AND e.entity_text IN ({','.join(['?'] * len(entity_params))})
                ''', (event[0], *entity_params))
                shared_count = cursor.fetchone()[0]
                relationship_scores.append(min(0.3, shared_count * 0.1))
if relationship_scores:
relationship_confidence = max(relationship_scores)
return min(1.0, base_confidence + frequency_boost + relationship_confidence)
async def analyze_event(self, text: str) -> Dict[str, Any]:
"""Analyze event text and extract structured information."""
try:
# Parallel extraction
entities_future = self.extract_entities(text)
temporal_data = self.extract_temporal(text)
locations_future = self.extract_locations(text)
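            # Note: extract_locations re-runs entity extraction internally,
            # so NER executes twice per analyze_event call.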
# Gather async results
entities, locations = await asyncio.gather(
entities_future, locations_future
)
# Merge locations and add temporal data
entities['locations'] = locations
entities['temporal'] = temporal_data
# Find related events
related_events = self.relationship_engine.find_related_events({
'text': text,
'entities': entities
})
# Calculate confidence
confidence = self.calculate_confidence(entities, temporal_data, related_events)
# Store event if confidence meets threshold
cursor = None
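            # 0.6 is both the persistence threshold here and the
            # verification threshold reported in the result below.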
if confidence >= 0.6:
cursor = self.relationship_engine.conn.execute(
'INSERT INTO events (text, timestamp, confidence) VALUES (?, ?, ?)',
(text, datetime.now().isoformat(), confidence)
)
event_id = cursor.lastrowid
# Store entities and update relationships
self.relationship_engine.store_entities(event_id, {
'person': entities['people'],
'organization': entities['organizations'],
'location': entities['locations'],
'temporal': temporal_data,
'hashtag': entities['hashtags']
})
self.relationship_engine.update_entity_relationships(event_id)
self.relationship_engine.conn.commit()
# Get entity relationships for output
entity_relationships = []
if cursor and cursor.lastrowid:
entity_relationships = self.relationship_engine.get_entity_relationships(cursor.lastrowid)
return {
"text": text,
"entities": entities,
"confidence": confidence,
"verification_needed": confidence < 0.6,
"related_events": [
{
"text": event[1],
"timestamp": event[2],
"confidence": event[3],
"shared_entities": event[4] if len(event) > 4 else None
}
for event in related_events
],
"entity_relationships": entity_relationships
}
except Exception as e:
return {"error": str(e)}
def get_entity_statistics(self) -> Dict[str, List[tuple]]:
"""Get statistics about stored entities and relationships."""
stats = {}
# Entity counts by type
cursor = self.relationship_engine.conn.execute('''
SELECT entity_type, COUNT(*) as count, AVG(frequency) as avg_frequency
FROM entities
GROUP BY entity_type
''')
stats['entity_counts'] = cursor.fetchall()
# Most frequent entities
cursor = self.relationship_engine.conn.execute('''
SELECT entity_text, entity_type, frequency
FROM entities
ORDER BY frequency DESC
LIMIT 10
''')
stats['frequent_entities'] = cursor.fetchall()
# Relationship statistics
cursor = self.relationship_engine.conn.execute('''
SELECT relationship_type, COUNT(*) as count, AVG(confidence) as avg_confidence
FROM entity_relationships
GROUP BY relationship_type
''')
stats['relationship_stats'] = cursor.fetchall()
return stats
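
# Minimal usage sketch (run as `python -m src.analyzer` so the relative
# imports resolve; the event text below is illustrative only):
if __name__ == "__main__":
    analyzer = EventAnalyzer()
    result = asyncio.run(analyzer.analyze_event(
        "#breaking Acme Corp opened a new office in Berlin on Monday."
    ))
    print(result.get("confidence"), result.get("entities"))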