Spaces:

saneowl
/

full-search-api

Running

Oscar Wang

Update app.py

d512680 verified about 2 months ago

6.32 kB

	import os
	import sys
	import logging
	import torch
	from flask import Flask, request, jsonify
	import requests
	from transformers import AutoTokenizer, AutoModelForSequenceClassification

	# Logging setup: every record goes both to stdout (container logs) and to a
	# log file under /tmp.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s: %(message)s',
	    level=logging.INFO,
	    handlers=[
	        logging.StreamHandler(sys.stdout),
	        logging.FileHandler('/tmp/search_app.log'),
	    ],
	)
	logger = logging.getLogger(__name__)

	# Point the Hugging Face cache at a directory under /tmp and make sure it exists.
	# NOTE(review): `transformers` is already imported above this assignment, so
	# the library may not pick this variable up — TODO confirm.
	os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
	os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)

	logger.info("🚀 Initializing Educational Search Reranker Application")
	logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")

	# Flask application object that the route decorators below attach to.
	app = Flask(__name__)

	# Search endpoint of the SearXNG instance that supplies the raw results.
	SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"

	# Load the educational content classifier with explicit cache directory
	def load_model_with_retry(max_retries=3):
	    """Load the fineweb-edu tokenizer and classifier, retrying on failure.

	    Args:
	        max_retries: Maximum number of load attempts; must be at least 1.

	    Returns:
	        A ``(tokenizer, model)`` tuple.

	    Raises:
	        ValueError: If ``max_retries`` is less than 1. Without this check the
	            loop would never run and the function would return ``None``,
	            which breaks callers that unpack the result.
	        Exception: Whatever the final failed attempt raised.
	    """
	    if max_retries < 1:
	        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

	    model_name = "HuggingFaceTB/fineweb-edu-classifier"
	    cache_dir = '/tmp/huggingface_cache'

	    logger.info("Attempting to load educational content classifier...")

	    # Environment details do not change between attempts, so log them once.
	    logger.info(f"Python Version: {sys.version}")
	    logger.info(f"Torch Version: {torch.__version__}")

	    for attempt in range(1, max_retries + 1):
	        logger.info(f"Loading attempt {attempt}...")
	        try:
	            logger.info("Loading tokenizer...")
	            tokenizer = AutoTokenizer.from_pretrained(
	                model_name,
	                cache_dir=cache_dir,
	            )

	            logger.info("Loading classification model...")
	            model = AutoModelForSequenceClassification.from_pretrained(
	                model_name,
	                cache_dir=cache_dir,
	            )

	            logger.info("✅ Model and tokenizer loaded successfully!")
	            return tokenizer, model
	        except Exception:
	            # logger.exception records the full traceback; formatting
	            # sys.exc_info() into the message only gave the tuple's repr.
	            logger.exception(f"Model loading attempt {attempt} failed")

	            if attempt == max_retries:
	                logger.critical("❌ Failed to load model after all attempts!")
	                raise

	# Load models at startup
	# Begin in the "model unavailable" state; the names are only rebound when
	# loading succeeds, so a failure leaves both as None.
	tokenizer, model = None, None
	try:
	    tokenizer, model = load_model_with_retry()
	except Exception as startup_error:
	    logger.critical(f"Startup failed: {startup_error}")

	def classify_educational_quality(text):
	    """Score the educational quality of a text snippet.

	    Runs the module-level tokenizer and model on ``text`` and returns the
	    model's single output logit as a Python number. Returns 0 when the
	    tokenizer or model is not loaded, or when any step of the
	    classification raises.
	    """
	    model_ready = tokenizer is not None and model is not None
	    if not model_ready:
	        logger.warning("Model not initialized. Returning default score.")
	        return 0

	    try:
	        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")

	        # Tokenize into PyTorch tensors, truncating over-long input.
	        encoded = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)

	        # Inference only, so gradient tracking is switched off.
	        with torch.no_grad():
	            prediction = model(**encoded)

	        # Drop the trailing dimension and read out the one remaining value.
	        flat_logits = prediction.logits.squeeze(-1).float().detach().numpy()
	        score = flat_logits.item()
	    except Exception as e:
	        logger.error(f"Classification error: {e}")
	        return 0  # Default score if classification fails

	    logger.info(f"Educational quality score: {score}")
	    return score

	@app.route('/search', methods=['GET'])
	def search():
	    """Search SearXNG for ``q`` and return results ranked by educational score.

	    Query parameters:
	        q: The search term. Required; a missing, empty or whitespace-only
	           value yields HTTP 400.

	    Returns:
	        On success, a JSON list of up to 30 objects with the keys ``title``,
	        ``snippet``, ``url`` and ``educational_score``, sorted by
	        ``educational_score`` in descending order.
	        If SearXNG answers with a non-200 status, a JSON error object with
	        that same status code.
	        On any other failure (network error, timeout, invalid JSON, ...),
	        a JSON error object with HTTP 500.
	    """
	    # Strip so that a whitespace-only query is rejected instead of being
	    # forwarded upstream.
	    search_term = request.args.get('q', '').strip()

	    logger.info(f"🔍 Received search query: {search_term}")

	    if not search_term:
	        logger.warning("No search term provided")
	        return jsonify({'error': 'No search term provided'}), 400

	    # Query parameters for the SearXNG API
	    params = {
	        'q': search_term,
	        'format': 'json',
	        'categories': 'general',
	    }

	    try:
	        logger.info("Sending request to SearXNG search API...")
	        # (connect, read) timeouts in seconds: without them a stalled
	        # upstream would block this request indefinitely.
	        response = requests.get(SEARXNG_INSTANCE_URL, params=params, timeout=(5, 30))

	        if response.status_code != 200:
	            logger.error(f"SearXNG API error: {response.status_code}")
	            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code

	        logger.info("Received successful response from SearXNG")
	        # Only the first 30 results are scored.
	        results = response.json().get('results', [])[:30]

	        logger.info(f"Total results found: {len(results)}")

	        # Classify and score educational quality for each result
	        scored_snippets = []
	        for idx, result in enumerate(results, 1):
	            snippet = {
	                'title': result.get('title', 'No title'),
	                'snippet': result.get('content', 'No snippet available'),
	                'url': result.get('url', 'No URL'),
	            }

	            # The classifier sees the title and snippet as one text.
	            full_text = f"{snippet['title']} {snippet['snippet']}"
	            edu_score = classify_educational_quality(full_text)

	            snippet['educational_score'] = edu_score
	            scored_snippets.append(snippet)

	            logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")

	        # Highest educational score first
	        sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)

	        logger.info("🏆 Results sorted by educational quality")
	        return jsonify(sorted_snippets)

	    except Exception as e:
	        # Top-level request boundary: log with traceback and report a 500.
	        logger.exception(f"Search processing error: {e}")
	        return jsonify({'error': str(e)}), 500

	if __name__ == '__main__':
	    logger.info("🌐 Starting Flask application...")
	    # Debug mode turns on the Werkzeug interactive debugger, which lets anyone
	    # who can reach the server execute code. Because the app binds to all
	    # interfaces, debug is opt-in via the FLASK_DEBUG environment variable
	    # instead of being always on.
	    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
	    # Run the Flask app on port 7860
	    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)