# full-search-api / app.py
# Last commit: "Update app.py" by Oscar Wang (d512680, verified)
# File size: 6.32 kB
import os
import sys
import logging
import torch
from flask import Flask, request, jsonify
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Logging setup: every record at INFO or above is written both to stdout
# and to a log file under /tmp, using one shared line format.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(sys.stdout),           # console output
        logging.FileHandler('/tmp/search_app.log'),  # persistent log file
    ],
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Module-level logger used by the rest of this file.
logger = logging.getLogger(__name__)
# Keep Hugging Face downloads in a known directory under /tmp.
# NOTE(review): `transformers` is imported before this variable is set, so the
# environment variable may not influence it -- confirm. The model loader in
# this file also passes the same path explicitly via `cache_dir`.
os.makedirs('/tmp/huggingface_cache', exist_ok=True)
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'

logger.info("πŸš€ Initializing Educational Search Reranker Application")
logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")

# Flask application object that the routes below are registered on.
app = Flask(__name__)

# Search endpoint of the SearXNG instance that supplies the raw results.
SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
# Load the educational content classifier with explicit cache directory
def load_model_with_retry(max_retries=3, retry_delay=2.0):
    """Load the classifier's tokenizer and model, retrying on failure.

    Both artefacts are fetched for "HuggingFaceTB/fineweb-edu-classifier"
    with the cache directory passed explicitly.

    Args:
        max_retries: Total number of load attempts. Must be at least 1.
        retry_delay: Seconds to sleep between a failed attempt and the next
            one. Values <= 0 disable the pause.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        ValueError: If ``max_retries`` is less than 1 (previously the
            function fell through and returned ``None``).
        Exception: Whatever the final failed attempt raised, re-raised
            unchanged after it has been logged.
    """
    import time

    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    model_name = "HuggingFaceTB/fineweb-edu-classifier"
    cache_dir = '/tmp/huggingface_cache'

    logger.info("Attempting to load educational content classifier...")
    # Environment details do not change between attempts, so log them once.
    logger.info(f"Python Version: {sys.version}")
    logger.info(f"Torch Version: {torch.__version__}")

    for attempt in range(1, max_retries + 1):
        try:
            logger.info(f"Loading attempt {attempt}...")
            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                cache_dir=cache_dir,
            )
            logger.info("Loading classification model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                cache_dir=cache_dir,
            )
            logger.info("βœ… Model and tokenizer loaded successfully!")
            return tokenizer, model
        except Exception as e:
            # logger.exception records the full traceback with the message.
            logger.exception(f"Model loading attempt {attempt} failed: {e}")
            if attempt == max_retries:
                logger.critical("❌ Failed to load model after all attempts!")
                raise
            # Pause so a transient failure has a chance to clear before the
            # next attempt instead of retrying back-to-back.
            if retry_delay > 0:
                time.sleep(retry_delay)
# Load the classifier once at import time. If loading fails, the failure is
# logged and both globals stay None, which the classifier function checks for.
tokenizer = model = None
try:
    tokenizer, model = load_model_with_retry()
except Exception as load_failure:
    logger.critical(f"Startup failed: {load_failure}")
def classify_educational_quality(text):
    """
    Classify the educational quality of a given text snippet.

    Args:
        text: The text to score. Non-string values are converted with
            ``str()`` before tokenization.

    Returns:
        The classifier's single output logit as a Python float, or ``0`` when
        the model is not loaded or classification fails for any reason.
    """
    if tokenizer is None or model is None:
        logger.warning("Model not initialized. Returning default score.")
        return 0

    try:
        if not isinstance(text, str):
            text = str(text)
        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")

        # Prepare input for the model; over-long inputs are truncated.
        inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)

        # Inference only, so gradient tracking is disabled.
        with torch.no_grad():
            outputs = model(**inputs)

        # Read the single logit straight from the tensor; no NumPy round-trip
        # is needed because item() already yields a Python float.
        score = outputs.logits.squeeze(-1).float().item()
        logger.info(f"Educational quality score: {score}")
        return score
    except Exception as e:
        # Best-effort scoring: log with traceback and fall back to the default
        # so one bad snippet does not fail the whole request.
        logger.exception(f"Classification error: {e}")
        return 0  # Default score if classification fails
@app.route('/search', methods=['GET'])
def search():
    """Search SearXNG and return results ranked by educational score.

    Reads the ``q`` query parameter, fetches general-category JSON results
    from the SearXNG instance, scores the first 30 with
    ``classify_educational_quality`` and returns them sorted by score,
    highest first.

    Returns:
        A JSON list of ``{title, snippet, url, educational_score}`` objects on
        success; otherwise a JSON ``{'error': ...}`` object with status 400
        (missing or blank query), the upstream status code (non-200 SearXNG
        response), or 500 (any other failure, including an upstream timeout).
    """
    # Get the search term from query parameters; a whitespace-only query is
    # treated the same as a missing one instead of being sent upstream.
    search_term = request.args.get('q', '').strip()
    logger.info(f"πŸ” Received search query: {search_term}")
    if not search_term:
        logger.warning("No search term provided")
        return jsonify({'error': 'No search term provided'}), 400

    # Define the query parameters for the SearXNG API
    params = {
        'q': search_term,
        'format': 'json',
        'categories': 'general',
    }
    # (connect, read) timeouts in seconds. Without a timeout a stalled
    # upstream would block this worker indefinitely.
    upstream_timeout = (5, 30)

    try:
        logger.info("Sending request to SearXNG search API...")
        response = requests.get(
            SEARXNG_INSTANCE_URL,
            params=params,
            timeout=upstream_timeout,
        )

        # Pass a non-200 upstream status through to the caller.
        if response.status_code != 200:
            logger.error(f"SearXNG API error: {response.status_code}")
            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code

        logger.info("Received successful response from SearXNG")
        data = response.json()

        # Retrieve the first 30 results
        results = data.get('results', [])[:30]
        logger.info(f"Total results found: {len(results)}")

        # Classify and score educational quality for each result
        scored_snippets = []
        for idx, result in enumerate(results, 1):
            snippet = {
                'title': result.get('title', 'No title'),
                'snippet': result.get('content', 'No snippet available'),
                'url': result.get('url', 'No URL'),
            }
            # Title and snippet are scored together as one piece of text.
            full_text = f"{snippet['title']} {snippet['snippet']}"
            edu_score = classify_educational_quality(full_text)
            snippet['educational_score'] = edu_score
            scored_snippets.append(snippet)
            logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")

        # Sort results by educational score in descending order
        scored_snippets.sort(key=lambda item: item['educational_score'], reverse=True)
        logger.info("πŸ† Results sorted by educational quality")
        return jsonify(scored_snippets)
    except Exception as e:
        # Top-level boundary for the request: record the traceback and report
        # the failure to the client as a 500.
        logger.exception(f"Search processing error: {e}")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    logger.info("🌐 Starting Flask application...")
    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute code. Because the app binds to all
    # interfaces, debug is off unless explicitly requested via FLASK_DEBUG.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    # Run the Flask app on port 7860
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)