from flask import Flask, jsonify, request
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import time
import random
import base64
from googlesearch import search

app = Flask(__name__)


def search_images(query, num_images=5):
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for use in a URL
    formatted_query = urllib.parse.quote(query)

    # Google Images search URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Fetch the search results page
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Extract candidate image URLs with a regex (a literal dot is
        # required before the extension so bare "...jpg" substrings don't match)
        image_urls = re.findall(r'https?://[^"\']*?\.(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip Google's own thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download the image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check that the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Encode the image as base64 so it can travel in JSON
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })
                downloaded += 1

                # Random delay between downloads to avoid hammering hosts
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []


@app.route('/search_images', methods=['GET'])
def api_search_images():
    try:
        # Read and validate query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
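
# Usage sketch (illustrative only, not executed by the service): a client
# can call /search_images and decode the returned data URI back into raw
# image bytes. The host and port assume the default app.run() settings at
# the bottom of this file.
#
#   import base64, requests
#   resp = requests.get(
#       "http://localhost:5000/search_images",
#       params={"query": "sunset", "num_images": 3},
#   )
#   for item in resp.json()["results"]:
#       header, encoded = item["base64_data"].split(",", 1)
#       image_bytes = base64.b64decode(encoded)
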
def scrape_site_content(query, num_sites=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    results = []
    scraped = 0

    try:
        # Use googlesearch-python to find candidate URLs
        search_results = search(query, num_results=num_sites)

        # Process each URL returned by the search
        for url in search_results:
            if scraped >= num_sites:
                break

            try:
                # Fetch the page
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                # Only process HTML responses
                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    continue

                # Parse the HTML content
                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove script and style elements before extracting text
                for script in soup(["script", "style"]):
                    script.decompose()

                # Extract text content (limit to the first 10,000 characters)
                text_content = soup.get_text(separator='\n', strip=True)[:10000]

                # Extract absolute links (limit to the first 10)
                links = []
                for link in soup.find_all('a', href=True)[:10]:
                    href = link['href']
                    if href.startswith('http'):
                        links.append({
                            'text': link.get_text(strip=True),
                            'url': href
                        })

                # Extract meta information (guard against a missing or
                # empty <title> tag, which would otherwise yield None)
                title = soup.title.string if soup.title and soup.title.string else ''
                meta_description = ''
                meta_keywords = ''

                meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                if meta_desc_tag:
                    meta_description = meta_desc_tag.get('content', '')

                meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
                if meta_keywords_tag:
                    meta_keywords = meta_keywords_tag.get('content', '')

                results.append({
                    'url': url,
                    'title': title,
                    'meta_description': meta_description,
                    'meta_keywords': meta_keywords,
                    'text_content': text_content,
                    'links': links
                })
                scraped += 1

                # Random delay between scrapes to avoid hammering hosts
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error scraping {url}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error in search: {str(e)}")

    return results


@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
    try:
        # Read and validate query parameters
        query = request.args.get('query', '')
        num_sites = int(request.args.get('num_sites', 10))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_sites < 1 or num_sites > 20:
            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400

        # Scrape the websites
        results = scrape_site_content(query, num_sites)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
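
# Usage sketch for /scrape_sites (illustrative only): fetch scraped page
# summaries as JSON. The host and port assume the app.run() defaults above.
#
#   import requests
#   resp = requests.get(
#       "http://localhost:5000/scrape_sites",
#       params={"query": "flask tutorial", "num_sites": 3},
#   )
#   for site in resp.json()["results"]:
#       print(site["url"], "-", site["title"])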