import gradio as gr
import requests
import xml.etree.ElementTree as ET
import re
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional, Tuple
import json
from groq import Groq
import time
import pandas as pd
import os


def normalize_url(url: str) -> str:
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')


def fetch_with_proxy(url: str) -> str:
    """Fetch URL content with error handling"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise Exception(f"Failed to fetch {url}: {str(e)}")


def extract_urls_from_sitemap(content: str) -> List[str]:
    urls = []
    try:
        root = ET.fromstring(content)
        # Sitemaps normally declare a namespace; fall back to a plain path when they don't
        ns = {'ns': root.tag.split('}')[0].strip('{')} if '}' in root.tag else {}
        loc_path = './/ns:loc' if ns else './/loc'

        # Handle sitemap index
        if 'sitemapindex' in root.tag:
            for sitemap in root.findall(loc_path, ns):
                try:
                    sitemap_content = fetch_with_proxy(sitemap.text.strip())
                    urls.extend(extract_urls_from_sitemap(sitemap_content))
                except Exception:
                    continue
        # Handle urlset
        else:
            for url in root.findall(loc_path, ns):
                urls.append(url.text.strip())
    except ET.ParseError:
        pass
    return urls


def get_common_sitemap_urls(base_url: str) -> List[str]:
    domain = urlparse(base_url).hostname
    return [
        f"{base_url}/sitemap.xml",
        f"{base_url}/sitemap_index.xml",
        f"{base_url}/wp-sitemap.xml",
        f"{base_url}/sitemap/sitemap-index.xml",
        f"{base_url}/sitemap/{domain}-sitemap.xml"
    ]


def extract_sitemap_urls_from_robots(robots_content: str) -> List[str]:
    return [
        line.split(':', 1)[1].strip()
        for line in robots_content.splitlines()
        if line.lower().startswith('sitemap:')
    ]


def generate_hyperbolic_summary(url: str, content: str, api_key: str) -> str:
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')

        response = requests.post(
            'https://api.hyperbolic.xyz/v1/chat/completions',
            headers={
                'Content-Type': 'application/json; charset=utf-8',
                'Authorization': f'Bearer {api_key}',
            },
            json={
                'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
                'messages': [{
                    'role': 'user',
                    'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
URL: {url}
Content: {content}
Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
                }],
                'max_tokens': 200,
                'temperature': 0.7,
                'top_p': 0.9,
                'stream': False
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        summary = result['choices'][0]['message']['content']

        # Extract the summary from its <summary> tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_hyperbolic_summary: {str(e)}")
        return f"Error generating Hyperbolic summary: {str(e)}"


def generate_groq_summary(url: str, content: str, api_key: str) -> str:
    try:
        # Ensure content is properly encoded
        content = content.encode('utf-8', errors='ignore').decode('utf-8')

        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            messages=[{
                'role': 'user',
                'content': f"""Generate a concise 1-sentence summary of this webpage content. Wrap your summary in <summary> tags.
URL: {url}
Content: {content}
Example response format:
<summary>This is a clear and concise one-sentence summary of the webpage.</summary>"""
            }],
            model="llama-3.2-1b-preview",
            temperature=0.7,
            max_tokens=200,
            top_p=0.9,
            stream=False
        )
        summary = completion.choices[0].message.content

        # Extract the summary from its <summary> tags
        match = re.search(r'<summary>(.*?)</summary>', summary, re.DOTALL)
        return match.group(1).strip() if match else summary.strip()
    except Exception as e:
        print(f"Error in generate_groq_summary: {str(e)}")
        return f"Error generating Groq summary: {str(e)}"


def generate_llms_txt(summaries: List[Dict[str, str]]) -> str:
    if not summaries:
        return ""
    return "\n".join([
        f"# {summary['url']}\n\n{summary['summary']}\n\n---\n"
        for summary in summaries
    ])


def generate_llms_full_txt(summaries: List[Dict]) -> str:
    if not summaries:
        return "No content generated"
    content = ""
    for summary in summaries:
        content += f"# {summary['url']}\n\n"
        content += f"{summary.get('fullContent', 'No content available')}\n\n"
        content += "---\n\n"
    return content


def get_page_content(url: str, markdowner_key: Optional[str] = None) -> str:
    try:
        headers = {
            "Accept": "text/plain",
            "Accept-Language": "en-US,en;q=0.9",
            "User-Agent": "Mozilla/5.0 (compatible; SitemapParser/1.0)",
            "Origin": "http://localhost:3000",
            "Referer": "http://localhost:3000/",
        }
        if markdowner_key:
            headers["Authorization"] = f"Bearer {markdowner_key}"

        # Use direct URL construction like the curl command
        encoded_url = requests.utils.quote(url)
        full_url = f"https://md.dhr.wtf/?url={encoded_url}"

        print(f"Requesting URL: {full_url}")  # Debug logging
        print(f"Headers: {headers}")  # Debug logging

        response = requests.get(  # GET request, matching the curl example
            full_url,
            headers=headers,
            timeout=30
        )
        response.encoding = 'utf-8'
        response.raise_for_status()

        if response.status_code == 200:
            return response.text
        else:
            print(f"Response status: {response.status_code}")  # Debug logging
            print(f"Response headers: {response.headers}")  # Debug logging
            print(f"Response text: {response.text[:500]}")  # Debug logging
            return f"Error fetching content: {response.status_code} {response.reason}"
    except Exception as e:
        print(f"Error fetching content for {url}: {str(e)}")
        return f"Error fetching content: {str(e)}"


def process_website(
    url: str,
    hyperbolic_key: str = "",
    groq_key: str = "",
    markdowner_key: str = "",
    use_hyperbolic: bool = True,
    progress=gr.Progress()
) -> Tuple[str, str, List[str], str]:
    try:
        if not (use_hyperbolic and hyperbolic_key) and not (not use_hyperbolic and groq_key):
            return "Error: Please provide an API key for the selected AI provider", None, [], ""

        base_url = normalize_url(url)
        progress(0, desc="Initializing...")

        # Try robots.txt first
        sitemap_urls = []
        try:
            robots_url = urljoin(base_url, '/robots.txt')
            robots_content = fetch_with_proxy(robots_url)
            sitemap_urls = extract_sitemap_urls_from_robots(robots_content)
        except:
            pass

        progress(0.2, desc="Checking common sitemap locations...")

        # Try common locations if no sitemaps found
        if not sitemap_urls:
            common_locations = get_common_sitemap_urls(base_url)
            for sitemap_url in common_locations:
                try:
                    content = fetch_with_proxy(sitemap_url)
                    if '<urlset' in content or '<sitemapindex' in content:
                        sitemap_urls = [sitemap_url]
                        break
                except:
                    continue

        # NOTE: the rest of this function is a minimal sketch; only the return shape
        # (llms.txt text, summaries JSON, url list, llms-full text) is fixed by the
        # callers below — progress steps and error messages are assumptions.
        if not sitemap_urls:
            return "Error: No sitemap found for this website", None, [], ""

        progress(0.4, desc="Extracting URLs from sitemaps...")
        urls = []
        for sitemap_url in sitemap_urls:
            try:
                urls.extend(extract_urls_from_sitemap(fetch_with_proxy(sitemap_url)))
            except Exception:
                continue
        urls = list(dict.fromkeys(urls))  # de-duplicate while preserving order

        if not urls:
            return "Error: No URLs found in the sitemaps", None, [], ""

        # Fetch and summarize every page with the selected provider
        summaries = []
        for i, page_url in enumerate(urls):
            progress(0.4 + 0.6 * i / len(urls), desc=f"Summarizing {page_url}")
            page_content = get_page_content(page_url, markdowner_key)
            if use_hyperbolic:
                summary = generate_hyperbolic_summary(page_url, page_content, hyperbolic_key)
            else:
                summary = generate_groq_summary(page_url, page_content, groq_key)
            summaries.append({
                "url": page_url,
                "summary": summary,
                "fullContent": page_content
            })
            time.sleep(1)  # crude rate limiting between API calls

        llms_txt = generate_llms_txt(summaries)
        llms_full_txt = generate_llms_full_txt(summaries)
        return llms_txt, json.dumps(summaries, indent=2), urls, llms_full_txt
    except Exception as e:
        return f"Error: {str(e)}", None, [], ""


# NOTE: the Blocks layout down to download_txt() is a minimal sketch; the component
# names are fixed by the event wiring further down, labels and arrangement are assumptions.
with gr.Blocks(title="llms.txt Generator") as demo:
    gr.Markdown("# llms.txt Generator")

    url_input = gr.Textbox(label="Website URL")
    with gr.Row():
        hyperbolic_key = gr.Textbox(label="Hyperbolic API Key", type="password")
        groq_key = gr.Textbox(label="Groq API Key", type="password")
        markdowner_key = gr.Textbox(label="Markdowner API Key (optional)", type="password")
    use_hyperbolic = gr.Checkbox(label="Use Hyperbolic (uncheck to use Groq)", value=True)
    generate_btn = gr.Button("Generate llms.txt 🚀")

    llms_output = gr.Textbox(label="llms.txt", lines=10)
    llms_full_output = gr.Textbox(label="llms-full.txt", lines=10)
    json_output = gr.Textbox(label="Summaries (JSON)", lines=10)

    def download_txt(text: str, filename: str) -> str:
        """Convert text to downloadable format"""
        if not text:
            return None
        # Create a file with the proper name
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
        return filename

    download_btn = gr.File(
        label="Download llms.txt",
        visible=True,
        file_types=[".txt"]
    )
    download_full_btn = gr.File(
        label="Download llms-full.txt",
        visible=True,
        file_types=[".txt"]
    )

    download_trigger = gr.Button("Download llms.txt 📥")
    download_full_trigger = gr.Button("Download llms-full.txt 📥")

    download_trigger.click(
        fn=lambda x: download_txt(x, "llms.txt"),
        inputs=[llms_output],
        outputs=[download_btn]
    )
    download_full_trigger.click(
        fn=lambda x: download_txt(x, "llms-full.txt"),
        inputs=[llms_full_output],
        outputs=[download_full_btn]
    )

    # Clean up function to remove temporary files
    def cleanup():
        try:
            if os.path.exists("llms.txt"):
                os.remove("llms.txt")
            if os.path.exists("llms-full.txt"):
                os.remove("llms-full.txt")
        except:
            pass

    urls_found = gr.Dataframe(
        headers=["URLs Found"],
        label="Discovered URLs",
        visible=True
    )

    def process_and_update(*args):
        result, summaries, urls, full_result = process_website(*args)
        urls_df = pd.DataFrame({
            "URLs Found": urls if urls else ["No URLs found"]
        })
        # Clean up any existing temporary files
        cleanup()
        return {
            llms_output: result,
            llms_full_output: full_result,
            json_output: summaries if summaries else "",
            urls_found: urls_df,
            download_btn: None,
            download_full_btn: None
        }

    generate_btn.click(
        process_and_update,
        inputs=[url_input, hyperbolic_key, groq_key, markdowner_key, use_hyperbolic],
        outputs=[llms_output, llms_full_output, json_output, urls_found, download_btn, download_full_btn]
    )

if __name__ == "__main__":
    demo.launch()