patrickacraig committed
Commit 7e89065 · 1 Parent(s): c2b627a

adding error handling

Files changed (1)
  1. app.py +56 -40
app.py CHANGED
@@ -37,55 +37,71 @@ def map_website(app, url):
         return []

 async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress(), cancel_event=None):
-    app = get_firecrawl_app(api_key)
-    urls = map_website(app, base_url)
-    if not urls:
-        return "No URLs found. Please check if the base URL is correct.", None
-
-    parsed_url = urlparse(base_url)
-    domain = parsed_url.netloc.replace("www.", "")
-    os.makedirs('scraped_documentation', exist_ok=True)
-    output_file = os.path.join('scraped_documentation', f"{domain}.md")
-
-    with open(output_file, 'w', encoding='utf-8') as md_file:
-        for i, url in enumerate(progress.tqdm(urls)):
-            if cancel_event and cancel_event.is_set():
-                return "Scraping cancelled.", None
-            progress(i / len(urls), f"Scraping {url}")
-            markdown_content = await async_scrape_url(app, url)
-            md_file.write(f"# {url}\n\n")
-            md_file.write(markdown_content)
-            md_file.write("\n\n---\n\n")
-            if limit_rate and (i + 1) % 10 == 0:
-                time.sleep(60)
-
-    return f"Scraping completed. Output saved to {output_file}", output_file
+    try:
+        app = get_firecrawl_app(api_key)
+        urls = map_website(app, base_url)
+        if not urls:
+            return "No URLs found. Please check if the base URL is correct.", None
+
+        parsed_url = urlparse(base_url)
+        domain = parsed_url.netloc.replace("www.", "")
+        os.makedirs('scraped_documentation', exist_ok=True)
+        output_file = os.path.join('scraped_documentation', f"{domain}.md")
+
+        with open(output_file, 'w', encoding='utf-8') as md_file:
+            for i, url in enumerate(progress.tqdm(urls)):
+                if cancel_event and cancel_event.is_set():
+                    return "Scraping cancelled.", None
+                progress(i / len(urls), f"Scraping {url}")
+                markdown_content = await async_scrape_url(app, url)
+                md_file.write(f"# {url}\n\n")
+                md_file.write(markdown_content)
+                md_file.write("\n\n---\n\n")
+                if limit_rate and (i + 1) % 10 == 0:
+                    time.sleep(60)
+
+        return f"Scraping completed. Output saved to {output_file}", output_file
+    except Exception as e:
+        print(f"Error during scraping process: {e}")
+        return f"Error during scraping process: {e}", None

 def count_urls(base_url, api_key):
-    if not api_key:
-        return "Please enter your Firecrawl API key first."
-    app = get_firecrawl_app(api_key)
-    urls = map_website(app, base_url)
-    if urls:
-        return f"{len(urls)} URLs found. Do you want to proceed with scraping?"
-    else:
-        return "No URLs found. Please check the base URL or API key."
+    try:
+        if not api_key:
+            return "Please enter your Firecrawl API key first."
+        app = get_firecrawl_app(api_key)
+        urls = map_website(app, base_url)
+        if urls:
+            return f"{len(urls)} URLs found. Do you want to proceed with scraping?"
+        else:
+            return "No URLs found. Please check the base URL or API key."
+    except Exception as e:
+        print(f"Error counting URLs: {e}")
+        return f"Error counting URLs: {e}"

 async def gradio_scrape(base_url, api_key, limit_rate, progress=gr.Progress()):
-    if not api_key:
-        return "Please enter your Firecrawl API key.", None
-    if not base_url:
-        return "Please enter a base URL to scrape.", None
-    cancel_event = asyncio.Event()
-    result, file_path = await scrape_all_urls(base_url, api_key, limit_rate, progress, cancel_event)
-    return result, file_path
+    try:
+        if not api_key:
+            return "Please enter your Firecrawl API key.", None
+        if not base_url:
+            return "Please enter a base URL to scrape.", None
+        cancel_event = asyncio.Event()
+        result, file_path = await scrape_all_urls(base_url, api_key, limit_rate, progress, cancel_event)
+        return result, file_path
+    except Exception as e:
+        print(f"Error in gradio_scrape: {e}")
+        return f"Error in gradio_scrape: {e}", None

 def cancel_scrape():
-    # This function will be called when the cancel button is clicked
-    global cancel_event
-    if cancel_event:
-        cancel_event.set()
-    return "Cancelling scrape operation..."
+    try:
+        # This function will be called when the cancel button is clicked
+        global cancel_event
+        if cancel_event:
+            cancel_event.set()
+        return "Cancelling scrape operation..."
+    except Exception as e:
+        print(f"Error cancelling scrape: {e}")
+        return f"Error cancelling scrape: {e}"

 with gr.Blocks() as iface:
     gr.Markdown("# Docs Scraper")
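
The hunk ends where the Blocks layout begins, so the UI wiring that calls these handlers is not shown. For orientation, a minimal, hypothetical sketch of how count_urls, gradio_scrape, and cancel_scrape could be hooked up inside gr.Blocks; component names, labels, and layout are assumptions, not the actual contents of app.py:

    # Hypothetical wiring sketch -- not part of this commit; names and layout are assumed.
    import gradio as gr

    with gr.Blocks() as iface:
        gr.Markdown("# Docs Scraper")
        base_url = gr.Textbox(label="Base URL")
        api_key = gr.Textbox(label="Firecrawl API Key", type="password")
        limit_rate = gr.Checkbox(label="Limit rate (pause after every 10 URLs)", value=True)
        status = gr.Textbox(label="Status")
        output_file = gr.File(label="Scraped Markdown")

        count_btn = gr.Button("Count URLs")
        scrape_btn = gr.Button("Scrape")
        cancel_btn = gr.Button("Cancel")

        # Each callback's return values map onto the listed output components,
        # which is why the handlers above return (message, file_path) tuples or a single message.
        count_btn.click(count_urls, inputs=[base_url, api_key], outputs=status)
        scrape_btn.click(gradio_scrape, inputs=[base_url, api_key, limit_rate],
                         outputs=[status, output_file])
        cancel_btn.click(cancel_scrape, outputs=status)

    iface.launch()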