Spaces:

tensora
/

webcrawler

Running

App Files Community

Add1E committed 29 days ago

Commit

fee7f34

verified ·

1 Parent(s): e66a983

Update trend_crawl2.py

Browse files

Files changed (1) hide show

trend_crawl2.py +91 -34

trend_crawl2.py CHANGED Viewed

@@ -1,29 +1,24 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service as ChromeService
-from selenium.common.exceptions import ElementClickInterceptedException
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from bs4 import BeautifulSoup
-from webdriver_manager.chrome import ChromeDriverManager
-import time
 import json
 # Configure Chrome options
-chrome_options = Options()
-chrome_options.add_argument("--headless")  # Run in headless mode
-chrome_options.add_argument("--disable-gpu")
 def setup_driver():
-    options = webdriver.ChromeOptions()
-    options.add_argument('--headless')
     options.add_argument('--no-sandbox')
     options.add_argument('--disable-dev-shm-usage')
     options.add_argument("--lang=de")
-    wd = webdriver.Chrome(options=options)
-    return wd
 def click_and_scrape(driver, url):
     """Click each li element and scrape data."""
@@ -31,11 +26,33 @@ def click_and_scrape(driver, url):
     try:
         driver.get(url)
-        # Wait for the ul element to load
         try:
-            # Wait for the ul element with the specific aria-label to load
             ul_element = WebDriverWait(driver, 20).until(
-                EC.presence_of_element_located((By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]"))
             )
             li_elements = ul_element.find_elements(By.TAG_NAME, "li")
         except Exception as e:
@@ -43,9 +60,9 @@ def click_and_scrape(driver, url):
         selected_elements = [li_elements[2]] + li_elements[4:]
         for index, li in enumerate(selected_elements):
                 try:
-                    # Scroll each li element into view
                     driver.execute_script("arguments[0].scrollIntoView();", li)
-                    # Click the <li> using JavaScript
                     driver.execute_script("arguments[0].click();", li)
                     print(f"Clicked LI {index} using JavaScript.")
                     time.sleep(2)
@@ -63,7 +80,6 @@ def click_and_scrape(driver, url):
                 except Exception as e:
                     print(f"Error interacting with LI {index}: {e}")
     except Exception as e:
         print(f"Error during click and scrape: {e}")
@@ -79,17 +95,15 @@ def process_selenium_row(index, rows, driver):
         try:
             articles = {}
-            driver.execute_script("arguments[0].click();", rows[index])  # Use JavaScript click for stability
-            # Wait for the articles to load dynamically
             WebDriverWait(driver, 10).until(
                 EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
             )
-            # Fetch only the newly loaded articles
             articles = driver.find_elements(By.CLASS_NAME, "xZCHj")
             articles = articles[:3]
-            # Extract data from the current row only
             dynamic_data = {
                 "article": [
                     {
@@ -143,16 +157,59 @@ def scrape_google_trends(driver):
         print(f"An error occurred during scraping: {e}")
         return []
-def crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT"):
-    """Main function to crawl dynamically and scrape Google Trends."""
     driver = setup_driver()
-    results = click_and_scrape(driver,url)
-    return results
-if __name__ == "__main__":
-    results = crawl_url()
     try:
-        with open("results.json", "w", encoding="utf-8") as f:
-            json.dump(results, f, ensure_ascii=False, indent=4)
     except Exception as e:
-        print(f"Error writing results to JSON: {e}")

+from selenium.common.exceptions import ElementClickInterceptedException
+from bs4 import BeautifulSoup
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 import json
+import time
 # Configure Chrome options
 def setup_driver():
+    options = Options()
+    options.add_argument("--headless")
+    options.add_argument("--disable-gpu")
     options.add_argument('--no-sandbox')
     options.add_argument('--disable-dev-shm-usage')
     options.add_argument("--lang=de")
+    return webdriver.Chrome(options=options)
 def click_and_scrape(driver, url):
     """Click each li element and scrape data."""
     try:
         driver.get(url)
+        for attempt in range(4):
+                try:
+                    button = WebDriverWait(driver, 20).until(
+                        EC.element_to_be_clickable((
+                            By.XPATH,
+                            "//button[@aria-label='Alle Kategorien, Kategorie auswählen']"
+                        ))
+                    )
+                    print("Button located.")
+                    driver.execute_script("arguments[0].scrollIntoView();", button)
+                    print(button.get_attribute("outerHTML"))
+                    button.click()
+                    print("Button clicked successfully.")
+                    break
+                except ElementClickInterceptedException:
+                    print(f"Attempt {attempt + 1}: Click intercepted. Retrying...")
         try:
             ul_element = WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((
+                    By.XPATH,
+                    "//ul[@aria-label='Kategorie']"
+                ))
             )
             li_elements = ul_element.find_elements(By.TAG_NAME, "li")
         except Exception as e:
         selected_elements = [li_elements[2]] + li_elements[4:]
         for index, li in enumerate(selected_elements):
                 try:
                     driver.execute_script("arguments[0].scrollIntoView();", li)
                     driver.execute_script("arguments[0].click();", li)
                     print(f"Clicked LI {index} using JavaScript.")
                     time.sleep(2)
                 except Exception as e:
                     print(f"Error interacting with LI {index}: {e}")
     except Exception as e:
         print(f"Error during click and scrape: {e}")
         try:
             articles = {}
+            driver.execute_script("arguments[0].click();", rows[index])
             WebDriverWait(driver, 10).until(
                 EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
             )
             articles = driver.find_elements(By.CLASS_NAME, "xZCHj")
             articles = articles[:3]
             dynamic_data = {
                 "article": [
                     {
         print(f"An error occurred during scraping: {e}")
         return []
+def process_li_element(index, li_data, url):
+    """Process a single li element."""
     driver = setup_driver()
+    try:
+        driver.get(url)
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]"))
+        )
+        ul_element = driver.find_element(By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]")
+        li_elements = ul_element.find_elements(By.TAG_NAME, "li")
+        selected_li = li_elements[li_data['index']]
+        driver.execute_script("arguments[0].scrollIntoView();", selected_li)
+        driver.execute_script("arguments[0].click();", selected_li)
+        time.sleep(2)
+        span_content = selected_li.find_element(By.CLASS_NAME, "W7g1Rb-rymPhb-fpDzbe-fmcmS").get_attribute("innerText")
+        print(f"LI {li_data['index']} clicked: {span_content}")
+        data = scrape_google_trends(driver)
+        return {span_content: data}
+    except Exception as e:
+        print(f"Error processing LI {index}: {e}")
+        return {}
+    finally:
+        driver.quit()
+def crawl_url(url):
+    """Click each li element and scrape data in parallel."""
+    driver = setup_driver()
+    result_dict = {}
     try:
+        driver.get(url)
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]"))
+        )
+        ul_element = driver.find_element(By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]")
+        li_elements = ul_element.find_elements(By.TAG_NAME, "li")
+        selected_elements = [{"index": i} for i in range(2, len(li_elements)) if i != 3]
+        with ThreadPoolExecutor() as executor:
+            futures = [executor.submit(process_li_element, idx, li_data, url) for idx, li_data in enumerate(selected_elements)]
+            for future in as_completed(futures):
+                result = future.result()
+                result_dict.update(result)
     except Exception as e:
+        print(f"Error during click and scrape: {e}")
+    finally:
+        driver.quit()
+    return result_dict