import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def setup_driver():
    """Create a headless Chrome driver (Selenium 4.6+ resolves chromedriver automatically)."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    return webdriver.Chrome(options=options)


def process_selenium_row(index, selenium_rows, driver):
    """Extract dynamic article data by clicking on a trend row."""
    max_retries = 3
    for attempt in range(max_retries):
        try:
            # Optionally re-fetch the rows here if they go stale after a click:
            # selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
            row = selenium_rows[index]
            driver.execute_script("arguments[0].click();", row)  # JavaScript click for stability

            # Wait for the article links to load dynamically
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
            )

            # Collect the article links currently in the DOM. Note: this picks up
            # every loaded link, which may include articles from previously
            # expanded rows, not only the row just clicked.
            articles = driver.find_elements(By.CLASS_NAME, "xZCHj")

            return {
                "article": [
                    {"href": article.get_attribute("href"), "title": article.text}
                    for article in articles
                ]
            }

        except Exception as e:
            print(f"Error processing row {index} (attempt {attempt + 1}): {e}")
            time.sleep(1)  # Brief delay before retrying

    print(f"Failed to process row {index} after {max_retries} attempts.")
    return {"article": []}


def scrape_google_trends(driver, url):
    """Scrape a Google Trends page and return combined static and dynamic row data."""
    all_data = []

    try:
        driver.get(url)

        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
        )

        # Parse the static table content with BeautifulSoup ...
        soup = BeautifulSoup(driver.page_source, "html.parser")
        tables = soup.select('[jsname="cC57zf"]')

        for table in tables:
            rows_bs = table.find_all("tr")
            # ... and keep the matching Selenium elements for the clickable rows
            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')

            for index, row_bs in enumerate(rows_bs):
                static_data = [
                    [div.get_text(strip=True) for div in cell.find_all("div")]
                    for cell in row_bs.find_all("td")[1:4]
                ]
                dynamic_data = process_selenium_row(index, selenium_rows, driver)
                all_data.append({
                    "static_data": static_data,
                    "dynamic_data": dynamic_data,
                })

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        driver.quit()

    # Return whatever was collected, even if an error cut the run short
    return all_data


def crawl_url(url):
    """Main entry point, intended to be called from another script."""
    driver = setup_driver()
    return scrape_google_trends(driver, url)


if __name__ == "__main__":
    results = crawl_url("https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
    print(json.dumps(results, ensure_ascii=False, indent=2))
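
# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original script): calling
# crawl_url from another module and persisting the result, per the docstring
# on crawl_url. The module name "trends_scraper" and the output file name
# "trends.json" are hypothetical.
#
#   import json
#   from trends_scraper import crawl_url
#
#   rows = crawl_url("https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
#   with open("trends.json", "w", encoding="utf-8") as f:
#       json.dump(rows, f, ensure_ascii=False, indent=2)
# ---------------------------------------------------------------------------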