Add1E committed
Commit fee7f34 · verified · 1 Parent(s): e66a983

Update trend_crawl2.py

Files changed (1): trend_crawl2.py (+91 -34)
trend_crawl2.py CHANGED
@@ -1,29 +1,24 @@
+from selenium.common.exceptions import ElementClickInterceptedException
+from bs4 import BeautifulSoup
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from selenium import webdriver
 from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service as ChromeService
-from selenium.common.exceptions import ElementClickInterceptedException
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from bs4 import BeautifulSoup
-from webdriver_manager.chrome import ChromeDriverManager
-import time
 import json
+import time

 # Configure Chrome options
-chrome_options = Options()
-chrome_options.add_argument("--headless") # Run in headless mode
-chrome_options.add_argument("--disable-gpu")
-
-
 def setup_driver():
-    options = webdriver.ChromeOptions()
-    options.add_argument('--headless')
+    options = Options()
+    options.add_argument("--headless")
+    options.add_argument("--disable-gpu")
     options.add_argument('--no-sandbox')
     options.add_argument('--disable-dev-shm-usage')
     options.add_argument("--lang=de")
-    wd = webdriver.Chrome(options=options)
-    return wd
+    return webdriver.Chrome(options=options)
+

 def click_and_scrape(driver, url):
     """Click each li element and scrape data."""
@@ -31,11 +26,33 @@ def click_and_scrape(driver, url):
     try:
         driver.get(url)

-        # Wait for the ul element to load
+        for attempt in range(4):
+            try:
+                button = WebDriverWait(driver, 20).until(
+                    EC.element_to_be_clickable((
+                        By.XPATH,
+                        "//button[@aria-label='Alle Kategorien, Kategorie auswählen']"
+                    ))
+                )
+                print("Button located.")
+
+                driver.execute_script("arguments[0].scrollIntoView();", button)
+                print(button.get_attribute("outerHTML"))
+
+
+                button.click()
+                print("Button clicked successfully.")
+                break
+            except ElementClickInterceptedException:
+                print(f"Attempt {attempt + 1}: Click intercepted. Retrying...")
+
         try:
-            # Wait for the ul element with the specific aria-label to load
+
             ul_element = WebDriverWait(driver, 20).until(
-                EC.presence_of_element_located((By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]"))
+                EC.presence_of_element_located((
+                    By.XPATH,
+                    "//ul[@aria-label='Kategorie']"
+                ))
             )
             li_elements = ul_element.find_elements(By.TAG_NAME, "li")
         except Exception as e:
@@ -43,9 +60,9 @@ def click_and_scrape(driver, url):
         selected_elements = [li_elements[2]] + li_elements[4:]
         for index, li in enumerate(selected_elements):
             try:
-                # Scroll each li element into view
+
                 driver.execute_script("arguments[0].scrollIntoView();", li)
-                # Click the <li> using JavaScript
+
                 driver.execute_script("arguments[0].click();", li)
                 print(f"Clicked LI {index} using JavaScript.")
                 time.sleep(2)
@@ -63,7 +80,6 @@ def click_and_scrape(driver, url):
             except Exception as e:
                 print(f"Error interacting with LI {index}: {e}")

-
     except Exception as e:
         print(f"Error during click and scrape: {e}")

@@ -79,17 +95,15 @@ def process_selenium_row(index, rows, driver):
     try:
         articles = {}

-        driver.execute_script("arguments[0].click();", rows[index]) # Use JavaScript click for stability
-
-        # Wait for the articles to load dynamically
+        driver.execute_script("arguments[0].click();", rows[index])
+
         WebDriverWait(driver, 10).until(
             EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
         )

-        # Fetch only the newly loaded articles
+
         articles = driver.find_elements(By.CLASS_NAME, "xZCHj")
         articles = articles[:3]
-        # Extract data from the current row only
         dynamic_data = {
             "article": [
                 {
@@ -143,16 +157,59 @@ def scrape_google_trends(driver):
         print(f"An error occurred during scraping: {e}")
         return []

-def crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT"):
-    """Main function to crawl dynamically and scrape Google Trends."""
+
+def process_li_element(index, li_data, url):
+    """Process a single li element."""
     driver = setup_driver()
-    results = click_and_scrape(driver,url)
-    return results
+    try:
+        driver.get(url)
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]"))
+        )
+
+        ul_element = driver.find_element(By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]")
+        li_elements = ul_element.find_elements(By.TAG_NAME, "li")
+        selected_li = li_elements[li_data['index']]
+
+        driver.execute_script("arguments[0].scrollIntoView();", selected_li)
+        driver.execute_script("arguments[0].click();", selected_li)
+        time.sleep(2)
+
+        span_content = selected_li.find_element(By.CLASS_NAME, "W7g1Rb-rymPhb-fpDzbe-fmcmS").get_attribute("innerText")
+        print(f"LI {li_data['index']} clicked: {span_content}")
+
+        data = scrape_google_trends(driver)
+        return {span_content: data}
+    except Exception as e:
+        print(f"Error processing LI {index}: {e}")
+        return {}
+    finally:
+        driver.quit()

-if __name__ == "__main__":
-    results = crawl_url()
+def crawl_url(url):
+    """Click each li element and scrape data in parallel."""
+    driver = setup_driver()
+    result_dict = {}
     try:
-        with open("results.json", "w", encoding="utf-8") as f:
-            json.dump(results, f, ensure_ascii=False, indent=4)
+        driver.get(url)
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]"))
+        )
+
+        ul_element = driver.find_element(By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]")
+        li_elements = ul_element.find_elements(By.TAG_NAME, "li")
+        selected_elements = [{"index": i} for i in range(2, len(li_elements)) if i != 3]
+
+        with ThreadPoolExecutor() as executor:
+            futures = [executor.submit(process_li_element, idx, li_data, url) for idx, li_data in enumerate(selected_elements)]
+            for future in as_completed(futures):
+                result = future.result()
+                result_dict.update(result)
     except Exception as e:
-        print(f"Error writing results to JSON: {e}")
+        print(f"Error during click and scrape: {e}")
+    finally:
+        driver.quit()
+
+    return result_dict
+
+
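With this change, crawl_url() no longer has a default URL and the old __main__ block that wrote results.json is gone, so callers must now supply the URL and handle persistence themselves. A minimal driver script could look like the sketch below; it assumes the file is importable as trend_crawl2 and simply reuses the Trends URL and the JSON-writing logic from the removed code.

    # Sketch of a caller for the new API (not part of this commit).
    import json

    from trend_crawl2 import crawl_url  # assumed module name

    if __name__ == "__main__":
        # URL taken from the default that was removed from the old crawl_url signature
        results = crawl_url("https://trends.google.com/trends/trendingsearches/daily?geo=AT")
        try:
            with open("results.json", "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=4)
        except Exception as e:
            print(f"Error writing results to JSON: {e}")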
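One thing to watch in the new crawl_url(): ThreadPoolExecutor() is created without max_workers, so Python defaults to min(32, os.cpu_count() + 4) threads, and each process_li_element() call launches its own headless Chrome via setup_driver(). If memory or CPU becomes a problem, capping the pool is a one-line change; the limit of 4 below is purely illustrative, not something this commit sets.

    # Hypothetical variant with a bounded pool of Chrome instances
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_li_element, idx, li_data, url)
                   for idx, li_data in enumerate(selected_elements)]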