enesmanan committed on
Commit
cf6e5a5
·
verified ·
1 Parent(s): 375fc31

Upload trendyol_scraper_origin.py

Browse files
Files changed (1) hide show
  1. trendyol_scraper_origin.py +120 -0
trendyol_scraper_origin.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+
4
+ import pandas as pd
5
+ from selenium import webdriver
6
+ from selenium.webdriver.chrome.service import Service as ChromeService
7
+ from selenium.webdriver.common.by import By
8
+ from selenium.webdriver.support import expected_conditions as EC
9
+ from selenium.webdriver.support.ui import WebDriverWait
10
+ from webdriver_manager.chrome import ChromeDriverManager
11
+
12
def scrape_comments(url):
    """Scrape the comments shown on a product comments page.

    Opens ``url`` in headless Chrome, accepts the cookie banner when one is
    shown, scrolls until the page height stops growing, and then reads the
    user name, comment text, date and star count of every comment node
    matched by the absolute XPath below.

    Args:
        url: Address of the comments page to open.

    Returns:
        A ``pandas.DataFrame`` with one row per comment and the columns
        ``Kullanıcı_id``, ``Kullanıcı Adı``, ``Yorum``, ``Tarih`` and
        ``Yıldız Sayısı`` (an empty frame when no comment node matched),
        or ``None`` when anything failed; the error is printed in that case.
    """
    # Create data directory if it doesn't exist
    data_directory = "data"
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Absolute XPath of the comment containers. Appending ``[i]`` addresses
    # the i-th comment (XPath positions are 1-based).
    comments_xpath = (
        "/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div"
    )

    def comprehensive_scroll(driver):
        """Scroll to the bottom until the document height stops changing."""
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # Wait for potential content loading

            # Calculate new scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")

            # Check if bottom has been reached
            if new_height == last_height:
                break

            last_height = new_height

    def text_or_na(driver, xpath):
        """Return the text of the element at ``xpath``, or "N/A" on failure."""
        try:
            return driver.find_element(By.XPATH, xpath).text
        except WebDriverException:
            return "N/A"

    # Bound before the ``try`` so the ``finally`` clause can tell whether a
    # browser was actually started; otherwise a failure while installing or
    # launching Chrome would surface as UnboundLocalError from ``finally``.
    driver = None
    try:
        # Local import: brings the exception types into scope without
        # touching the file's import header.
        from selenium.common.exceptions import TimeoutException, WebDriverException

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-notifications")
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

        service = ChromeService(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.maximize_window()

        driver.get(url)

        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
            ).click()
        except TimeoutException:
            # The cookie banner never became clickable (e.g. it was not
            # shown); the comments can still be read, so carry on.
            pass

        comprehensive_scroll(driver)

        total_comments = len(driver.find_elements(By.XPATH, comments_xpath))

        data = []
        for i in range(1, total_comments + 1):
            comment_root = f"{comments_xpath}[{i}]"

            username = text_or_na(driver, f"{comment_root}/div[1]/div[2]/div[1]")
            comment = text_or_na(driver, f"{comment_root}/div[2]/p")
            date = text_or_na(driver, f"{comment_root}/div[1]/div[2]/div[2]")

            star_xpath_base = f"{comment_root}/div[1]/div[1]/div"
            try:
                # Only stars whose fill is rendered at 100% width are counted.
                full_stars = driver.find_elements(
                    By.XPATH,
                    f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']",
                )
                star_count = len(full_stars)
            except WebDriverException:
                star_count = 0

            data.append(
                {
                    "Kullanıcı_id": i,
                    "Kullanıcı Adı": username,
                    "Yorum": comment,
                    "Tarih": date,
                    "Yıldız Sayısı": star_count,
                }
            )

        return pd.DataFrame(data)

    except Exception as e:
        print(f"Hata oluştu: {str(e)}")
        return None

    finally:
        if driver is not None:
            driver.quit()
114
+
115
if __name__ == "__main__":
    # Manual run against a sample comments page; reports how many comments
    # were collected when the scrape succeeded.
    url = "https://www.trendyol.com/apple/macbook-air-m1-cip-8gb-256gb-ssd-macos-13-qhd-tasinabilir-bilgisayar-uzay-grisi-p-68042136/yorumlar"
    df = scrape_comments(url)
    # scrape_comments returns None on failure; stay silent in that case.
    if df is not None:
        print(f"Toplam {len(df)} yorum çekildi.")