Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -8,24 +8,21 @@ from urllib.parse import unquote, urlparse
|
|
8 |
|
9 |
app = FastAPI()
|
10 |
|
|
|
11 |
def convert_cookies_to_dict(cookies):
|
12 |
-
|
13 |
-
|
|
|
14 |
|
|
|
15 |
def get_root_domain(url):
    """Return the root (last two labels) of the host named in *url*.

    The host is taken from ``urlparse(url).hostname`` so that a port or
    ``user:password@`` prefix never ends up in the result; the value is
    lower-cased by ``urlparse``. IP-address hosts are returned whole,
    because "last two labels" is meaningless for them.

    Note: the root is assumed to be the last two dot-separated labels, so
    multi-label public suffixes are not recognised
    (``www.example.co.uk`` -> ``co.uk``).

    Args:
        url: Absolute URL string, e.g. ``"https://www.example.com/path"``.

    Returns:
        The root domain, the whole host when it has a single label or is
        an IP address, or ``""`` when the URL carries no host.
    """
    import ipaddress  # local import: only this function needs it

    # A trailing dot (fully-qualified form) would otherwise yield "com.".
    host = (urlparse(url).hostname or "").rstrip(".")

    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass  # not an IP literal; treat it as a DNS name below
    else:
        return host

    parts = host.split(".")
    if len(parts) > 1:
        return ".".join(parts[-2:])
    # Single-label host (e.g. "localhost") or empty host.
    return host
|
30 |
|
31 |
@app.get("/")
|
@@ -60,39 +57,49 @@ def chrome(url:str=None,wait:int=5,header:str=None,cookie:str=None):
|
|
60 |
# 如果输入了cookie
|
61 |
if type(cookie) == str:
|
62 |
header_array.update({"cookie":unquote(cookie)})
|
63 |
-
|
|
|
64 |
options = Options()
|
|
|
|
|
65 |
options.add_argument('--headless')
|
66 |
-
|
|
|
67 |
driver = webdriver.Chrome(options=options)
|
68 |
|
|
|
69 |
driver.get(target_url)
|
70 |
|
|
|
71 |
if 'cookie' in header_array:
|
72 |
cookie_array = convert_cookies_to_dict(header_array['cookie'])
|
73 |
del header_array['cookie']
|
74 |
for key, value in cookie_array.items():
|
75 |
driver.add_cookie({"name": key, "value": value, "domain": f'.{target_domain}', "path": "/", "secure": False})
|
76 |
-
|
|
|
77 |
driver.header_overrides = header_array
|
78 |
-
|
|
|
79 |
driver.get(target_url)
|
80 |
|
|
|
81 |
print(driver.page_source)
|
82 |
|
|
|
83 |
if wait_time > 0:
|
84 |
time.sleep(wait_time)
|
85 |
|
86 |
-
#
|
87 |
current_url = driver.current_url
|
88 |
|
89 |
-
#
|
90 |
page_source = driver.page_source
|
91 |
|
92 |
-
#
|
93 |
cookies = driver.get_cookies()
|
94 |
|
95 |
-
#
|
96 |
is_jump = (target_url != current_url)
|
97 |
|
98 |
data = {
|
|
|
8 |
|
9 |
app = FastAPI()
|
10 |
|
11 |
+
# 解析cookie字符串为字典
|
12 |
def convert_cookies_to_dict(cookies):
    """Parse a ``Cookie`` header string into a ``{name: value}`` dict.

    Pairs are separated by ``;`` with or without a following space.
    Surrounding whitespace is stripped from names and values, and a value
    may itself contain ``=`` (only the first one splits the pair). A pair
    with no ``=`` maps its name to an empty string. Empty segments, such
    as those produced by a trailing ``;``, are ignored.

    Args:
        cookies: Cookie header text, e.g. ``"a=1; b=2"``. An empty or
            ``None`` value yields an empty dict.

    Returns:
        A dict mapping cookie names to values; a later duplicate name
        overrides an earlier one.
    """
    parsed_cookies = {}
    if not cookies:
        return parsed_cookies

    for item in cookies.split(";"):
        item = item.strip()
        if not item:
            # Skip blanks left by "a=1; " or ";;" so no empty-named
            # cookie is produced.
            continue
        name, _, value = item.partition("=")
        parsed_cookies[name.strip()] = value.strip()
    return parsed_cookies
|
16 |
|
17 |
+
# 获取域名字符串的根域
|
18 |
def get_root_domain(url):
    """Return the root (last two labels) of the host named in *url*.

    The host is taken from ``urlparse(url).hostname`` so that a port or
    ``user:password@`` prefix never ends up in the result; the value is
    lower-cased by ``urlparse``. IP-address hosts are returned whole,
    because "last two labels" is meaningless for them.

    Note: the root is assumed to be the last two dot-separated labels, so
    multi-label public suffixes are not recognised
    (``www.example.co.uk`` -> ``co.uk``).

    Args:
        url: Absolute URL string, e.g. ``"https://www.example.com/path"``.

    Returns:
        The root domain, the whole host when it has a single label or is
        an IP address, or ``""`` when the URL carries no host.
    """
    import ipaddress  # local import: only this function needs it

    # A trailing dot (fully-qualified form) would otherwise yield "com.".
    host = (urlparse(url).hostname or "").rstrip(".")

    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass  # not an IP literal; treat it as a DNS name below
    else:
        return host

    parts = host.split(".")
    if len(parts) > 1:
        return ".".join(parts[-2:])
    # Single-label host (e.g. "localhost") or empty host.
    return host
|
27 |
|
28 |
@app.get("/")
|
|
|
57 |
# 如果输入了cookie
|
58 |
if type(cookie) == str:
|
59 |
header_array.update({"cookie":unquote(cookie)})
|
60 |
+
|
61 |
+
# 初始化浏览器
|
62 |
options = Options()
|
63 |
+
|
64 |
+
# 设置为无头模式
|
65 |
options.add_argument('--headless')
|
66 |
+
|
67 |
+
# 实例化
|
68 |
driver = webdriver.Chrome(options=options)
|
69 |
|
70 |
+
# 需要打开网址页面,才能用 driver.add_cookie 进行cookie追加
|
71 |
driver.get(target_url)
|
72 |
|
73 |
+
# 对浏览器追加指定域名的cookie
|
74 |
if 'cookie' in header_array:
|
75 |
cookie_array = convert_cookies_to_dict(header_array['cookie'])
|
76 |
del header_array['cookie']
|
77 |
for key, value in cookie_array.items():
|
78 |
driver.add_cookie({"name": key, "value": value, "domain": f'.{target_domain}', "path": "/", "secure": False})
|
79 |
+
|
80 |
+
# 覆写下次访问的请求头(没有修改的则保持原样)
|
81 |
driver.header_overrides = header_array
|
82 |
+
|
83 |
+
# 再次访问网址
|
84 |
driver.get(target_url)
|
85 |
|
86 |
+
# 输出此时访问的网页源码
|
87 |
print(driver.page_source)
|
88 |
|
89 |
+
# 等待多少秒,来预估网页完全的加载完成(执行完内部的所有js,因为部分js可能涉及到请求后的动态处理,或者延时跳转)
|
90 |
if wait_time > 0:
|
91 |
time.sleep(wait_time)
|
92 |
|
93 |
+
# 获取完全加载完成时,页面的URL
|
94 |
current_url = driver.current_url
|
95 |
|
96 |
+
# 获取完全加载完成时,页面的源代码
|
97 |
page_source = driver.page_source
|
98 |
|
99 |
+
# 获取完全加载完成时,页面的cookie
|
100 |
cookies = driver.get_cookies()
|
101 |
|
102 |
+
# 完全加载完成时,页面是否有发生过 301 302 跳转过
|
103 |
is_jump = (target_url != current_url)
|
104 |
|
105 |
data = {
|