Spaces:
Sleeping
Sleeping
Commit
·
3c62199
1
Parent(s):
8492c80
Upload parsing (1).py
Browse files- parsing (1).py +74 -0
parsing (1).py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import requests
|
3 |
+
import pandas as pd
|
4 |
+
from tqdm import tqdm
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
|
7 |
+
|
8 |
+
def text(links):
    """Return the stripped text of the first element of *links*.

    Parameters:
        links: iterable of objects exposing a ``.text`` attribute
               (e.g. BeautifulSoup tags from ``find_all``).

    Returns:
        ``elem.text.strip()`` for the first element, or ``None`` when
        *links* is empty (the original raised ``NameError`` because the
        loop body never bound ``result``).
    """
    for elem in links:
        return elem.text.strip()
    return None
|
14 |
+
|
15 |
+
|
16 |
+
# Crawl the biblio-globus.ru catalogue: every category -> subcategory ->
# paginated product listing; collect per-product metadata and dump one
# cumulative CSV per category (data1.csv, data2.csv, ...).
url = 'https://www.biblio-globus.ru/catalog/categories'
catalog = requests.get(url, timeout=30)  # timeout so a dead server can't hang the crawl
catalog_soup = BeautifulSoup(catalog.text, 'lxml')
list_categories = catalog_soup.find_all('li', class_='list-group-item')

# Accumulated rows. NOTE(review): never reset between categories, so each
# dataN.csv also repeats all earlier categories' rows — confirm intended.
df = []
columns = ['product_url', 'image', 'author', 'title', 'annotation', 'genre']

n = 1  # suffix of the per-category output file
for link in tqdm(list_categories):

    category_url = 'https://www.biblio-globus.ru' + link.find('a')['href']
    category_page = requests.get(category_url, timeout=30)
    category_soup = BeautifulSoup(category_page.text, 'lxml')
    list_subcategories = category_soup.find_all('a', class_='product-preview-title')

    for sub in tqdm(list_subcategories):

        # Last path segment of the subcategory link is its numeric id.
        subcategory_id = sub['href'].split('/')[-1]

        page = 1
        while True:
            # Paginate until the server returns a page with no product cards.
            subcategiry_url = f'https://www.biblio-globus.ru/catalog/category?id={subcategory_id}&page={page}&sort=0'
            subcategiry_page = requests.get(subcategiry_url, timeout=30)
            subcategiry_soup = BeautifulSoup(subcategiry_page.text, 'lxml')
            subcategiry_links = subcategiry_soup.find_all('div', class_='text')
            if not subcategiry_links:
                break

            for product in subcategiry_links:
                product_url = 'https://www.biblio-globus.ru' + product.find('a')['href']
                product_page = requests.get(product_url, timeout=30)
                product_soup = BeautifulSoup(product_page.text, 'lxml')
                product_annotation = product_soup.find('div', id='collapseExample')
                if product_annotation:
                    # Drop whitespace/control chars. NOTE(review): the
                    # original also removes every literal 'm' character —
                    # kept byte-for-byte, but this looks like a bug; verify.
                    annotation = ''.join([symbol for symbol in product_annotation.text if symbol not in ['\n', '\r', '\t', 'm', '\xa0']])
                    # Keep only the prose before the "Характеристики" table.
                    annotation = annotation.split('Характеристики', 1)[0]
                    annotation = annotation.strip()
                else:
                    annotation = None

                try:
                    product_json = product_soup.find('script', type='application/ld+json')
                    dict_json = json.loads(product_json.text)
                except (AttributeError, json.JSONDecodeError):
                    # No structured-data block on this page — skip the product.
                    continue

                # Optional schema.org fields: use .get() so one record
                # missing 'author'/'genre' etc. can't KeyError and abort
                # the whole crawl (the original indexed them directly).
                author = dict_json.get('author', {}).get('name')
                title = dict_json.get('name')
                image = dict_json.get('image')
                genre = dict_json.get('genre')
                df.append([product_url, image, author, title, annotation, genre])
            page += 1

    # One CSV snapshot per top-level category.
    data = pd.DataFrame(df, columns=columns)
    data.to_csv(f'data{n}.csv', index=False)
    n += 1
|