Vladislawoo committed on
Commit
3c62199
·
1 Parent(s): 8492c80

Upload parsing (1).py

Browse files
Files changed (1) hide show
  1. parsing (1).py +74 -0
parsing (1).py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import requests
3
+ import pandas as pd
4
+ from tqdm import tqdm
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
def text(links):
    """Return the stripped text of the first element of ``links``.

    Args:
        links: An iterable of objects exposing a ``.text`` string attribute
            (for example the result of a BeautifulSoup ``find_all`` call).

    Returns:
        The ``.text`` of the first element with surrounding whitespace
        removed, or ``None`` when ``links`` yields no elements.
    """
    # Only the first element is ever consulted; returning from inside the
    # loop avoids referencing an unbound variable when ``links`` is empty.
    for elem in links:
        return elem.text.strip()
    return None
14
+
15
+
16
# Crawl the biblio-globus.ru catalogue: categories -> subcategories ->
# paginated product listings -> individual product pages, collecting one row
# per product and writing a CSV snapshot after each top-level category.

_BASE_URL = 'https://www.biblio-globus.ru'
# Without a timeout a stalled connection would block the crawl indefinitely.
_REQUEST_TIMEOUT = 30  # seconds
# Layout characters removed from annotation text before it is stored.
_ANNOTATION_JUNK = str.maketrans('', '', '\n\r\t\xa0')


def _get_soup(page_url):
    """Download ``page_url`` and return it parsed with the lxml parser."""
    response = requests.get(page_url, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'lxml')


def _clean_annotation(raw_text):
    """Strip layout characters and cut the text at the 'Характеристики' heading."""
    cleaned = raw_text.translate(_ANNOTATION_JUNK)
    return cleaned.split('Характеристики', 1)[0].strip()


def _parse_product(product_url):
    """Build one output row for a product page.

    Returns:
        ``[product_url, image, author, title, annotation, genre]``, or
        ``None`` when the page carries no parseable JSON-LD object.
        Individual fields missing from the JSON-LD are stored as ``None``
        instead of aborting the crawl.
    """
    product_soup = _get_soup(product_url)

    annotation_block = product_soup.find('div', id='collapseExample')
    if annotation_block is not None:
        annotation = _clean_annotation(annotation_block.text)
    else:
        annotation = None

    try:
        product_json = product_soup.find('script', type='application/ld+json')
        dict_json = json.loads(product_json.text)
    except (AttributeError, json.JSONDecodeError):
        # No JSON-LD script tag, or its content is not valid JSON.
        return None
    if not isinstance(dict_json, dict):
        return None

    author_info = dict_json.get('author')
    author = author_info.get('name') if isinstance(author_info, dict) else None
    title = dict_json.get('name')
    image = dict_json.get('image')
    genre = dict_json.get('genre')
    return [product_url, image, author, title, annotation, genre]


url = 'https://www.biblio-globus.ru/catalog/categories'
catalog_soup = _get_soup(url)
list_categories = catalog_soup.find_all('li', class_='list-group-item')

# Rows accumulate across all categories; ``df`` is never cleared, so every
# snapshot file contains everything collected up to that point.
df = []
columns = ['product_url', 'image', 'author', 'title', 'annotation', 'genre']

n = 1
for link in tqdm(list_categories):
    category_anchor = link.find('a')
    if category_anchor is None or not category_anchor.get('href'):
        continue  # list item without a usable link

    category_soup = _get_soup(_BASE_URL + category_anchor['href'])
    list_subcategories = category_soup.find_all('a', class_='product-preview-title')

    for sub in tqdm(list_subcategories):
        sub_href = sub.get('href')
        if not sub_href:
            continue
        subcategory_id = sub_href.split('/')[-1]

        # Walk the listing pages until one comes back without product entries.
        page = 1
        while True:
            subcategory_url = f'https://www.biblio-globus.ru/catalog/category?id={subcategory_id}&page={page}&sort=0'
            subcategory_soup = _get_soup(subcategory_url)
            subcategory_links = subcategory_soup.find_all('div', class_='text')
            if not subcategory_links:
                break

            for product in subcategory_links:
                product_anchor = product.find('a')
                if product_anchor is None or not product_anchor.get('href'):
                    continue
                row = _parse_product(_BASE_URL + product_anchor['href'])
                if row is not None:
                    df.append(row)
            page += 1

    # NOTE(review): snapshot placement (once per top-level category) is
    # reconstructed from the statement order — confirm against the original.
    data = pd.DataFrame(df, columns=columns)
    data.to_csv(f'data{n}.csv', index=False)
    n += 1