import os
from io import StringIO

import pandas as pd
import requests
import yaml


def internal_data(source):
    """
    Extract internal data from either the catalog or the query folder.

    :param source: str, 'catalog' or 'query'
    :return: pandas.DataFrame, dataframe containing product name and category name
    """
    if source not in ('catalog', 'query'):
        raise ValueError("source must be either 'catalog' or 'query'")
    # Read every Excel file in the folder and combine them into one dataframe
    dfs = []
    for file in os.listdir(source):
        if file.endswith('.xlsx'):
            dfs.append(pd.read_excel(os.path.join(source, file)))
    return pd.concat(dfs, ignore_index=True)
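

# A minimal usage sketch for internal_data, assuming 'catalog' and 'query'
# folders of .xlsx files sit next to this script (the folder contents are
# assumptions, not part of this module):
#
#   catalog_df = internal_data('catalog')
#   query_df = internal_data('query')

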
def registered_fertilizer_data():
    """
    Scrape registered fertilizer data from the Ministry of Agriculture website.

    :return: pandas.DataFrame, dataframe containing registered fertilizer data
    """
    # Scrape only when no cached copy exists in the "external" folder
    if os.listdir('external') == []:
        print('External folder is empty. Extracting data from Ministry of Agriculture website...')
        # Load the scraping URLs from config.yaml once, instead of re-reading
        # the file on every page request
        with open('config.yaml') as f:
            config = yaml.safe_load(f)
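        # Assumed shape of config.yaml, inferred from the lookups below; the
        # URLs are placeholders, not the real endpoints:
        #
        #   scraping_url:
        #     organik:
        #       - "https://<ministry-site>/organik?page="
        #     anorganik:
        #       - "https://<ministry-site>/anorganik?page="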
        print('Extracting Organic Fertilizer Data...')
        dfs1 = []
        # Scrape every table on every page: Organic
        i = 1
        while True:
            url = config['scraping_url']['organik'][0] + str(i)
            result = requests.get(url).text
            try:
                # Table 5 on the page holds the listing; keep the brand,
                # type, and registration number columns
                df = pd.read_html(StringIO(result))[5].iloc[2:-1, [2, 3, 6]].rename(columns={2: 'Merek', 3: 'Jenis', 6: 'Nomor Pendaftaran'})
                df['Page Number'] = i
                dfs1.append(df)
                i += 1
            except IndexError:
                # A page past the last one no longer contains the table
                break
        registered_organic_fertilizers = pd.concat(dfs1, ignore_index=True).dropna()
        print('Extracting Inorganic Fertilizer Data...')
        dfs2 = []
        # Scrape every table on every page: Inorganic
        i = 1
        while True:
            url = config['scraping_url']['anorganik'][0] + str(i)
            result = requests.get(url).text
            try:
                # Same table layout, but the relevant columns sit at positions 5-7
                df = pd.read_html(StringIO(result))[5].iloc[2:-1, 5:8].rename(columns={5: 'Merek', 6: 'Jenis', 7: 'Nomor Pendaftaran'})
                df['Page Number'] = i
                dfs2.append(df)
                i += 1
            except IndexError:
                break
        registered_inorganic_fertilizers = pd.concat(dfs2, ignore_index=True).dropna()
        # Combine both sources and build the full product name from type + brand
        registered_fertilizers = pd.concat([registered_organic_fertilizers, registered_inorganic_fertilizers], ignore_index=True)
        registered_fertilizers['Nama Lengkap'] = registered_fertilizers['Jenis'] + ' ' + registered_fertilizers['Merek']
        # Cache the result so later calls can read the CSV instead of re-scraping
        registered_fertilizers.to_csv('external/registered_fertilizers.csv', index=False)
        return registered_fertilizers
    else:
        # A cached copy already exists; read it instead of scraping again
        return pd.read_csv('external/registered_fertilizers.csv')
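

# Usage sketch: the first call scrapes and caches; later calls read the cache
# (assumes an 'external' folder already exists next to this script):
#
#   fertilizers = registered_fertilizer_data()
#   fertilizers[['Nama Lengkap', 'Nomor Pendaftaran']].head()

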
def scrape_result():
    """
    Extract scraped result data.

    :return: pandas.DataFrame, dataframe containing scraped result data
    """
    # Read every CSV file in the folder and combine them into one dataframe
    dfs = []
    for filename in os.listdir('scrape_result'):
        if filename.endswith('.csv'):
            dfs.append(pd.read_csv(os.path.join('scrape_result', filename)))
    return pd.concat(dfs, ignore_index=True)
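

# A minimal end-to-end sketch, assuming the expected folders ('catalog',
# 'query', 'external', 'scrape_result') and a config.yaml with the scraping
# URLs exist alongside this script:
if __name__ == '__main__':
    catalog = internal_data('catalog')
    registered = registered_fertilizer_data()
    scraped = scrape_result()
    print(catalog.shape, registered.shape, scraped.shape)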