import os
import yaml
import requests
import pandas as pd

def internal_data(data_type):
    """
    Extract internal data from either the catalog or the query folder.

    :param data_type: str, 'catalog' or 'query'

    :return: pandas.DataFrame, dataframe containing product name and category name
    """
    if data_type not in ('catalog', 'query'):
        raise ValueError("data_type must be either 'catalog' or 'query'")

    # The folder name matches the data type; read and stack every Excel file in it.
    dfs = []
    for file in os.listdir(data_type):
        if file.endswith('.xlsx'):
            dfs.append(pd.read_excel(os.path.join(data_type, file)))
    return pd.concat(dfs, ignore_index=True)
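
# Usage sketch (assumes catalog/ and query/ each hold .xlsx files with the
# product-name and category-name columns described in the docstring):
#
#     catalog_df = internal_data('catalog')
#     query_df = internal_data('query')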

def registered_fertilizer_data():
    """
    Scrape registered fertilizer data from the Ministry of Agriculture website.

    :return: pandas.DataFrame, dataframe containing registered fertilizer data
    """
    # Scrape only when no cached copy exists in the "external" folder.
    if not os.listdir('external'):
        print('External folder is empty. Extracting data from Ministry of Agriculture website...')
        # Load the config once, instead of re-reading the file on every page.
        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        print('Extracting Organic Fertilizer Data...')
        dfs1 = []
        # Scrape every table in every page: Organic
        i = 1
        while True:
            url = config['scraping_url']['organik'][0] + str(i)
            result = requests.get(url).text
            try:
                df = pd.read_html(result)[5].iloc[2:-1, [2, 3, 6]].rename(columns={2: 'Merek', 3: 'Jenis', 6: 'Nomor Pendaftaran'})
                df['Page Number'] = i
                dfs1.append(df)
                i += 1
            except (IndexError, ValueError):
                # Past the last page the target table is missing, so stop paginating.
                break

        registered_organic_fertilizers = pd.concat(dfs1, ignore_index=True).dropna()

        print('Extracting Inorganic Fertilizer Data...')
        dfs2 = []
        # Scrape every table in every page: Inorganic
        i = 1
        while True:
            url = config['scraping_url']['anorganik'][0] + str(i)
            result = requests.get(url).text
            try:
                df = pd.read_html(result)[5].iloc[2:-1, 5:8].rename(columns={5: 'Merek', 6: 'Jenis', 7: 'Nomor Pendaftaran'})
                df['Page Number'] = i
                dfs2.append(df)
                i += 1
            except (IndexError, ValueError):
                break

        registered_inorganic_fertilizers = pd.concat(dfs2, ignore_index=True).dropna()

        registered_fertilizers = pd.concat([registered_organic_fertilizers, registered_inorganic_fertilizers], ignore_index=True)
        registered_fertilizers['Nama Lengkap'] = registered_fertilizers['Jenis'] + ' ' + registered_fertilizers['Merek']
        # Cache the result so later calls hit the CSV branch below instead of re-scraping.
        registered_fertilizers.to_csv('external/registered_fertilizers.csv', index=False)
        return registered_fertilizers

    else:
        return pd.read_csv('external/registered_fertilizers.csv')
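
# A minimal sketch of the config.yaml layout that registered_fertilizer_data()
# assumes: each key under scraping_url maps to a list whose first element is a
# paginated base URL that the page number is appended to. The URLs below are
# hypothetical placeholders, not the real Ministry of Agriculture endpoints.
#
# scraping_url:
#   organik:
#     - 'https://example.go.id/organik?page='    # placeholder
#   anorganik:
#     - 'https://example.go.id/anorganik?page='  # placeholder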
    
def scrape_result():
    """
    Extract scraped result data.

    :return: pandas.DataFrame, dataframe containing scraped result data
    """
    dfs = []

    for filename in os.listdir('scrape_result'):
        if filename.endswith('.csv'):
            dfs.append(pd.read_csv(os.path.join('scrape_result', filename)))

    # Combine every per-run CSV into a single dataframe
    return pd.concat(dfs, ignore_index=True)
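
# A minimal smoke-test sketch, assuming the catalog/, query/, external/, and
# scrape_result/ folders exist next to this module; it only prints shapes and
# is not part of the pipeline itself.
if __name__ == '__main__':
    frames = {
        'catalog': internal_data('catalog'),
        'query': internal_data('query'),
        'registered': registered_fertilizer_data(),
        'scrape_result': scrape_result(),
    }
    for name, df in frames.items():
        print(f'{name}: {df.shape[0]} rows x {df.shape[1]} columns')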