import os

import pandas as pd
import requests
import yaml


def internal_data(source):
    """
    Extract internal data from either the catalog or the query folder.

    :param source: str, 'catalog' or 'query'
    :return: pandas.DataFrame, dataframe containing product name and category name
    """
    if source not in ('catalog', 'query'):
        raise ValueError("source must be either 'catalog' or 'query'")

    # Combine every Excel file in the folder into a single dataframe.
    dfs = []
    for file in os.listdir(source):
        if file.endswith('.xlsx'):
            dfs.append(pd.read_excel(os.path.join(source, file)))
    return pd.concat(dfs, ignore_index=True)


def registered_fertilizer_data():
    """
    Scrape registered fertilizer data from the Ministry of Agriculture website.

    :return: pandas.DataFrame, dataframe containing registered fertilizer data
    """
    # If the "external" folder already holds a cached copy, read it instead
    # of scraping again.
    if os.listdir('external'):
        return pd.read_csv('external/registered_fertilizers.csv')

    print('External folder is empty. Extracting data from Ministry of Agriculture website...')

    # Load the scraping URLs once instead of re-reading the config file on
    # every page request.
    with open('config.yaml') as f:
        scraping_url = yaml.safe_load(f)['scraping_url']

    print('Extracting Organic Fertilizer Data...')
    dfs1 = []
    # Scrape every table on every page: organic. Pages are numbered from 1;
    # when a page no longer contains the expected table, read_html raises
    # IndexError and the loop stops.
    i = 1
    while True:
        url = scraping_url['organik'][0] + str(i)
        result = requests.get(url).text
        try:
            df = (pd.read_html(result)[5]
                  .iloc[2:-1, [2, 3, 6]]
                  .rename(columns={2: 'Merek', 3: 'Jenis', 6: 'Nomor Pendaftaran'}))
            df['Page Number'] = i
            dfs1.append(df)
            i += 1
        except IndexError:
            break
    registered_organic_fertilizers = pd.concat(dfs1, ignore_index=True).dropna()

    print('Extracting Inorganic Fertilizer Data...')
    dfs2 = []
    # Scrape every table on every page: inorganic.
    i = 1
    while True:
        url = scraping_url['anorganik'][0] + str(i)
        result = requests.get(url).text
        try:
            df = (pd.read_html(result)[5]
                  .iloc[2:-1, 5:8]
                  .rename(columns={5: 'Merek', 6: 'Jenis', 7: 'Nomor Pendaftaran'}))
            df['Page Number'] = i
            dfs2.append(df)
            i += 1
        except IndexError:
            break
    registered_inorganic_fertilizers = pd.concat(dfs2, ignore_index=True).dropna()

    registered_fertilizers = pd.concat(
        [registered_organic_fertilizers, registered_inorganic_fertilizers],
        ignore_index=True,
    )
    registered_fertilizers['Nama Lengkap'] = (
        registered_fertilizers['Jenis'] + ' ' + registered_fertilizers['Merek']
    )
    # Cache the result so subsequent calls hit the CSV branch above instead
    # of re-scraping the website.
    registered_fertilizers.to_csv('external/registered_fertilizers.csv', index=False)
    return registered_fertilizers


def scrape_result():
    """
    Extract scraped result data.

    :return: pandas.DataFrame, dataframe containing scraped result data
    """
    # Combine every CSV in the scrape_result folder into a single dataframe.
    dfs = []
    for filename in os.listdir('scrape_result'):
        if filename.endswith('.csv'):
            dfs.append(pd.read_csv(os.path.join('scrape_result', filename)))
    return pd.concat(dfs, ignore_index=True)
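

if __name__ == '__main__':
    # Minimal usage sketch of the three loaders above, not part of the
    # original pipeline. It assumes the 'catalog', 'query', 'external', and
    # 'scrape_result' folders exist next to this script with the file layouts
    # the functions expect, and that config.yaml provides the scraping URLs.
    catalog = internal_data('catalog')
    print(catalog.head())

    registered = registered_fertilizer_data()
    print(registered[['Nama Lengkap', 'Nomor Pendaftaran']].head())

    scraped = scrape_result()
    print(scraped.shape)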