import requests import zipfile import io import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import os import altair as alt import streamlit as st import statsmodels.formula.api as smf from duckduckgo_search import DDGS st.title("Assignment 2: Building a Data Dashboard with Streamlit") st.markdown(""" **Kiva** is a non-profit organization that facilitates microfinancing for entrepreneurs and small businesses in low-income communities around the world. By providing a platform where individuals can lend small amounts of money to borrowers in developing regions, Kiva aims to expand financial inclusion and foster economic development. The dataset in question encompasses a broad range of variables related to Kiva loans. It includes information on the gender of the borrowers, the amounts of the loans, the number of lenders participating in each loan, and the duration of the loans. This comprehensive dataset allows us to conduct an in-depth analysis of various dimensions of Kiva’s microfinance operations. By examining these variables, we can explore patterns and trends in borrowing behavior, loan distribution, and the impact of microfinance on different demographic groups and regions. """) st.markdown("""We have the following research question that we aim to investigate and attempt to answer: Do men borrow more money than women?""") @st.cache_data # Cache the function to enhance performance def load_data(): # Define the file path zip_url_1= "https://github.com/aaubs/ds-master/raw/main/data/assignments_datasets/KIVA/kiva_loans_part_0.csv.zip" # Download the ZIP file response = requests.get(zip_url_1) response.raise_for_status() # Check if the request was successful # Open the ZIP file from the response content with zipfile.ZipFile(io.BytesIO(response.content)) as zf: # List all files in the ZIP print(zf.namelist()) # Read a specific CSV file from the ZIP df1 = pd.read_csv(zf.open('kiva_loans_part_0.csv')) return df1 # Load the data using the defined function df1 = load_data() @st.cache_data # Cache the function to enhance performance def load_data(): # Define the file path zip_url_2= "https://github.com/aaubs/ds-master/raw/main/data/assignments_datasets/KIVA/kiva_loans_part_1.csv.zip" # Download the ZIP file response = requests.get(zip_url_2) response.raise_for_status() # Check if the request was successful # Open the ZIP file from the response content with zipfile.ZipFile(io.BytesIO(response.content)) as zf: # List all files in the ZIP print(zf.namelist()) # Read a specific CSV file from the ZIP df2 = pd.read_csv(zf.open('kiva_loans_part_1.csv')) return df2 # Load the data using the defined function df2 = load_data() @st.cache_data # Cache the function to enhance performance def load_data(): # Define the file path zip_url_3= "https://github.com/aaubs/ds-master/raw/main/data/assignments_datasets/KIVA/kiva_loans_part_2.csv.zip" # Download the ZIP file response = requests.get(zip_url_3) response.raise_for_status() # Check if the request was successful # Open the ZIP file from the response content with zipfile.ZipFile(io.BytesIO(response.content)) as zf: # List all files in the ZIP print(zf.namelist()) # Read a specific CSV file from the ZIP df3 = pd.read_csv(zf.open('kiva_loans_part_2.csv')) return df3 # Load the data using the defined function df3 = load_data() data = pd.concat([df1, df2, df3]) data.drop(['tags'], axis = 'columns', inplace = True) data.dropna(inplace = True) valid_genders = ['male', 'female'] data = data[data['borrower_genders'].isin(valid_genders)] st.subheader("""Cleaning data""") st.markdown("""We have eliminated the column tags, as well as the associated tags, since they merely consisted of quotations such as “User favorite,” among others. Additionally, these columns contained a significant amount of missing data (NAs).""") st.text(f'We just saved {(len(data) / 671205) * 100} % of the data!') st.text(f'Number of remaining {len(data)} rows') st.subheader("Basic statistics for key variables") st.dataframe(data[['loan_amount','term_in_months','lender_count']].agg(['mean','var','min','median','max','sum'])) st.markdown("""How to interpret the data?""") results_stat = DDGS().chat( "You are an extremely good statician with lots of knowledge about statistics. " "Interpret the following statistic results: " + str(data[['loan_amount','term_in_months','lender_count']].agg(['mean','var','min','median','max','sum'])) +" summarize the results in a easy understanding way and with normal text", model='gpt-4o-mini') st.markdown(results_stat) st.markdown('Pick what to group by') selected1 = st.multiselect("Select variable1", ['loan_amount', 'term_in_months', 'lender_count']) st.markdown('Pick what statistic to inspect') selected2 = st.multiselect("Select statistic(s)", ['mean', 'var', 'min', 'median', 'max', 'sum', 'std']) st.markdown('Pick borrower genders to include') selected_genders = st.multiselect("Select borrower genders", ['male', 'female']) if selected1 and selected2 and selected_genders: filtered_data = data[data['borrower_genders'].isin(selected_genders)] st.table(filtered_data.groupby(['borrower_genders', 'sector'])[selected1].agg(selected2)) else: st.write("Please select at least one variable, one statistic, and at least one gender.") st.subheader("Visualizations") correlation_matrix = data[['loan_amount', 'term_in_months', 'lender_count']].corr(method='spearman') # Dropdown to select the type of visualization visualization_option = st.selectbox( "Select Visualization 🎨", ["Number of loans in sectors Distribution", "Loan Amount Distribution by Gender", "Loan Amount Distribution by Sector Type", "KDE Plot: Loan amount based on sectors", "Correlation Matrix of Loan amount, length of loan and amount of lenders"] ) # Visualizations based on user selection if visualization_option == "Number of loans in sectors Distribution": plt.figure(figsize=(12, 6)) # Number of loans in sectors Distribution sns.histplot(data['sector'], kde=True) plt.title('Number of loans in sectors Distribution') plt.xlabel('Sector') plt.ylabel('Frequency') plt.xticks(rotation=45) plt.show() st.pyplot(plt, use_container_width=True) elif visualization_option == "KDE Plot: Loan amount based on sectors": # KDE plot for Distance from Home based on Attrition sns.kdeplot(data = data, x = 'loan_amount', hue = 'sector', clip = (0,4000)) plt.title('KDE Plot: Loan amount based on sectors') st.pyplot(plt) elif visualization_option == "Loan Amount Distribution by Gender": # Bar chart for attrition by job role plt.figure(figsize=(12, 6)) sns.boxplot(x='borrower_genders', y='loan_amount', data=data, order=data['borrower_genders'].value_counts().index) plt.title('Loan Amount Distribution by Gender') plt.xlabel('Borrower Gender') plt.ylabel('Loan amount') plt.xticks(rotation=45) plt.ylim(0, 3000) st.pyplot(plt, use_container_width=True) elif visualization_option == "Loan Amount Distribution by Sector Type": plt.figure(figsize=(12, 6)) sns.boxplot(x='sector', y='loan_amount', data=data, order=data['sector'].value_counts().index) plt.title('Loan Amount Distribution by Sector Type') plt.xlabel('Sector') plt.ylabel('Loan amount') plt.xticks(rotation=45) plt.ylim(0, 12500) st.pyplot(plt, use_container_width=True) elif visualization_option == "Correlation Matrix of Loan amount, length of loan and amount of lenders": sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm').set_title('Correlation Matrix of Loan amount, length of loan and amount of lenders') st.pyplot(plt) st.subheader("Regression") data['gender_binary'] = data['borrower_genders'].apply(lambda x: 1 if x == 'male' else 0) model = smf.ols('loan_amount ~gender_binary+ lender_count+ term_in_months', data = data).fit() st.write(model.summary()) st.subheader("""We can conclude with 73% significans that men borrow more money than women.""") st.subheader("The world-known economist answering the OLS-regression") results = DDGS().chat( "You are an extremely good economist with lots of knowledge about econometrics. " "Interpret the following OLS results: " + str(model.summary()) + ". Specifically, answer if men borrow more money than women.", model='gpt-4o-mini') st.markdown(results)