# app.py -- uploaded by prthgo (revision 2ab6e66)
import streamlit as st
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
from sklearn.preprocessing import (
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
MinMaxScaler,
)
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBRFRegressor, XGBRFClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import (
mean_absolute_error,
mean_squared_error,
mean_squared_error,
r2_score,
)
from sklearn.metrics import (
accuracy_score,
f1_score,
confusion_matrix,
precision_score,
recall_score,
)
import pickle
# --- Page and plotting configuration -------------------------------------
# Streamlit page metadata shown in the browser tab.
st.set_page_config(
    page_title="Tabular Data Analysis and Auto ML",
    page_icon="🤖",
)

# Global seaborn look: white background, "poster" context scaled down to 70%.
sns.set_style("white")
sns.set_context("poster", font_scale=0.7)

# Shared colour palette (hex codes) used by every chart in this module.
palette = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563",
    "#918450", "#f85e00", "#a41623", "#9a031e", "#d6d6d6",
    "#ffee32", "#ffd100", "#333533", "#202020",
]
def _page_title(text):
    """Render ``text`` as a centered top-level page heading."""
    st.markdown(
        f"<h1 style='text-align: center;'>{text}</h1>", unsafe_allow_html=True
    )


def _bools_to_int(data):
    """Convert boolean columns of ``data`` to 0/1 integers, in place."""
    bool_columns = data.select_dtypes(include=bool).columns
    if len(bool_columns):
        data[bool_columns] = data[bool_columns].astype(int)


def _render_basic_eda(data):
    """Show the overview, dtypes, missing data, value counts and statistics."""
    _page_title("Basic EDA")
    st.subheader("Data Overview")
    st.write(data_overview(data))
    st.write(duplicate(data))
    st.dataframe(data.head())
    st.subheader("Data Types and Unique Value Counts")
    display_data_info(data)
    st.subheader("Missing Data")
    missing_data(data)
    st.subheader("Value Counts")
    value_counts(data)
    st.subheader("Descriptive Statistics")
    st.write(data.describe().T)


def _render_univariate(data):
    """Let the user pick a single-column chart and draw it."""
    _page_title("Univariate Analysis")
    categorical = list(data.select_dtypes("O"))
    numeric = list(data.select_dtypes(include=["int", "float"]))
    # chart label -> (drawing function, columns the chart is offered for)
    charts = {
        "Count Plot": (countplot, categorical),
        "Pie Chart": (piechart, categorical),
        "Histogram": (histogram, numeric),
        "Violin Plot": (violinplot, numeric),
        "Scatter Plot": (scatterplot, numeric),
    }
    plot = st.radio("Select a chart: ", tuple(charts))
    draw, candidates = charts[plot]
    column = st.selectbox("Select a column", [""] + candidates)
    if column:
        draw(data, column)


def _render_bivariate(data):
    """Let the user pick a two-column chart (or a Pareto chart) and draw it."""
    _page_title("Bivariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
    )
    if plot == "Pareto Chart":
        column = st.selectbox(
            "Select a columns",
            [""] + list(data.select_dtypes(include="object")),
        )
        if column:
            paretoplot(data, column)
        return

    if plot == "Scatter Plot":
        draw = biscatterplot
        candidates = list(data.select_dtypes(include=["int", "float"]))
    else:
        draw = bibarplot if plot == "Bar Plot" else biboxplot
        candidates = list(data.columns)
    # No "" placeholder here: a multiselect starts empty on its own, and a
    # selected "" would be looked up as a (non-existent) column name.
    columns = st.multiselect("Select two columns", candidates)
    if len(columns) >= 2:
        draw(data, columns)
    elif columns:
        st.info("Select one more column to draw the chart.")


def _drop_columns(data):
    """Drop user-selected columns once the button is pressed."""
    columns = st.multiselect("Select Columns to drop: ", (data.columns))
    if st.button("Drop Columns"):
        data = data.drop(columns=columns)
        st.success("Dropped selected columns✅✅✅")
    return data


def _handle_missing_values(data):
    """Apply the chosen missing-value strategies and return the result."""
    num_missing = st.selectbox(
        "Select a Approach (Numerical columns only): ",
        ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
    ).lower()
    cat_missing = st.selectbox(
        "Select a Approach (Categorical columns only): ",
        ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
    ).lower()
    if not st.button("Handle Missing Values"):
        return data

    # Work on a copy so the session's frame is replaced only once, by the
    # caller, after every step has succeeded.
    data = data.copy()
    if num_missing:
        num_cols = data.select_dtypes(include=["int64", "float64"]).columns
        if len(num_cols):
            if num_missing == "drop":
                data = data.dropna(subset=num_cols)
            elif num_missing == "mean":
                data[num_cols] = data[num_cols].fillna(data[num_cols].mean())
            elif num_missing == "median":
                data[num_cols] = data[num_cols].fillna(data[num_cols].median())
            elif num_missing == "backward fill":
                data[num_cols] = data[num_cols].bfill()
            elif num_missing == "forward fill":
                data[num_cols] = data[num_cols].ffill()
        st.success(
            "Imputed missing values in numerical columns with selected approach."
        )
    if cat_missing:
        cat_cols = data.select_dtypes(exclude=["int", "float"]).columns
        if len(cat_cols):
            if cat_missing == "drop":
                data = data.dropna(subset=cat_cols)
            elif cat_missing == "most frequent values":
                modes = data[cat_cols].mode()
                # mode() has no rows when every value is missing.
                if not modes.empty:
                    data[cat_cols] = data[cat_cols].fillna(modes.iloc[0])
            elif cat_missing == "replace with 'unknown'":
                data[cat_cols] = data[cat_cols].fillna("Unknown")
        st.success(
            "Imputed missing values in categorical columns with selected approach."
        )
    return data


def _encode_categoricals(data):
    """Ordinal-encode the chosen columns and one-hot encode the other objects."""
    oe_columns = st.multiselect(
        "Choose Columns for Ordinal Encoding",
        list(data.select_dtypes(include="object")),
    )
    st.info("Other columns will be One Hot Encoded.")
    if not st.button("Encode Columns"):
        return data

    data = data.copy()
    _bools_to_int(data)
    if oe_columns:
        data[oe_columns] = OrdinalEncoder().fit_transform(
            data[oe_columns].astype("str")
        )
    remaining_cat_cols = [
        col
        for col in data.select_dtypes(include="object")
        if col not in oe_columns
    ]
    if remaining_cat_cols:
        data = pd.get_dummies(data, columns=remaining_cat_cols, drop_first=False)
    st.success("Encoded categorical columns")
    # get_dummies may emit boolean indicator columns; keep everything numeric.
    _bools_to_int(data)
    return data


def _render_preprocess(data):
    """Run one preprocessing step, persist the result and offer a CSV download."""
    _page_title("Data Preprocessing")
    operation = st.radio(
        "Select preprocessing step: ",
        (
            "Drop Columns",
            "Handling Missing Values",
            "Encode Categorical Features",
        ),
    )
    if operation == "Drop Columns":
        data = _drop_columns(data)
    elif operation == "Handling Missing Values":
        data = _handle_missing_values(data)
    elif operation == "Encode Categorical Features":
        data = _encode_categoricals(data)

    # Persist whatever the step produced so later pages see it.
    st.session_state["data"] = data
    if st.download_button(
        label="Download Preprocessed Data",
        key="preprocessed_data",
        on_click=None,
        data=data.to_csv(index=False).encode(),
        file_name="preprocessed_data.csv",
        mime="text/csv",
    ):
        st.success('Data Downloaded')


def _balance_training_set(data, target, X_train, y_train):
    """Optionally resample the training split with the user's chosen sampler."""
    balance = st.selectbox("Do you want to balance dataset?", ("", "Yes", "No"))
    if balance != "Yes":
        return X_train, y_train
    piechart(data, target)
    sample = st.selectbox(
        "Which approach you want to use?",
        ("", "Random Under Sampling", "Random Over Sampling", "SMOTE"),
    )
    samplers = {
        "Random Under Sampling": RandomUnderSampler,
        "Random Over Sampling": RandomOverSampler,
        "SMOTE": SMOTE,
    }
    if sample in samplers:
        return samplers[sample](random_state=42).fit_resample(X_train, y_train)
    return X_train, y_train


def _scale_features(X_train, X_test):
    """Optionally scale features; the scaler is fitted on the training split only."""
    scale = st.selectbox(
        "Choose how do you want to scale features:",
        ("", "Standard Scaler", "Min Max Scaler"),
    )
    scalers = {"Standard Scaler": StandardScaler, "Min Max Scaler": MinMaxScaler}
    if scale not in scalers:
        return X_train, X_test
    scaler = scalers[scale]()
    return scaler.fit_transform(X_train), scaler.transform(X_test)


def _offer_model_download(model, file_name):
    """Offer the pickled model for download; on click also save it on the server."""
    payload = pickle.dumps(model)
    if st.download_button(
        label="Download Trained Model",
        key="trained_model",
        on_click=None,
        data=payload,
        file_name=file_name,
        mime="application/octet-stream",
    ):
        with open(file_name, "wb") as model_file:
            model_file.write(payload)


def _train_regressor(X_train, X_test, y_train, y_test):
    """Train the selected regressor and report MAE, MSE, RMSE and R²."""
    # label -> (estimator factory, download file name)
    models = {
        "Ridge Regression": (
            lambda: Ridge(alpha=1.0),
            "ridge_regression_model.pkl",
        ),
        "Decision Tree Regressor": (
            lambda: DecisionTreeRegressor(max_depth=10),
            "decision_tree_regression_model.pkl",
        ),
        "Random Forest Regressor": (
            lambda: RandomForestRegressor(max_depth=10, n_estimators=100),
            "random_forest_regression_model.pkl",
        ),
        "SVR": (lambda: SVR(C=1.0, epsilon=0.2), "svr_model.pkl"),
        "XGBRF Regressor": (
            lambda: XGBRFRegressor(reg_lambda=1),
            "xgbrf_regression_model.pkl",
        ),
        "LGBM Regressor": (
            lambda: LGBMRegressor(reg_lambda=1),
            "lgbm_regression_model.pkl",
        ),
    }
    model = st.selectbox(
        "Choose Regression Model for training: ", ("",) + tuple(models)
    )
    if model not in models:
        return
    make_model, file_name = models[model]
    reg = make_model()
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred); R² is not symmetric.
    mse = mean_squared_error(y_test, pred)
    st.write(
        "Mean Absolute Error (MAE): {:.4f}".format(
            mean_absolute_error(y_test, pred)
        )
    )
    st.write("Mean Squared Error (MSE): {:.4f}".format(mse))
    # sqrt(MSE) instead of the deprecated ``squared=False`` keyword.
    st.write("Root Mean Squared Error (RMSE): {:.4f}".format(np.sqrt(mse)))
    st.write("R-squared (R²): {:.4f}".format(r2_score(y_test, pred)))
    _offer_model_download(reg, file_name)


def _train_classifier(X_train, X_test, y_train, y_test):
    """Train the selected classifier and report scores plus a confusion matrix."""
    # label -> (estimator factory, confusion-matrix title, download file name)
    models = {
        "Logistic Regression": (
            lambda: LogisticRegression(penalty="l2"),
            "Logistic Regression Confusion Matrix ",
            "logistic_regression_model.pkl",
        ),
        "Decision Tree Classifier": (
            lambda: DecisionTreeClassifier(max_depth=5),
            "DecisionTree Classifier Confusion Matrix ",
            "decision_tree_classifier_model.pkl",
        ),
        "Random Forest Classifier": (
            lambda: RandomForestClassifier(n_estimators=100, max_depth=5),
            "RandomForest Classifier Confusion Matrix ",
            "random_forest_classifier_model.pkl",
        ),
        "SVC": (lambda: SVC(C=1.5), "SVC Confusion Matrix ", "svc_model.pkl"),
        "XGBRF Classifier": (
            lambda: XGBRFClassifier(reg_lambda=1.0),
            "XGBRF Classifier Confusion Matrix ",
            "xgbrf_classifier_model.pkl",
        ),
        "LGBM Classifier": (
            lambda: LGBMClassifier(reg_lambda=1.0),
            "LGBM Classifier Confusion Matrix ",
            "lgbm_classifier_model.pkl",
        ),
    }
    model = st.selectbox(
        "Choose Classification Model for training: ", ("",) + tuple(models)
    )
    if model not in models:
        return
    make_model, cm_title, file_name = models[model]
    clf = make_model()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    st.write("Accuracy Score: {:.4f}".format(accuracy_score(y_test, pred)))
    try:
        # Binary averaging; scikit-learn raises ValueError for other targets.
        scores = [
            ("F1 Score", f1_score(y_test, pred)),
            ("Precision Score", precision_score(y_test, pred)),
            ("Recall Score", recall_score(y_test, pred)),
        ]
    except ValueError:
        scores = [
            ("Macro Precision Score",
             precision_score(y_test, pred, average='macro')),
            ("Macro Recall Score",
             recall_score(y_test, pred, average='macro')),
            ("Macro F1 Score", f1_score(y_test, pred, average='macro')),
        ]
    for label, value in scores:
        st.write("{}: {:.4f}".format(label, value))
    # plot_confusion_matrix expects (y_true, y_pred, title).
    plot_confusion_matrix(y_test, pred, cm_title)
    _offer_model_download(clf, file_name)


def _render_training(data):
    """Split the data, optionally balance/scale it, then train and evaluate."""
    _page_title("Training and Evaluation")
    algo = st.selectbox(
        "Choose Algorithm Type:", ("", "Regression", "Classification")
    )
    if not algo:
        return
    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    if target is None:
        # Nothing to select from (frame without columns).
        st.write("The dataset has no columns to choose a target from.")
        return
    X = data.drop(target, axis=1)
    Y = data[target]
    st.write(
        "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )
    if algo == "Classification":
        X_train, y_train = _balance_training_set(data, target, X_train, y_train)
    X_train, X_test = _scale_features(X_train, X_test)
    if algo == "Regression":
        _train_regressor(X_train, X_test, y_train, y_test)
    else:
        _train_classifier(X_train, X_test, y_train, y_test)


def main():
    """Run the Streamlit app.

    The sidebar holds the CSV uploader, a "Process" button and the page
    selector. A processed upload is stored in ``st.session_state["data"]``
    and the selected page (EDA, univariate/bivariate charts, preprocessing,
    or training and evaluation) is rendered from it. Until a file has been
    processed only the welcome banner is shown.
    """
    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process")
    pages = {
        "Basic EDA": _render_basic_eda,
        "Univariate Analysis": _render_univariate,
        "Bivariate Analysis": _render_bivariate,
        "Preprocess": _render_preprocess,
        "Training and Evaluation": _render_training,
    }
    option = st.sidebar.radio("Select an Option: ", tuple(pages))

    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>Welcome to Tabular Data Analysis and Auto ML🤖</h1>",
        unsafe_allow_html=True
    )
    if file is not None and process:
        st.session_state["data"] = load_csv(file)
    if "data" not in st.session_state:
        return

    # Data is available: replace the welcome banner with the chosen page.
    placeholder.empty()
    pages[option](st.session_state["data"])
def load_csv(file):
    """Parse ``file`` (a path or file-like object) as CSV into a DataFrame."""
    return pd.read_csv(file)
def data_overview(data):
    """Write the row count to the page and return the column-count sentence."""
    n_rows, n_cols = data.shape
    st.write(f"Number of Rows: {n_rows}")
    columns_line = f"Number of Columns: {n_cols}"
    return columns_line
def missing_data(data):
    """Render an HTML table of per-column missing counts and percentages.

    Only columns that contain at least one missing value are listed.
    Returns whatever ``st.markdown`` returns.
    """
    na_counts = data.isna().sum()
    na_counts = na_counts[na_counts > 0]

    # Share of rows that are missing, formatted like "12.5%".
    na_share = (na_counts / data.shape[0]) * 100
    na_share_text = na_share.round(2).astype(str) + "%"

    summary = pd.DataFrame(
        {"Missing Values": na_counts, "Percentage": na_share_text}
    )
    summary_html = summary.to_html(
        classes="table table-striped", justify="center"
    )
    return st.markdown(summary_html, unsafe_allow_html=True)
def display_data_info(data):
    """Render an HTML table with each column's dtype and number of unique values.

    Returns whatever ``st.markdown`` returns.
    """
    # One two-column frame per statistic, both keyed by the column name.
    type_table = data.dtypes.to_frame("Data Type").reset_index()
    type_table.columns = ["Column", "Data Type"]

    unique_table = data.nunique().to_frame("Unique Counts").reset_index()
    unique_table.columns = ["Column", "Unique Counts"]

    info_html = pd.merge(type_table, unique_table, on="Column").to_html(
        classes="table table-striped", justify="center"
    )
    return st.markdown(info_html, unsafe_allow_html=True)
def value_counts(data):
    """Show the value counts of a column chosen from a select box."""
    choices = [""] + list(data.columns)
    chosen = st.selectbox("Select a Column", choices)
    # The leading "" entry means "nothing selected yet".
    if not chosen:
        return
    st.write(data[chosen].value_counts())
def duplicate(data):
    """Report duplicate rows and drop them from ``data`` in place.

    Returns an empty string when duplicates were found (the report is written
    directly to the page), otherwise a sentence saying there are none.
    """
    dup_mask = data.duplicated()
    if not dup_mask.any():
        return "There are no duplicate rows in the DataFrame."

    st.write(
        f"There is/are {dup_mask.sum()} duplicate rows in the DataFrame. Duplicated values will be dropped."
    )
    # Keep the first occurrence of each duplicated row; mutates the caller's frame.
    data.drop_duplicates(keep="first", inplace=True)
    return ""
def countplot(data, col):
    """Draw a horizontal count plot of column ``col`` and show it on the page.

    The figure is created explicitly, handed to ``st.pyplot`` and closed
    afterwards so figures do not pile up across Streamlit reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.countplot(
            y=data[col],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Countplot of {col} Column")
        st.pyplot(fig)
    finally:
        plt.close(fig)
def piechart(data, col):
    """Draw a pie chart of the value frequencies of column ``col``.

    The figure is created explicitly, handed to ``st.pyplot`` and closed
    afterwards so figures do not pile up across Streamlit reruns.
    """
    # Named ``counts`` so it does not shadow the module-level value_counts().
    counts = data[col].value_counts()
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.pie(
            counts,
            labels=counts.index,
            autopct="%1.1f%%",
            colors=palette,
            shadow=False,
            wedgeprops=dict(edgecolor="#1c1c1c"),
        )
        ax.set_title(f"Pie Chart of {col} Column")
        st.pyplot(fig)
    finally:
        plt.close(fig)
def histogram(data, col):
    """Draw a histogram with a KDE overlay for column ``col``.

    The figure is created explicitly, handed to ``st.pyplot`` and closed
    afterwards so figures do not pile up across Streamlit reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(
            data[col],
            kde=True,
            color=palette[4],
            fill=True,
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Histogram of {col} Column")
        st.pyplot(fig)
    finally:
        plt.close(fig)
def violinplot(data, col):
    """Draw a violin plot of column ``col`` and show it on the page.

    The figure is created explicitly, handed to ``st.pyplot`` and closed
    afterwards so figures do not pile up across Streamlit reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.violinplot(data[col], color=palette[8], ax=ax)
        ax.set_title(f"Violin Plot of {col} Column")
        st.pyplot(fig)
    finally:
        plt.close(fig)
def scatterplot(data, col):
    """Draw a scatter plot of the single column ``col`` and show it on the page.

    The figure is created explicitly, handed to ``st.pyplot`` and closed
    afterwards so figures do not pile up across Streamlit reruns.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(data[col], color=palette[3], ax=ax)
        ax.set_title(f"Scatter Plot of {col} Column")
        st.pyplot(fig)
    finally:
        plt.close(fig)
def biscatterplot(data, cols):
    """Draw a scatter plot of ``cols[0]`` (x) against ``cols[1]`` (y).

    ``cols`` must hold at least two column names; extra entries are ignored.
    Plotting errors are written to the page instead of being raised, and the
    figure is always closed so figures do not pile up across reruns.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw a scatter plot.")
        return
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Scatter Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # Best effort at the UI boundary: show the problem, keep the app alive.
        st.write(str(e))
    finally:
        plt.close(fig)
def bibarplot(data, cols):
    """Draw a bar plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    ``cols`` must hold at least two column names; extra entries are ignored.
    Plotting errors are written to the page instead of being raised, and the
    figure is always closed so figures do not pile up across reruns.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw a bar plot.")
        return
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.barplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Bar Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # Best effort at the UI boundary: show the problem, keep the app alive.
        st.write(str(e))
    finally:
        plt.close(fig)
def biboxplot(data, cols):
    """Draw a box plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    ``cols`` must hold at least two column names; extra entries are ignored.
    Plotting errors are written to the page instead of being raised, and the
    figure is always closed so figures do not pile up across reruns.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw a box plot.")
        return
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.boxplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Box Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # Best effort at the UI boundary: show the problem, keep the app alive.
        st.write(str(e))
    finally:
        plt.close(fig)
def paretoplot(data, categorical_col):
    """Draw a Pareto chart for ``categorical_col``.

    Bars show category frequencies in descending order (left axis); the line
    shows the cumulative percentage (right axis). Errors are written to the
    page, as the other two-variable plots do, rather than silently dropped.
    """
    fig = None
    try:
        # value_counts() is already sorted by descending frequency, so the
        # cumulative share below is computed in the same order as the bars.
        counts = data[categorical_col].value_counts()
        cumulative_percentage = (counts / counts.sum()).cumsum()
        pareto_df = pd.DataFrame(
            {
                "Categories": counts.index,
                "Frequency": counts.values,
                "Cumulative Percentage": cumulative_percentage.values * 100,
            }
        )

        fig, ax1 = plt.subplots(figsize=(10, 8))
        ax1.bar(
            pareto_df["Categories"],
            pareto_df["Frequency"],
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        # Second y-axis sharing the x-axis, formatted as percentages.
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            pareto_df["Categories"],
            pareto_df["Cumulative Percentage"],
            color=palette[3],
            marker="D",
            ms=10,
        )
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)
    except Exception as e:
        # Best effort at the UI boundary: show the problem, keep the app alive.
        st.write(str(e))
    finally:
        if fig is not None:
            plt.close(fig)
def plot_confusion_matrix(y_true, y_pred, title):
    """Draw the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap.

    Rows are true labels and columns are predicted labels, matching the
    axis captions. The figure is closed after rendering so figures do not
    pile up across Streamlit reruns.
    """
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        ax.set_title(title)
        st.pyplot(fig)
    finally:
        plt.close(fig)
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()