|
import streamlit as st |
|
import numpy as np |
|
import pandas as pd |
|
import io |
|
import matplotlib.pyplot as plt |
|
from matplotlib.ticker import PercentFormatter |
|
import seaborn as sns |
|
from sklearn.preprocessing import ( |
|
OneHotEncoder, |
|
OrdinalEncoder, |
|
StandardScaler, |
|
MinMaxScaler, |
|
) |
|
from sklearn.model_selection import train_test_split |
|
from imblearn.under_sampling import RandomUnderSampler |
|
from imblearn.over_sampling import RandomOverSampler, SMOTE |
|
from sklearn.linear_model import Ridge, Lasso, LogisticRegression |
|
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier |
|
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier |
|
from sklearn.svm import SVR, SVC |
|
from sklearn.naive_bayes import MultinomialNB |
|
from xgboost import XGBRFRegressor, XGBRFClassifier |
|
from lightgbm import LGBMRegressor, LGBMClassifier |
|
from sklearn.metrics import ( |
|
mean_absolute_error, |
|
mean_squared_error, |
|
mean_squared_error, |
|
r2_score, |
|
) |
|
from sklearn.metrics import ( |
|
accuracy_score, |
|
f1_score, |
|
confusion_matrix, |
|
precision_score, |
|
recall_score, |
|
) |
|
import pickle |
|
|
|
# Browser-tab title and icon for the app.
st.set_page_config(page_icon="🤖", page_title="Tabular Data Analysis and Auto ML")

# Global seaborn appearance applied to every chart drawn below.
sns.set_style("white")
sns.set_context("poster", font_scale=0.7)

# Hex colours shared by the plotting helpers (indexed and sliced by position).
palette = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563", "#918450", "#f85e00",
    "#a41623", "#9a031e", "#d6d6d6", "#ffee32", "#ffd100", "#333533", "#202020",
]
|
|
|
|
|
def main():
    """Entry point of the Streamlit app.

    Draws the sidebar (CSV uploader, "Process" button, page selector), stores
    the uploaded frame in ``st.session_state["data"]`` when "Process" is
    pressed, and then renders the page chosen in the sidebar. Nothing but the
    welcome banner is shown until a dataset has been loaded.
    """
    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process")
    option = st.sidebar.radio(
        "Select an Option: ",
        (
            "Basic EDA",
            "Univariate Analysis",
            "Bivariate Analysis",
            "Preprocess",
            "Training and Evaluation",
        ),
    )
    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>Welcome to Tabular Data Analysis and Auto ML🤖</h1>",
        unsafe_allow_html=True,
    )

    if file is not None and process:
        st.session_state["data"] = load_csv(file)

    if "data" not in st.session_state:
        return

    data = st.session_state["data"]
    placeholder.empty()

    # Built here (not at module level) so the helpers are resolved at call time.
    pages = {
        "Basic EDA": _render_basic_eda,
        "Univariate Analysis": _render_univariate,
        "Bivariate Analysis": _render_bivariate,
        "Preprocess": _render_preprocess,
        "Training and Evaluation": _render_training,
    }
    pages[option](data)


# Regression models offered in the UI: display name -> (factory, pickle file name).
# Factories are lambdas so that only the selected estimator is instantiated.
_REGRESSORS = {
    "Ridge Regression": (lambda: Ridge(alpha=1.0), "ridge_regression_model.pkl"),
    "Decision Tree Regressor": (
        lambda: DecisionTreeRegressor(max_depth=10),
        "decision_tree_regression_model.pkl",
    ),
    "Random Forest Regressor": (
        lambda: RandomForestRegressor(max_depth=10, n_estimators=100),
        "random_forest_regression_model.pkl",
    ),
    "SVR": (lambda: SVR(C=1.0, epsilon=0.2), "svr_model.pkl"),
    "XGBRF Regressor": (
        lambda: XGBRFRegressor(reg_lambda=1),
        "xgbrf_regression_model.pkl",
    ),
    "LGBM Regressor": (
        lambda: LGBMRegressor(reg_lambda=1),
        "lgbm_regression_model.pkl",
    ),
}

# Classification models offered in the UI:
# display name -> (factory, pickle file name, confusion-matrix title).
_CLASSIFIERS = {
    "Logistic Regression": (
        lambda: LogisticRegression(penalty="l2"),
        "logistic_regression_model.pkl",
        "Logistic Regression Confusion Matrix ",
    ),
    "Decision Tree Classifier": (
        lambda: DecisionTreeClassifier(max_depth=5),
        "decision_tree_classifier_model.pkl",
        "DecisionTree Classifier Confusion Matrix ",
    ),
    "Random Forest Classifier": (
        lambda: RandomForestClassifier(n_estimators=100, max_depth=5),
        "random_forest_classifier_model.pkl",
        "RandomForest Classifier Confusion Matrix ",
    ),
    "SVC": (lambda: SVC(C=1.5), "svc_model.pkl", "SVC Confusion Matrix "),
    "XGBRF Classifier": (
        lambda: XGBRFClassifier(reg_lambda=1.0),
        "xgbrf_classifier_model.pkl",
        "XGBRF Classifier Confusion Matrix ",
    ),
    "LGBM Classifier": (
        lambda: LGBMClassifier(reg_lambda=1.0),
        "lgbm_classifier_model.pkl",
        "LGBM Classifier Confusion Matrix ",
    ),
}


def _centered_title(text):
    """Render ``text`` as a centred <h1> page heading."""
    st.markdown(
        f"<h1 style='text-align: center;'>{text}</h1>", unsafe_allow_html=True
    )


def _categorical_columns(data):
    """Return the names of the object-dtype columns of ``data``."""
    return list(data.select_dtypes(include="object"))


def _numeric_columns(data):
    """Return the names of the numeric columns of ``data`` (any int/float width)."""
    return list(data.select_dtypes(include="number"))


def _render_basic_eda(data):
    """Render the "Basic EDA" page: shape, duplicates, dtypes, missing data, stats."""
    _centered_title("Basic EDA")

    st.subheader("Data Overview")
    st.write(data_overview(data))
    st.write(duplicate(data))
    st.dataframe(data.head())

    st.subheader("Data Types and Unique Value Counts")
    display_data_info(data)

    st.subheader("Missing Data")
    missing_data(data)

    st.subheader("Value Counts")
    value_counts(data)

    st.subheader("Descriptive Statistics")
    st.write(data.describe().T)


def _render_univariate(data):
    """Render the "Univariate Analysis" page: one chart of one selected column."""
    _centered_title("Univariate Analysis")

    # chart name -> (which columns may be chosen, plotting function)
    charts = {
        "Count Plot": (_categorical_columns, countplot),
        "Pie Chart": (_categorical_columns, piechart),
        "Histogram": (_numeric_columns, histogram),
        "Violin Plot": (_numeric_columns, violinplot),
        "Scatter Plot": (_numeric_columns, scatterplot),
    }
    plot = st.radio("Select a chart: ", tuple(charts))
    list_columns, draw = charts[plot]

    # The leading "" entry means "nothing selected yet".
    column = st.selectbox("Select a column", [""] + list_columns(data))
    if column:
        draw(data, column)


def _render_bivariate(data):
    """Render the "Bivariate Analysis" page: one chart relating two columns."""
    _centered_title("Bivariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
    )

    if plot == "Pareto Chart":
        column = st.selectbox("Select a columns", [""] + _categorical_columns(data))
        if column:
            paretoplot(data, column)
        return

    # chart name -> (selectable columns, plotting function)
    charts = {
        "Scatter Plot": (_numeric_columns(data), biscatterplot),
        "Bar Plot": (list(data.columns), bibarplot),
        "Box Plot": (list(data.columns), biboxplot),
    }
    options, draw = charts[plot]

    # No "" placeholder here: in a multiselect it would be a selectable
    # (and invalid) column name.
    columns = st.multiselect("Select two columns", options)
    if len(columns) >= 2:
        draw(data, columns[:2])
    elif columns:
        st.info("Select a second column to draw the chart.")


def _drop_columns(data):
    """Let the user pick columns and drop them; return the resulting frame."""
    columns = st.multiselect("Select Columns to drop: ", (data.columns))
    if st.button("Drop Columns"):
        data = data.drop(columns=columns)
        st.success("Dropped selected columns✅✅✅")
    return data


def _handle_missing_values(data):
    """Impute or drop missing values with the strategies chosen in the UI.

    Numeric and non-numeric columns are handled independently; an empty
    selection leaves that group untouched. Returns the resulting frame
    (unchanged if the button was not pressed).
    """
    num_missing = st.selectbox(
        "Select a Approach (Numerical columns only): ",
        ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
    ).lower()
    cat_missing = st.selectbox(
        "Select a Approach (Categorical columns only): ",
        ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
    ).lower()

    if not st.button("Handle Missing Values"):
        return data

    if num_missing:
        num_cols = data.select_dtypes(include="number").columns
        numeric_fillers = {
            "mean": lambda frame: frame.fillna(frame.mean()),
            "median": lambda frame: frame.fillna(frame.median()),
            "backward fill": lambda frame: frame.bfill(),
            "forward fill": lambda frame: frame.ffill(),
        }
        if num_missing == "drop":
            data = data.dropna(subset=num_cols)
        elif len(num_cols):
            data[num_cols] = numeric_fillers[num_missing](data[num_cols])
        st.success(
            "Imputed missing values in numerical columns with selected approach."
        )

    if cat_missing:
        # Complement of the numeric selector above, so every column belongs
        # to exactly one of the two groups.
        cat_cols = data.select_dtypes(exclude="number").columns
        if cat_missing == "drop":
            data = data.dropna(subset=cat_cols)
        elif cat_missing == "most frequent values" and len(cat_cols):
            modes = data[cat_cols].mode()
            # mode() is empty when every categorical value is missing.
            if not modes.empty:
                data[cat_cols] = data[cat_cols].fillna(modes.iloc[0])
        elif cat_missing == "replace with 'unknown'" and len(cat_cols):
            data[cat_cols] = data[cat_cols].fillna("Unknown")
        st.success(
            "Imputed missing values in categorical columns with selected approach."
        )

    return data


def _encode_categorical_features(data):
    """Ordinal-encode the chosen object columns and one-hot encode the rest."""
    oe_columns = st.multiselect(
        "Choose Columns for Ordinal Encoding", _categorical_columns(data)
    )
    st.info("Other columns will be One Hot Encoded.")

    if not st.button("Encode Columns"):
        return data

    if oe_columns:
        data[oe_columns] = OrdinalEncoder().fit_transform(
            data[oe_columns].astype("str")
        )

    remaining_cat_cols = [
        col for col in _categorical_columns(data) if col not in oe_columns
    ]
    if remaining_cat_cols:
        data = pd.get_dummies(data, columns=remaining_cat_cols, drop_first=False)

    st.success("Encoded categorical columns")
    return data


def _render_preprocess(data):
    """Render the "Preprocess" page, persist the result and offer it as CSV."""
    _centered_title("Data Preprocessing")

    operation = st.radio(
        "Select preprocessing step: ",
        (
            "Drop Columns",
            "Handling Missing Values",
            "Encode Categorical Features",
        ),
    )
    steps = {
        "Drop Columns": _drop_columns,
        "Handling Missing Values": _handle_missing_values,
        "Encode Categorical Features": _encode_categorical_features,
    }
    data = steps[operation](data)

    # Boolean columns (e.g. produced by get_dummies) are stored as 0/1 integers.
    bool_columns = data.select_dtypes(include=bool).columns
    if len(bool_columns):
        data[bool_columns] = data[bool_columns].astype(int)
    st.session_state["data"] = data

    if st.download_button(
        label="Download Preprocessed Data",
        key="preprocessed_data",
        data=data.to_csv(index=False).encode(),
        file_name="preprocessed_data.csv",
        mime="text/csv",
    ):
        st.success('Data Downloaded')


def _balance_training_set(data, target, X_train, y_train):
    """Optionally resample the training split to balance the target classes.

    Only the training split is resampled; the test split is never touched.
    """
    balance = st.selectbox("Do you want to balance dataset?", ("", "Yes", "No"))
    if balance != "Yes":
        return X_train, y_train

    piechart(data, target)
    sample = st.selectbox(
        "Which approach you want to use?",
        ("", "Random Under Sampling", "Random Over Sampling", "SMOTE"),
    )
    samplers = {
        "Random Under Sampling": RandomUnderSampler,
        "Random Over Sampling": RandomOverSampler,
        "SMOTE": SMOTE,
    }
    if sample in samplers:
        sampler = samplers[sample](random_state=42)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
    return X_train, y_train


def _scale_features(X_train, X_test):
    """Optionally scale features; the scaler is fitted on the training split only."""
    scale = st.selectbox(
        "Choose how do you want to scale features:",
        ("", "Standard Scaler", "Min Max Scaler"),
    )
    scalers = {"Standard Scaler": StandardScaler, "Min Max Scaler": MinMaxScaler}
    if scale in scalers:
        scaler = scalers[scale]()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    return X_train, X_test


def _report_regression_metrics(y_true, y_pred):
    """Write MAE, MSE, RMSE and R² for a regression prediction."""
    mse = mean_squared_error(y_true, y_pred)
    st.write(f"Mean Absolute Error (MAE): {mean_absolute_error(y_true, y_pred):.4f}")
    st.write(f"Mean Squared Error (MSE): {mse:.4f}")
    # RMSE derived from MSE: the ``squared=False`` flag of mean_squared_error
    # is deprecated/removed in recent scikit-learn releases.
    st.write(f"Root Mean Squared Error (RMSE): {np.sqrt(mse):.4f}")
    # r2_score is not symmetric: ground truth must be the first argument.
    st.write(f"R-squared (R²): {r2_score(y_true, y_pred):.4f}")


def _report_classification_metrics(y_true, y_pred, title):
    """Write accuracy, F1, precision and recall, then draw the confusion matrix.

    Binary scores are attempted first; if scikit-learn rejects them with a
    ValueError, macro-averaged scores are reported instead.
    """
    st.write(f"Accuracy Score: {accuracy_score(y_true, y_pred):.4f}")

    # Precision and recall are asymmetric, so (y_true, y_pred) order matters.
    try:
        scores = [
            ("F1 Score", f1_score(y_true, y_pred)),
            ("Precision Score", precision_score(y_true, y_pred)),
            ("Recall Score", recall_score(y_true, y_pred)),
        ]
    except ValueError:
        scores = [
            ("Macro Precision Score", precision_score(y_true, y_pred, average='macro')),
            ("Macro Recall Score", recall_score(y_true, y_pred, average='macro')),
            ("Macro F1 Score", f1_score(y_true, y_pred, average='macro')),
        ]
    for label, value in scores:
        st.write(f"{label}: {value:.4f}")

    plot_confusion_matrix(y_true, y_pred, title)


def _offer_model_download(model, file_name):
    """Offer the fitted model as a pickle download.

    When the button is clicked a copy is also written to ``file_name`` in the
    server's working directory.
    """
    clicked = st.download_button(
        label="Download Trained Model",
        key="trained_model",
        data=pickle.dumps(model),
        file_name=file_name,
        mime="application/octet-stream",
    )
    if clicked:
        with open(file_name, "wb") as model_file:
            pickle.dump(model, model_file)


def _render_training(data):
    """Render the "Training and Evaluation" page.

    Splits the data 80/20 (``random_state=42``), optionally balances (for
    classification) and scales it, fits the selected model, reports metrics on
    the held-out split and offers the fitted model for download.
    """
    _centered_title("Training and Evaluation")
    algo = st.selectbox("Choose Algorithm Type:", ("", "Regression", "Classification"))
    if not algo:
        return

    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    X = data.drop(columns=target)
    y = data[target]

    st.write(
        "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    if algo == "Classification":
        X_train, y_train = _balance_training_set(data, target, X_train, y_train)
    X_train, X_test = _scale_features(X_train, X_test)

    if algo == "Regression":
        model_name = st.selectbox(
            "Choose Regression Model for training: ", ("",) + tuple(_REGRESSORS)
        )
        if not model_name:
            return
        build, file_name = _REGRESSORS[model_name]
        model = build()
        model.fit(X_train, y_train)
        _report_regression_metrics(y_test, model.predict(X_test))
    else:
        model_name = st.selectbox(
            "Choose Classification Model for training: ", ("",) + tuple(_CLASSIFIERS)
        )
        if not model_name:
            return
        build, file_name, matrix_title = _CLASSIFIERS[model_name]
        model = build()
        model.fit(X_train, y_train)
        _report_classification_metrics(y_test, model.predict(X_test), matrix_title)

    _offer_model_download(model, file_name)
|
|
|
|
|
def load_csv(file):
    """Parse an uploaded CSV into a DataFrame.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file)
|
|
|
|
|
def data_overview(data):
    """Report the shape of ``data``.

    The row count is written straight to the Streamlit page; the column count
    is returned as a string for the caller to display.

    Args:
        data: Two-dimensional frame exposing ``.shape``.

    Returns:
        ``"Number of Columns: <n>"``.
    """
    n_rows, n_cols = data.shape
    st.write(f"Number of Rows: {n_rows}")
    return f"Number of Columns: {n_cols}"
|
|
|
|
|
def missing_data(data):
    """Render an HTML table of the columns that contain missing values.

    For every column with at least one NaN the table lists the number of
    missing entries and their share of all rows as a percentage string
    rounded to two decimals.

    Args:
        data: DataFrame to inspect.

    Returns:
        Whatever ``st.markdown`` returns for the rendered table.
    """
    null_counts = data.isna().sum()
    null_counts = null_counts[null_counts > 0]

    share = (null_counts / data.shape[0]) * 100
    share = share.round(2).astype(str) + "%"

    table = pd.DataFrame({"Missing Values": null_counts, "Percentage": share})
    html = table.to_html(classes="table table-striped", justify="center")
    return st.markdown(html, unsafe_allow_html=True)
|
|
|
|
|
def display_data_info(data):
    """Render an HTML table with each column's dtype and number of unique values.

    Args:
        data: DataFrame to describe.

    Returns:
        Whatever ``st.markdown`` returns for the rendered table.
    """
    # Two "Column"-keyed tables, joined on that key exactly as a merge would.
    type_table = data.dtypes.rename_axis("Column").reset_index(name="Data Type")
    unique_table = data.nunique().rename_axis("Column").reset_index(
        name="Unique Counts"
    )
    summary = type_table.merge(unique_table, on="Column")

    html = summary.to_html(classes="table table-striped", justify="center")
    return st.markdown(html, unsafe_allow_html=True)
|
|
|
|
|
def value_counts(data):
    """Let the user pick a column and show the frequency of each of its values.

    Nothing is displayed while the empty placeholder option is selected.

    Args:
        data: DataFrame whose columns are offered in the select box.
    """
    chosen = st.selectbox("Select a Column", ["", *data.columns])
    if not chosen:
        return
    st.write(data[chosen].value_counts())
|
|
|
|
|
def duplicate(data):
    """Report duplicate rows and remove them from ``data`` in place.

    When duplicates exist, a notice with their count is written to the page,
    all but the first occurrence of each duplicated row are dropped from
    ``data`` itself, and an empty string is returned. Otherwise ``data`` is
    left untouched and a "no duplicates" message is returned.

    Args:
        data: DataFrame to check; mutated when duplicates are present.

    Returns:
        ``""`` if duplicates were dropped, else the "no duplicates" message.
    """
    duplicated_rows = data.duplicated()
    if not duplicated_rows.any():
        return "There are no duplicate rows in the DataFrame."

    st.write(
        f"There is/are {duplicated_rows.sum()} duplicate rows in the DataFrame. Duplicated values will be dropped."
    )
    data.drop_duplicates(keep="first", inplace=True)
    return ""
|
|
|
def countplot(data, col):
    """Render a horizontal count plot of one column on the Streamlit page.

    Args:
        data: DataFrame holding the column.
        col: Name of the column whose value frequencies are drawn.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.countplot(
            y=data[col],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Countplot of {col} Column")
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed; the script is
        # re-run on each interaction, so unclosed figures pile up.
        plt.close(fig)
|
|
|
|
|
def piechart(data, col):
    """Render a pie chart of the value distribution of one column.

    Args:
        data: DataFrame holding the column.
        col: Name of the column whose value shares are drawn.
    """
    # Named ``counts`` so the module-level ``value_counts`` function is not shadowed.
    counts = data[col].value_counts()

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.pie(
            counts,
            labels=counts.index,
            autopct="%1.1f%%",
            colors=palette,
            shadow=False,
            wedgeprops=dict(edgecolor="#1c1c1c"),
        )
        ax.set_title(f"Pie Chart of {col} Column")
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    finally:
        # Release the figure; pyplot would otherwise keep it alive across reruns.
        plt.close(fig)
|
|
|
|
|
def histogram(data, col):
    """Render a histogram with a KDE overlay for one column.

    Args:
        data: DataFrame holding the column.
        col: Name of the column to plot.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(
            data[col],
            kde=True,
            color=palette[4],
            fill=True,
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Histogram of {col} Column")
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    finally:
        # Release the figure; pyplot would otherwise keep it alive across reruns.
        plt.close(fig)
|
|
|
|
|
def violinplot(data, col):
    """Render a violin plot of one column.

    Args:
        data: DataFrame holding the column.
        col: Name of the column to plot.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.violinplot(data[col], color=palette[8], ax=ax)
        ax.set_title(f"Violin Plot of {col} Column")
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    finally:
        # Release the figure; pyplot would otherwise keep it alive across reruns.
        plt.close(fig)
|
|
|
|
|
def scatterplot(data, col):
    """Render a scatter plot of a single column.

    The column is handed to seaborn on its own (no explicit x/y mapping).

    Args:
        data: DataFrame holding the column.
        col: Name of the column to plot.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(data[col], color=palette[3], ax=ax)
        ax.set_title(f"Scatter Plot of {col} Column")
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    finally:
        # Release the figure; pyplot would otherwise keep it alive across reruns.
        plt.close(fig)
|
|
|
|
|
def biscatterplot(data, cols):
    """Render a scatter plot of ``cols[0]`` (x) against ``cols[1]`` (y).

    Any failure (for example fewer than two columns supplied) is reported on
    the page as text instead of raising.

    Args:
        data: DataFrame holding both columns.
        cols: Sequence whose first two items are the x and y column names.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Scatter Plot of {cols[0]} and {cols[1]} Columns")
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    except Exception as e:
        # Deliberate best-effort: show the problem to the user, keep the app alive.
        st.write(str(e))
    finally:
        # Close on success and on failure so no figure is left open.
        plt.close(fig)
|
|
|
|
|
def bibarplot(data, cols):
    """Render a bar plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    Any failure (for example fewer than two columns supplied) is reported on
    the page as text instead of raising.

    Args:
        data: DataFrame holding both columns.
        cols: Sequence whose first two items are the x and y column names.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.barplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Bar Plot of {cols[0]} and {cols[1]} Columns")
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    except Exception as e:
        # Deliberate best-effort: show the problem to the user, keep the app alive.
        st.write(str(e))
    finally:
        # Close on success and on failure so no figure is left open.
        plt.close(fig)
|
|
|
|
|
def biboxplot(data, cols):
    """Render a box plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    Any failure (for example fewer than two columns supplied) is reported on
    the page as text instead of raising.

    Args:
        data: DataFrame holding both columns.
        cols: Sequence whose first two items are the x and y column names.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.boxplot(
            data=data, x=cols[0], y=cols[1], palette=palette[1:], linewidth=2, ax=ax
        )
        ax.set_title(f"Box Plot of {cols[0]} and {cols[1]} Columns")
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    except Exception as e:
        # Deliberate best-effort: show the problem to the user, keep the app alive.
        st.write(str(e))
    finally:
        # Close on success and on failure so no figure is left open.
        plt.close(fig)
|
|
|
|
|
def paretoplot(data, categorical_col):
    """Render a Pareto chart of one categorical column.

    Bars show the frequency of each category in descending order; a line on a
    secondary percentage axis shows the cumulative share. Any failure is
    reported on the page as text instead of raising.

    Args:
        data: DataFrame holding the column.
        categorical_col: Name of the column to summarise.
    """
    fig = None
    try:
        # value_counts() already returns frequencies in descending order, so
        # the cumulative share is computed in the same order the bars are drawn.
        counts = data[categorical_col].value_counts()
        cumulative_percentage = (counts / counts.sum()).cumsum() * 100
        # Stringify labels so categories of mixed types share one axis.
        categories = counts.index.astype(str)

        fig, ax1 = plt.subplots(figsize=(10, 8))
        ax1.bar(
            categories,
            counts.values,
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )

        # Second y-axis sharing the category axis, formatted as percentages.
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            categories,
            cumulative_percentage.values,
            color=palette[3],
            marker="D",
            ms=10,
        )

        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)
    except Exception as e:
        # Report the failure like the other plotting helpers instead of
        # silently rendering nothing.
        st.write(str(e))
    finally:
        if fig is not None:
            # Release the figure; pyplot would otherwise keep it alive across reruns.
            plt.close(fig)
|
|
|
|
|
def plot_confusion_matrix(y_true, y_pred, title):
    """Render a confusion-matrix heatmap on the Streamlit page.

    Rows are labelled as true labels and columns as predicted labels, so
    callers must pass the ground truth first.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        title: Title drawn above the heatmap.
    """
    cm = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        ax.set_title(title)
        # Pass the Figure itself rather than the global pyplot module.
        st.pyplot(fig)
    finally:
        # Release the figure; pyplot would otherwise keep it alive across reruns.
        plt.close(fig)
|
|
|
|
|
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|