import streamlit as st
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
from sklearn.preprocessing import (
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
MinMaxScaler,
)
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBRFRegressor, XGBRFClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import (
mean_absolute_error,
mean_squared_error,
mean_squared_error,
r2_score,
)
from sklearn.metrics import (
accuracy_score,
f1_score,
confusion_matrix,
precision_score,
recall_score,
)
import pickle
# Streamlit page chrome; must run before any other st.* call in the script.
st.set_page_config(page_title="Tabular Data Analysis and Auto ML", page_icon="🤖")
# Global seaborn theme applied to every chart helper below.
sns.set_style("white")
sns.set_context("poster", font_scale=0.7)
# Shared categorical color palette used by all plotting helpers.
palette = [
    "#1d7874",
    "#679289",
    "#f4c095",
    "#ee2e31",
    "#ffb563",
    "#918450",
    "#f85e00",
    "#a41623",
    "#9a031e",
    "#d6d6d6",
    "#ffee32",
    "#ffd100",
    "#333533",
    "#202020",
]
def _section_header(text):
    """Render *text* as a centered page heading.

    NOTE(review): the original HTML markup was lost when the source was
    mangled; a centered <h1> is assumed from the surviving
    ``unsafe_allow_html=True`` arguments — confirm against the repo history.
    """
    st.markdown(
        f"<h1 style='text-align: center;'>{text}</h1>",
        unsafe_allow_html=True,
    )


def _basic_eda(data):
    """Show the 'Basic EDA' page: overview, dtypes, missingness, stats."""
    _section_header("Basic EDA")
    st.subheader("Data Overview")
    st.write(data_overview(data))
    st.write(duplicate(data))
    st.dataframe(data.head())
    st.subheader("Data Types and Unique Value Counts")
    display_data_info(data)
    st.subheader("Missing Data")
    missing_data(data)
    st.subheader("Value Counts")
    value_counts(data)
    st.subheader("Descriptive Statistics")
    st.write(data.describe().T)


def _univariate_analysis(data):
    """Show the 'Univariate Analysis' page: one chart of one column."""
    _section_header("Univariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Count Plot", "Pie Chart", "Histogram", "Violin Plot", "Scatter Plot"),
    )
    # Categorical charts pick from object columns; numeric charts from
    # int/float columns.
    categorical_charts = {"Count Plot": countplot, "Pie Chart": piechart}
    numeric_charts = {
        "Histogram": histogram,
        "Violin Plot": violinplot,
        "Scatter Plot": scatterplot,
    }
    if plot in categorical_charts:
        column = st.selectbox(
            "Select a column", [""] + list(data.select_dtypes("O"))
        )
        if column:
            categorical_charts[plot](data, column)
    else:
        column = st.selectbox(
            "Select a column",
            [""] + list(data.select_dtypes(include=["int", "float"])),
        )
        if column:
            numeric_charts[plot](data, column)


def _bivariate_analysis(data):
    """Show the 'Bivariate Analysis' page: one chart over two columns."""
    _section_header("Bivariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
    )
    if plot == "Scatter Plot":
        columns = st.multiselect(
            "Select two columns",
            [""] + list(data.select_dtypes(include=["int", "float"])),
        )
        if columns:
            biscatterplot(data, columns)
    elif plot == "Bar Plot":
        columns = st.multiselect("Select two columns", list(data.columns))
        if columns:
            bibarplot(data, columns)
    elif plot == "Box Plot":
        columns = st.multiselect("Select two columns", list(data.columns))
        if columns:
            biboxplot(data, columns)
    elif plot == "Pareto Chart":
        column = st.selectbox(
            "Select a columns",
            [""] + list(data.select_dtypes(include="object")),
        )
        if column:
            paretoplot(data, column)


def _impute_numeric(data, approach):
    """Apply the chosen missing-value strategy to numeric columns.

    Returns the (possibly rebound) frame; '' leaves the data untouched.
    """
    num_cols = data.select_dtypes(include=["int64", "float64"]).columns
    if approach == "drop":
        data = data.dropna(subset=num_cols)
    elif approach == "mean":
        data = data.fillna(data[num_cols].mean())
    elif approach == "median":
        data = data.fillna(data[num_cols].median())
    elif approach == "backward fill":
        data[num_cols] = data[num_cols].bfill()
    elif approach == "forward fill":
        data[num_cols] = data[num_cols].ffill()
    else:
        return data
    st.success("Imputed missing values in numerical columns with selected approach.")
    return data


def _impute_categorical(data, approach):
    """Apply the chosen missing-value strategy to non-numeric columns."""
    cat_cols = data.select_dtypes(exclude=["int", "float"]).columns
    if approach == "drop":
        data = data.dropna(subset=cat_cols)
    elif approach == "most frequent values":
        # mode() can return several rows on ties; iloc[0] picks the first.
        data[cat_cols] = data[cat_cols].fillna(data[cat_cols].mode().iloc[0])
    elif approach == "replace with 'unknown'":
        data[cat_cols] = data[cat_cols].fillna("Unknown")
    else:
        return data
    st.success("Imputed missing values in categorical columns with selected approach.")
    return data


def _encode_categoricals(data, oe_columns):
    """Ordinal-encode *oe_columns*; one-hot encode every other object column."""
    bool_cols = data.select_dtypes(include=bool).columns
    data[bool_cols] = data[bool_cols].astype(int)
    if oe_columns:
        data[oe_columns] = OrdinalEncoder().fit_transform(
            data[oe_columns].astype("str")
        )
    remaining = [
        col for col in data.select_dtypes(include="object") if col not in oe_columns
    ]
    if remaining:
        data = pd.get_dummies(data, columns=remaining, drop_first=False)
    st.success("Encoded categorical columns")
    # get_dummies can introduce fresh bool columns; cast those to 0/1 too.
    bool_cols = data.select_dtypes(include=bool).columns
    data[bool_cols] = data[bool_cols].astype(int)
    return data


def _preprocess(data):
    """Show the 'Preprocess' page and persist every change to session state."""
    _section_header("Data Preprocessing")
    operation = st.radio(
        "Select preprocessing step: ",
        (
            "Drop Columns",
            "Handling Missing Values",
            "Encode Categorical Features",
        ),
    )
    if operation == "Drop Columns":
        columns = st.multiselect("Select Columns to drop: ", (data.columns))
        if st.button("Drop Columns"):
            data.drop(columns, axis=1, inplace=True)
            st.success("Dropped selected columns✅✅✅")
    elif operation == "Handling Missing Values":
        num_missing = st.selectbox(
            "Select a Approach (Numerical columns only): ",
            ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
        ).lower()
        cat_missing = st.selectbox(
            "Select a Approach (Categorical columns only): ",
            ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
        ).lower()
        if st.button("Handle Missing Values"):
            data = _impute_numeric(data, num_missing)
            data = _impute_categorical(data, cat_missing)
    elif operation == "Encode Categorical Features":
        oe_columns = st.multiselect(
            "Choose Columns for Ordinal Encoding",
            [""] + list(data.select_dtypes(include="object")),
        )
        st.info("Other columns will be One Hot Encoded.")
        if st.button("Encode Columns"):
            data = _encode_categoricals(data, oe_columns)
    # Persist the (possibly rebound) frame. The original only wrote back
    # after encoding, so NA-drop results were silently lost on rerun.
    st.session_state["data"] = data
    if st.download_button(
        label="Download Preprocessed Data",
        key="preprocessed_data",
        data=data.to_csv(index=False).encode(),
        file_name="preprocessed_data.csv",
        mime="text/csv",
    ):
        st.success("Data Downloaded")


def _select_target(data):
    """Let the user pick the target column.

    Returns (X, Y, target) or None when the split fails (error is shown).
    """
    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    try:
        return data.drop(target, axis=1), data[target], target
    except Exception as exc:
        st.write(str(exc))
        return None


def _balance_training_set(data, target, X_train, y_train):
    """Optionally resample the training split to balance class frequencies."""
    if st.selectbox("Do you want to balance dataset?", ("", "Yes", "No")) == "Yes":
        piechart(data, target)
        samplers = {
            "Random Under Sampling": RandomUnderSampler,
            "Random Over Sampling": RandomOverSampler,
            "SMOTE": SMOTE,
        }
        sample = st.selectbox(
            "Which approach you want to use?",
            ("",) + tuple(samplers),
        )
        if sample:
            X_train, y_train = samplers[sample](random_state=42).fit_resample(
                X_train, y_train
            )
    return X_train, y_train


def _scale_features(X_train, X_test):
    """Optionally scale features; the scaler is fit on the train split only."""
    scale = st.selectbox(
        "Choose how do you want to scale features:",
        ("", "Standard Scaler", "Min Max Scaler"),
    )
    scalers = {"Standard Scaler": StandardScaler, "Min Max Scaler": MinMaxScaler}
    if scale in scalers:
        scaler = scalers[scale]()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    return X_train, X_test


def _offer_model_download(model, file_name):
    """Offer the fitted estimator as a pickled download."""
    st.download_button(
        label="Download Trained Model",
        key="trained_model",
        data=pickle.dumps(model),
        file_name=file_name,
        mime="application/octet-stream",
    )


def _report_regression(y_test, pred):
    """Write MAE/MSE/RMSE/R² to the page.

    Ground truth is passed first: the original swapped the arguments,
    which silently corrupts the asymmetric R² score.
    """
    st.write(
        "Mean Absolute Error (MAE): {:.4f}".format(mean_absolute_error(y_test, pred))
    )
    st.write(
        "Mean Squared Error (MSE): {:.4f}".format(mean_squared_error(y_test, pred))
    )
    st.write(
        "Root Mean Squared Error (RMSE): {:.4f}".format(
            mean_squared_error(y_test, pred, squared=False)
        )
    )
    st.write("R-squared (R²): {:.4f}".format(r2_score(y_test, pred)))


def _report_classification(y_test, pred):
    """Write accuracy plus F1/precision/recall (macro fallback) to the page.

    Ground truth is passed first: the original swapped the arguments, which
    exchanges precision and recall.
    """
    st.write("Accuracy Score: {:.4f}".format(accuracy_score(y_test, pred)))
    try:
        # Binary targets: default positive-class averaging.
        st.write("F1 Score: {:.4f}".format(f1_score(y_test, pred)))
        st.write("Precision Score: {:.4f}".format(precision_score(y_test, pred)))
        st.write("Recall Score: {:.4f}".format(recall_score(y_test, pred)))
    except ValueError:
        # Multiclass targets raise ValueError above; fall back to macro averages.
        st.write(
            "Macro Precision Score: {:.4f}".format(
                precision_score(y_test, pred, average="macro")
            )
        )
        st.write(
            "Macro Recall Score: {:.4f}".format(
                recall_score(y_test, pred, average="macro")
            )
        )
        st.write(
            "Macro F1 Score: {:.4f}".format(f1_score(y_test, pred, average="macro"))
        )


def _train_regressor(X_train, X_test, y_train, y_test):
    """Fit the chosen regressor, report metrics, offer the pickle download."""
    models = {
        "Ridge Regression": (
            lambda: Ridge(alpha=1.0),
            "ridge_regression_model.pkl",
        ),
        "Decision Tree Regressor": (
            lambda: DecisionTreeRegressor(max_depth=10),
            "decision_tree_regression_model.pkl",
        ),
        "Random Forest Regressor": (
            lambda: RandomForestRegressor(max_depth=10, n_estimators=100),
            "random_forest_regression_model.pkl",
        ),
        "SVR": (lambda: SVR(C=1.0, epsilon=0.2), "svr_model.pkl"),
        "XGBRF Regressor": (
            lambda: XGBRFRegressor(reg_lambda=1),
            "xgbrf_regression_model.pkl",
        ),
        "LGBM Regressor": (
            lambda: LGBMRegressor(reg_lambda=1),
            "lgbm_regression_model.pkl",
        ),
    }
    choice = st.selectbox(
        "Choose Regression Model for training: ", ("",) + tuple(models)
    )
    if not choice:
        return
    factory, file_name = models[choice]
    reg = factory()
    reg.fit(X_train, y_train)
    _report_regression(y_test, reg.predict(X_test))
    _offer_model_download(reg, file_name)


def _train_classifier(X_train, X_test, y_train, y_test):
    """Fit the chosen classifier, report metrics + confusion matrix, offer download."""
    models = {
        "Logistic Regression": (
            lambda: LogisticRegression(penalty="l2"),
            "logistic_regression_model.pkl",
            "Logistic Regression Confusion Matrix ",
        ),
        "Decision Tree Classifier": (
            lambda: DecisionTreeClassifier(max_depth=5),
            "decision_tree_classifier_model.pkl",
            "DecisionTree Classifier Confusion Matrix ",
        ),
        "Random Forest Classifier": (
            lambda: RandomForestClassifier(n_estimators=100, max_depth=5),
            "random_forest_classifier_model.pkl",
            "RandomForest Classifier Confusion Matrix ",
        ),
        "SVC": (lambda: SVC(C=1.5), "svc_model.pkl", "SVC Confusion Matrix "),
        "XGBRF Classifier": (
            lambda: XGBRFClassifier(reg_lambda=1.0),
            "xgbrf_classifier_model.pkl",
            "XGBRF Classifier Confusion Matrix ",
        ),
        "LGBM Classifier": (
            lambda: LGBMClassifier(reg_lambda=1.0),
            "lgbm_classifier_model.pkl",
            "LGBM Classifier Confusion Matrix ",
        ),
    }
    choice = st.selectbox(
        "Choose Classification Model for training: ", ("",) + tuple(models)
    )
    if not choice:
        return
    factory, file_name, cm_title = models[choice]
    clf = factory()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    _report_classification(y_test, pred)
    plot_confusion_matrix(y_test, pred, cm_title)
    _offer_model_download(clf, file_name)


def _training_and_evaluation(data):
    """Show the 'Training and Evaluation' page for regression or classification."""
    _section_header("Training and Evaluation")
    algo = st.selectbox(
        "Choose Algorithm Type:", ("", "Regression", "Classification")
    )
    if algo not in ("Regression", "Classification"):
        return
    split = _select_target(data)
    if split is None:
        # Target split failed; the original kept going and hit a NameError.
        return
    X, Y, target = split
    st.write(
        "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )
    if algo == "Classification":
        X_train, y_train = _balance_training_set(data, target, X_train, y_train)
    X_train, X_test = _scale_features(X_train, X_test)
    if algo == "Regression":
        _train_regressor(X_train, X_test, y_train, y_test)
    else:
        _train_classifier(X_train, X_test, y_train, y_test)


def main():
    """Streamlit entry point: sidebar controls route to one page per option."""
    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process")
    option = st.sidebar.radio(
        "Select an Option: ",
        (
            "Basic EDA",
            "Univariate Analysis",
            "Bivariate Analysis",
            "Preprocess",
            "Training and Evaluation",
        ),
    )
    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>Welcome to Tabular Data Analysis and Auto ML🤖</h1>",
        unsafe_allow_html=True,
    )
    if file is not None and process:
        # Cache the parsed frame so later reruns keep operating on it.
        st.session_state["data"] = load_csv(file)
    if "data" not in st.session_state:
        return
    data = st.session_state["data"]
    placeholder.empty()
    pages = {
        "Basic EDA": _basic_eda,
        "Univariate Analysis": _univariate_analysis,
        "Bivariate Analysis": _bivariate_analysis,
        "Preprocess": _preprocess,
        "Training and Evaluation": _training_and_evaluation,
    }
    pages[option](data)
def load_csv(file):
    """Parse an uploaded file-like object into a pandas DataFrame."""
    return pd.read_csv(file)
def data_overview(data):
    """Write the row count to the page and return the column-count line."""
    rows, cols = data.shape
    st.write(f"Number of Rows: {rows}")
    return f"Number of Columns: {cols}"
def missing_data(data):
    """Render an HTML table of per-column missing counts and percentages.

    Columns with no missing values are omitted.
    """
    counts = data.isna().sum()
    counts = counts[counts > 0]
    percentages = ((counts / data.shape[0]) * 100).round(2).astype(str) + "%"
    table = pd.DataFrame({"Missing Values": counts, "Percentage": percentages})
    html = table.to_html(classes="table table-striped", justify="center")
    return st.markdown(html, unsafe_allow_html=True)
def display_data_info(data):
    """Render an HTML table pairing each column with its dtype and unique count."""
    dtype_table = pd.DataFrame(data.dtypes, columns=["Data Type"]).reset_index()
    dtype_table.columns = ["Column", "Data Type"]
    unique_table = pd.DataFrame(data.nunique(), columns=["Unique Counts"]).reset_index()
    unique_table.columns = ["Column", "Unique Counts"]
    combined = pd.merge(dtype_table, unique_table, on="Column")
    html = combined.to_html(classes="table table-striped", justify="center")
    return st.markdown(html, unsafe_allow_html=True)
def value_counts(data):
    """Show the value counts of a user-selected column (no-op until chosen)."""
    column = st.selectbox("Select a Column", [""] + list(data.columns))
    if not column:
        return
    st.write(data[column].value_counts())
def duplicate(data):
    """Drop duplicate rows in place and return a status message for display."""
    dup_count = data.duplicated().sum()
    if not dup_count:
        return "There are no duplicate rows in the DataFrame."
    st.write(
        f"There is/are {dup_count} duplicate rows in the DataFrame. Duplicated values will be dropped."
    )
    data.drop_duplicates(keep="first", inplace=True)
    return ""
def countplot(data, col):
    """Horizontal count plot of a categorical column, rendered into Streamlit."""
    plt.figure(figsize=(10, 6))
    sns.countplot(y=data[col], palette=palette[1:], edgecolor="#1c1c1c", linewidth=2)
    plt.title("Countplot of {} Column".format(col))
    st.pyplot(plt)
def piechart(data, col):
    """Pie chart of a categorical column's value distribution."""
    counts = data[col].value_counts()
    plt.figure(figsize=(8, 6))
    plt.pie(
        counts,
        labels=counts.index,
        autopct="%1.1f%%",
        colors=palette,
        shadow=False,
        wedgeprops=dict(edgecolor="#1c1c1c"),
    )
    plt.title("Pie Chart of {} Column".format(col))
    st.pyplot(plt)
def histogram(data, col):
    """Histogram with a KDE overlay for a numeric column."""
    plt.figure(figsize=(10, 6))
    series = data[col]
    sns.histplot(
        series,
        kde=True,
        color=palette[4],
        fill=True,
        edgecolor="#1c1c1c",
        linewidth=2,
    )
    plt.title("Histogram of {} Column".format(col))
    st.pyplot(plt)
def violinplot(data, col):
    """Violin plot of a single numeric column."""
    plt.figure(figsize=(10, 6))
    sns.violinplot(data[col], color=palette[8])
    plt.title("Violin Plot of {} Column".format(col))
    st.pyplot(plt)
def scatterplot(data, col):
    """Scatter plot of a single numeric column against its index."""
    plt.figure(figsize=(10, 8))
    sns.scatterplot(data[col], color=palette[3])
    plt.title("Scatter Plot of {} Column".format(col))
    st.pyplot(plt)
def biscatterplot(data, cols):
    """Scatter plot of cols[0] vs cols[1]; shows the error text on failure
    (e.g. when fewer than two columns are selected)."""
    try:
        plt.figure(figsize=(10, 8))
        x_col, y_col = cols[0], cols[1]
        sns.scatterplot(
            data=data,
            x=x_col,
            y=y_col,
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        plt.title(f"Scatter Plot of {x_col} and {y_col} Columns")
        st.pyplot(plt)
    except Exception as exc:
        st.write(str(exc))
def bibarplot(data, cols):
    """Bar plot of cols[0] vs cols[1]; shows the error text on failure
    (e.g. when fewer than two columns are selected)."""
    try:
        plt.figure(figsize=(10, 8))
        x_col, y_col = cols[0], cols[1]
        sns.barplot(
            data=data,
            x=x_col,
            y=y_col,
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        plt.title(f"Bar Plot of {x_col} and {y_col} Columns")
        st.pyplot(plt)
    except Exception as exc:
        st.write(str(exc))
def biboxplot(data, cols):
    """Box plot of cols[0] vs cols[1]; shows the error text on failure
    (e.g. when fewer than two columns are selected)."""
    try:
        plt.figure(figsize=(10, 8))
        x_col, y_col = cols[0], cols[1]
        sns.boxplot(data=data, x=x_col, y=y_col, palette=palette[1:], linewidth=2)
        plt.title(f"Box Plot of {x_col} and {y_col} Columns")
        st.pyplot(plt)
    except Exception as exc:
        st.write(str(exc))
def paretoplot(data, categorical_col):
    """Pareto chart: frequency bars plus a cumulative-percentage line.

    On failure the error text is shown, matching the other bi-plot helpers —
    the original silently swallowed every exception with ``pass``.
    """
    try:
        value_counts = data[categorical_col].value_counts()
        cumulative_percentage = (value_counts / value_counts.sum()).cumsum()
        pareto_df = pd.DataFrame(
            {
                "Categories": value_counts.index,
                "Frequency": value_counts.values,
                "Cumulative Percentage": cumulative_percentage.values * 100,
            }
        )
        # value_counts is already descending; the sort keeps that explicit.
        pareto_df = pareto_df.sort_values(by="Frequency", ascending=False)
        fig, ax1 = plt.subplots(figsize=(10, 8))
        ax1.bar(
            pareto_df["Categories"],
            pareto_df["Frequency"],
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        # Secondary axis carries the cumulative-percentage line.
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            pareto_df["Categories"],
            pareto_df["Cumulative Percentage"],
            color=palette[3],
            marker="D",
            ms=10,
        )
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)
    except Exception as e:
        st.write(str(e))
def plot_confusion_matrix(y_true, y_pred, title):
    """Render a confusion-matrix heatmap into Streamlit.

    NOTE(review): axis labels assume arguments arrive as (y_true, y_pred);
    some call sites appear to pass predictions first — verify callers.
    """
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(title)
    st.pyplot(plt)
# Launch the Streamlit app when this file is executed as a script.
if __name__ == "__main__":
    main()