"""Streamlit app for tabular data analysis and simple Auto ML:
basic EDA, univariate/bivariate charts, preprocessing, and model
training/evaluation for regression and classification."""

import pickle

import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from xgboost import XGBRFRegressor, XGBRFClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    accuracy_score,
    f1_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

st.set_page_config(page_title="Tabular Data Analysis and Auto ML", page_icon="🤖")

sns.set_style("white")
sns.set_context("poster", font_scale=0.7)

palette = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563",
    "#918450", "#f85e00", "#a41623", "#9a031e", "#d6d6d6",
    "#ffee32", "#ffd100", "#333533", "#202020",
]


def main():
    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process")
    option = st.sidebar.radio(
        "Select an Option: ",
        (
            "Basic EDA",
            "Univariate Analysis",
            "Bivariate Analysis",
            "Preprocess",
            "Training and Evaluation",
        ),
    )
    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>"
        "Welcome to Tabular Data Analysis and Auto ML🤖</h1>",
        unsafe_allow_html=True,
    )
    if file is not None and process:
        data = load_csv(file)
        st.session_state["data"] = data
    if "data" in st.session_state:
        data = st.session_state["data"]
        placeholder.empty()
        if option == "Basic EDA":
            st.markdown(
                "<h1 style='text-align: center;'>Basic EDA</h1>",
                unsafe_allow_html=True,
            )
            st.subheader("Data Overview")
            st.write(data_overview(data))
            st.write(duplicate(data))
            st.dataframe(data.head())
            st.subheader("Data Types and Unique Value Counts")
            display_data_info(data)
            st.subheader("Missing Data")
            missing_data(data)
            st.subheader("Value Counts")
            value_counts(data)
            st.subheader("Descriptive Statistics")
            st.write(data.describe().T)
        if option == "Univariate Analysis":
            st.markdown(
                "<h1 style='text-align: center;'>Univariate Analysis</h1>",
                unsafe_allow_html=True,
            )
            plot = st.radio(
                "Select a chart: ",
                ("Count Plot", "Pie Chart", "Histogram", "Violin Plot", "Scatter Plot"),
            )
            # Count and pie charts take categorical columns; the rest take numerical ones.
            if plot == "Count Plot":
                column = st.selectbox(
                    "Select a column", [""] + list(data.select_dtypes("O"))
                )
                if column:
                    countplot(data, column)
            if plot == "Pie Chart":
                column = st.selectbox(
                    "Select a column", [""] + list(data.select_dtypes("O"))
                )
                if column:
                    piechart(data, column)
            if plot == "Histogram":
                column = st.selectbox(
                    "Select a column",
                    [""] + list(data.select_dtypes(include=["int", "float"])),
                )
                if column:
                    histogram(data, column)
            if plot == "Violin Plot":
                column = st.selectbox(
                    "Select a column",
                    [""] + list(data.select_dtypes(include=["int", "float"])),
                )
                if column:
                    violinplot(data, column)
            if plot == "Scatter Plot":
                column = st.selectbox(
                    "Select a column",
                    [""] + list(data.select_dtypes(include=["int", "float"])),
                )
                if column:
                    scatterplot(data, column)
        if option == "Bivariate Analysis":
            st.markdown(
                "<h1 style='text-align: center;'>Bivariate Analysis</h1>",
                unsafe_allow_html=True,
            )
            plot = st.radio(
                "Select a chart: ",
                ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
            )
            if plot == "Scatter Plot":
                columns = st.multiselect(
                    "Select two columns",
                    list(data.select_dtypes(include=["int", "float"])),
                )
                if columns:
                    biscatterplot(data, columns)
            if plot == "Bar Plot":
                columns = st.multiselect("Select two columns", list(data.columns))
                if columns:
                    bibarplot(data, columns)
            if plot == "Box Plot":
                columns = st.multiselect("Select two columns", list(data.columns))
                if columns:
                    biboxplot(data, columns)
            if plot == "Pareto Chart":
                column = st.selectbox(
                    "Select a column",
                    [""] + list(data.select_dtypes(include="object")),
                )
                if column:
                    paretoplot(data, column)
        if option == "Preprocess":
            st.markdown(
                "<h1 style='text-align: center;'>Data Preprocessing</h1>",
                unsafe_allow_html=True,
            )
            operation = st.radio(
                "Select preprocessing step: ",
                (
                    "Drop Columns",
                    "Handling Missing Values",
                    "Encode Categorical Features",
                ),
            )
            if operation == "Drop Columns":
                columns = st.multiselect("Select Columns to drop: ", data.columns)
                if st.button("Drop Columns"):
                    data.drop(columns, axis=1, inplace=True)
                    st.success("Dropped selected columns✅✅✅")
            elif operation == "Handling Missing Values":
                num_missing = st.selectbox(
                    "Select an approach (numerical columns only): ",
                    ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
                ).lower()
                cat_missing = st.selectbox(
                    "Select an approach (categorical columns only): ",
                    ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
                ).lower()
                if st.button("Handle Missing Values"):
                    if num_missing:
                        num_data = data.select_dtypes(include=["int64", "float64"])
                        if num_missing == "drop":
                            data = data.dropna(subset=num_data.columns)
                        else:
                            if num_missing == "mean":
                                fill_values = num_data.mean()
                            elif num_missing == "median":
                                fill_values = num_data.median()
                            elif num_missing == "backward fill":
                                fill_values = num_data.bfill()
                            elif num_missing == "forward fill":
                                fill_values = num_data.ffill()
                            data.fillna(value=fill_values, inplace=True)
                            st.success(
                                "Imputed missing values in numerical columns "
                                "with the selected approach."
                            )
                    if cat_missing:
                        cat_data = data.select_dtypes(exclude=["int", "float"])
                        if cat_missing == "drop":
                            data = data.dropna(subset=cat_data.columns)
                        elif cat_missing == "most frequent values":
                            mode_values = data[cat_data.columns].mode().iloc[0]
                            data[cat_data.columns] = data[cat_data.columns].fillna(
                                mode_values
                            )
                        elif cat_missing == "replace with 'unknown'":
                            data[cat_data.columns] = data[cat_data.columns].fillna(
                                "Unknown"
                            )
                        st.success(
                            "Imputed missing values in categorical columns "
                            "with the selected approach."
                        )
            elif operation == "Encode Categorical Features":
                oe_columns = st.multiselect(
                    "Choose Columns for Ordinal Encoding",
                    list(data.select_dtypes(include="object")),
                )
                st.info("Other columns will be One Hot Encoded.")
                if st.button("Encode Columns"):
                    # Cast booleans to 0/1 before encoding.
                    bool_columns = data.select_dtypes(include=bool).columns
                    data[bool_columns] = data[bool_columns].astype(int)
                    if oe_columns:
                        oe = OrdinalEncoder()
                        data[oe_columns] = oe.fit_transform(
                            data[oe_columns].astype("str")
                        )
                    remaining_cat_cols = [
                        col
                        for col in data.select_dtypes(include="object")
                        if col not in oe_columns
                    ]
                    if remaining_cat_cols:
                        data = pd.get_dummies(
                            data, columns=remaining_cat_cols, drop_first=False
                        )
                    st.success("Encoded categorical columns")
                    # get_dummies can emit boolean columns; cast those to 0/1 as well.
                    bool_columns = data.select_dtypes(include=bool).columns
                    data[bool_columns] = data[bool_columns].astype(int)
            st.session_state["data"] = data
            preprocessed_data_csv = data.to_csv(index=False)
            if st.download_button(
                label="Download Preprocessed Data",
                key="preprocessed_data",
                data=preprocessed_data_csv.encode(),
                file_name="preprocessed_data.csv",
                mime="text/csv",
            ):
                st.success("Data Downloaded")
        if option == "Training and Evaluation":
            st.markdown(
                "<h1 style='text-align: center;'>Training and Evaluation</h1>",
                unsafe_allow_html=True,
            )
            algo = st.selectbox(
                "Choose Algorithm Type:", ("", "Regression", "Classification")
            )
            if algo == "Regression":
                target = st.selectbox(
                    "Choose Target Variable (Y): ", list(data.columns)
                )
                try:
                    X = data.drop(target, axis=1)
                    Y = data[target]
                except Exception as e:
                    st.write(str(e))
                st.write(
                    "80% of the data will be used for training the model; "
                    "the remaining 20% will be used for evaluating it."
                )
                X_train, X_test, y_train, y_test = train_test_split(
                    X, Y, test_size=0.2, random_state=42
                )
                scale = st.selectbox(
                    "Choose how you want to scale the features:",
                    ("", "Standard Scaler", "Min Max Scaler"),
                )
                if scale == "Standard Scaler":
                    scaler = StandardScaler()
                    X_train = scaler.fit_transform(X_train)
                    X_test = scaler.transform(X_test)
                elif scale == "Min Max Scaler":
                    scaler = MinMaxScaler()
                    X_train = scaler.fit_transform(X_train)
                    X_test = scaler.transform(X_test)
                # Every regressor shares the same fit/evaluate/download flow,
                # so map model names to (estimator, download file name).
                regressors = {
                    "Ridge Regression": (
                        Ridge(alpha=1.0),
                        "ridge_regression_model.pkl",
                    ),
                    "Decision Tree Regressor": (
                        DecisionTreeRegressor(max_depth=10),
                        "decision_tree_regression_model.pkl",
                    ),
                    "Random Forest Regressor": (
                        RandomForestRegressor(max_depth=10, n_estimators=100),
                        "random_forest_regression_model.pkl",
                    ),
                    "SVR": (SVR(C=1.0, epsilon=0.2), "svr_model.pkl"),
                    "XGBRF Regressor": (
                        XGBRFRegressor(reg_lambda=1),
                        "xgbrf_regression_model.pkl",
                    ),
                    "LGBM Regressor": (
                        LGBMRegressor(reg_lambda=1),
                        "lgbm_regression_model.pkl",
                    ),
                }
                model = st.selectbox(
                    "Choose Regression Model for training: ",
                    [""] + list(regressors),
                )
                if model:
                    reg, file_name = regressors[model]
                    reg.fit(X_train, y_train)
                    pred = reg.predict(X_test)
                    # sklearn metrics expect (y_true, y_pred) in that order.
                    st.write(
                        "Mean Absolute Error (MAE): {:.4f}".format(
                            mean_absolute_error(y_test, pred)
                        )
                    )
                    st.write(
                        "Mean Squared Error (MSE): {:.4f}".format(
                            mean_squared_error(y_test, pred)
                        )
                    )
                    st.write(
                        "Root Mean Squared Error (RMSE): {:.4f}".format(
                            mean_squared_error(y_test, pred, squared=False)
                        )
                    )
                    st.write("R-squared (R²): {:.4f}".format(r2_score(y_test, pred)))
                    st.download_button(
                        label="Download Trained Model",
                        key="trained_model",
                        data=pickle.dumps(reg),
                        file_name=file_name,
                        mime="application/octet-stream",
                    )
            elif algo == "Classification":
                target = st.selectbox(
                    "Choose Target Variable (Y): ", list(data.columns)
                )
                try:
                    X = data.drop(target, axis=1)
                    Y = data[target]
                except Exception as e:
                    st.write(str(e))
                st.write(
                    "80% of the data will be used for training the model; "
                    "the remaining 20% will be used for evaluating it."
                )
                X_train, X_test, y_train, y_test = train_test_split(
                    X, Y, test_size=0.2, random_state=42
                )
                balance = st.selectbox(
                    "Do you want to balance the dataset?", ("", "Yes", "No")
                )
                if balance == "Yes":
                    piechart(data, target)
                    sample = st.selectbox(
                        "Which approach do you want to use?",
                        ("", "Random Under Sampling", "Random Over Sampling", "SMOTE"),
                    )
                    # Resample only the training split; the test set stays untouched.
                    if sample == "Random Under Sampling":
                        rus = RandomUnderSampler(random_state=42)
                        X_train, y_train = rus.fit_resample(X_train, y_train)
                    elif sample == "Random Over Sampling":
                        ros = RandomOverSampler(random_state=42)
                        X_train, y_train = ros.fit_resample(X_train, y_train)
                    elif sample == "SMOTE":
                        smote = SMOTE(random_state=42)
                        X_train, y_train = smote.fit_resample(X_train, y_train)
                scale = st.selectbox(
                    "Choose how you want to scale the features:",
                    ("", "Standard Scaler", "Min Max Scaler"),
                )
                if scale == "Standard Scaler":
                    scaler = StandardScaler()
                    X_train = scaler.fit_transform(X_train)
                    X_test = scaler.transform(X_test)
                elif scale == "Min Max Scaler":
                    scaler = MinMaxScaler()
                    X_train = scaler.fit_transform(X_train)
                    X_test = scaler.transform(X_test)
                # Every classifier shares the same fit/evaluate/download flow,
                # so map model names to (estimator, download file name).
                classifiers = {
                    "Logistic Regression": (
                        LogisticRegression(penalty="l2"),
                        "logistic_regression_model.pkl",
                    ),
                    "Decision Tree Classifier": (
                        DecisionTreeClassifier(max_depth=5),
                        "decision_tree_classifier_model.pkl",
                    ),
                    "Random Forest Classifier": (
                        RandomForestClassifier(n_estimators=100, max_depth=5),
                        "random_forest_classifier_model.pkl",
                    ),
                    "SVC": (SVC(C=1.5), "svc_model.pkl"),
                    "XGBRF Classifier": (
                        XGBRFClassifier(reg_lambda=1.0),
                        "xgbrf_classifier_model.pkl",
                    ),
                    "LGBM Classifier": (
                        LGBMClassifier(reg_lambda=1.0),
                        "lgbm_classifier_model.pkl",
                    ),
                }
                model = st.selectbox(
                    "Choose Classification Model for training: ",
                    [""] + list(classifiers),
                )
                if model:
                    clf, file_name = classifiers[model]
                    clf.fit(X_train, y_train)
                    pred = clf.predict(X_test)
                    st.write(
                        "Accuracy Score: {:.4f}".format(accuracy_score(y_test, pred))
                    )
                    try:
                        # Binary targets: report unaveraged scores.
                        st.write("F1 Score: {:.4f}".format(f1_score(y_test, pred)))
                        st.write(
                            "Precision Score: {:.4f}".format(
                                precision_score(y_test, pred)
                            )
                        )
                        st.write(
                            "Recall Score: {:.4f}".format(recall_score(y_test, pred))
                        )
                    except ValueError:
                        # Multiclass targets raise ValueError above;
                        # fall back to macro-averaged scores.
                        st.write(
                            "Macro Precision Score: {:.4f}".format(
                                precision_score(y_test, pred, average="macro")
                            )
                        )
                        st.write(
                            "Macro Recall Score: {:.4f}".format(
                                recall_score(y_test, pred, average="macro")
                            )
                        )
                        st.write(
                            "Macro F1 Score: {:.4f}".format(
                                f1_score(y_test, pred, average="macro")
                            )
                        )
                    plot_confusion_matrix(y_test, pred, f"{model} Confusion Matrix")
                    st.download_button(
                        label="Download Trained Model",
                        key="trained_model",
                        data=pickle.dumps(clf),
                        file_name=file_name,
                        mime="application/octet-stream",
                    )


def load_csv(file):
    return pd.read_csv(file)


def data_overview(data):
    rows, cols = data.shape
    st.write(f"Number of Rows: {rows}")
    return f"Number of Columns: {cols}"


def missing_data(data):
    missing_values = data.isna().sum()
    missing_values = missing_values[missing_values > 0]
    missing_value_per = (missing_values / data.shape[0]) * 100
    missing_value_per = missing_value_per.round(2).astype(str) + "%"
    missing_df = pd.DataFrame(
        {"Missing Values": missing_values, "Percentage": missing_value_per}
    )
    missing_df_html = missing_df.to_html(
        classes="table table-striped", justify="center"
    )
    return st.markdown(missing_df_html, unsafe_allow_html=True)


def display_data_info(data):
    dtypes = pd.DataFrame(data.dtypes, columns=["Data Type"])
    dtypes.reset_index(inplace=True)
    nunique = pd.DataFrame(data.nunique(), columns=["Unique Counts"])
    nunique.reset_index(inplace=True)
    dtypes.columns = ["Column", "Data Type"]
    nunique.columns = ["Column", "Unique Counts"]
    combined_df = pd.merge(dtypes, nunique, on="Column")
    combined_df_html = combined_df.to_html(
        classes="table table-striped", justify="center"
    )
    return st.markdown(combined_df_html, unsafe_allow_html=True)


def value_counts(data):
    column = st.selectbox("Select a Column", [""] + list(data.columns))
    if column:
        st.write(data[column].value_counts())


def duplicate(data):
    if data.duplicated().any():
        st.write(
            f"There are {data.duplicated().sum()} duplicate row(s) in the DataFrame. "
            "Duplicated rows will be dropped."
        )
        data.drop_duplicates(keep="first", inplace=True)
        return ""
    return "There are no duplicate rows in the DataFrame."
def countplot(data, col):
    plt.figure(figsize=(10, 6))
    sns.countplot(y=data[col], palette=palette[1:], edgecolor="#1c1c1c", linewidth=2)
    plt.title(f"Countplot of {col} Column")
    st.pyplot(plt)


def piechart(data, col):
    value_counts = data[col].value_counts()
    plt.figure(figsize=(8, 6))
    plt.pie(
        value_counts,
        labels=value_counts.index,
        autopct="%1.1f%%",
        colors=palette,
        shadow=False,
        wedgeprops=dict(edgecolor="#1c1c1c"),
    )
    plt.title(f"Pie Chart of {col} Column")
    st.pyplot(plt)


def histogram(data, col):
    plt.figure(figsize=(10, 6))
    sns.histplot(
        data[col],
        kde=True,
        color=palette[4],
        fill=True,
        edgecolor="#1c1c1c",
        linewidth=2,
    )
    plt.title(f"Histogram of {col} Column")
    st.pyplot(plt)


def violinplot(data, col):
    plt.figure(figsize=(10, 6))
    sns.violinplot(data[col], color=palette[8])
    plt.title(f"Violin Plot of {col} Column")
    st.pyplot(plt)


def scatterplot(data, col):
    plt.figure(figsize=(10, 8))
    sns.scatterplot(data[col], color=palette[3])
    plt.title(f"Scatter Plot of {col} Column")
    st.pyplot(plt)


def biscatterplot(data, cols):
    try:
        plt.figure(figsize=(10, 8))
        sns.scatterplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        plt.title(f"Scatter Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(plt)
    except Exception as e:
        st.write(str(e))


def bibarplot(data, cols):
    try:
        plt.figure(figsize=(10, 8))
        sns.barplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        plt.title(f"Bar Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(plt)
    except Exception as e:
        st.write(str(e))


def biboxplot(data, cols):
    try:
        plt.figure(figsize=(10, 8))
        sns.boxplot(data=data, x=cols[0], y=cols[1], palette=palette[1:], linewidth=2)
        plt.title(f"Box Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(plt)
    except Exception as e:
        st.write(str(e))


def paretoplot(data, categorical_col):
    try:
        value_counts = data[categorical_col].value_counts()
        cumulative_percentage = (value_counts / value_counts.sum()).cumsum()
        pareto_df = pd.DataFrame(
            {
                "Categories": value_counts.index,
                "Frequency": value_counts.values,
                "Cumulative Percentage": cumulative_percentage.values * 100,
            }
        )
        pareto_df = pareto_df.sort_values(by="Frequency", ascending=False)
        # Bars show frequencies; the line on the twin axis shows the running percentage.
        fig, ax1 = plt.subplots(figsize=(10, 8))
        ax1.bar(
            pareto_df["Categories"],
            pareto_df["Frequency"],
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            pareto_df["Categories"],
            pareto_df["Cumulative Percentage"],
            color=palette[3],
            marker="D",
            ms=10,
        )
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)
    except Exception:
        pass


def plot_confusion_matrix(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(title)
    st.pyplot(plt)


if __name__ == "__main__":
    main()