"""Streamlit app: load a tabular dataset, clean it, explore it and fit regressors.

Streamlit re-executes this script from top to bottom on every widget
interaction.  Anything that has to outlive a single run (the loaded data, the
chosen cleaning strategy, the trained model and its metrics) is therefore kept
in ``st.session_state`` instead of in local variables.
"""
import io
from typing import Optional

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

MODEL_OPTIONS = ["Linear Regression", "Random Forest Regression", "Lasso Regression"]

MISSING_STRATEGIES = [
    "Drop Missing Values",
    "Fill with Mean",
    "Fill with Median",
    "Fill with Mode",
    "Do Nothing",
]

# Message shown while a given missing-value strategy is active.
_STRATEGY_MESSAGES = {
    "Drop Missing Values": "Dropped rows with missing values.",
    "Fill with Mean": "Filled missing values with the mean.",
    "Fill with Median": "Filled missing values with the median.",
    "Fill with Mode": "Filled missing values with the mode for categorical columns.",
    "Do Nothing": "No changes made to missing values.",
}

# Search space handed to GridSearchCV for each model.
_PARAM_GRIDS = {
    "Linear Regression": {'fit_intercept': [True, False]},
    "Random Forest Regression": {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
    },
    "Lasso Regression": {
        'alpha': [0.01, 0.1, 1, 10],
        'max_iter': [1000, 5000, 10000],
    },
}

# Session-state keys owned by this app; all start out as None.
_STATE_KEYS = (
    "data",
    "data_source",
    "applied_strategy",
    "model_plot",
    "model_metrics",
    "model_label",
    "model_bytes",
    "model_file_name",
)

# Keys that describe a trained model and must be dropped when the data changes.
_MODEL_STATE_KEYS = (
    "applied_strategy",
    "model_plot",
    "model_metrics",
    "model_label",
    "model_bytes",
    "model_file_name",
)


def _load_data(uploaded_file) -> Optional[pd.DataFrame]:
    """Read the uploaded file, or the bundled ``example.csv`` when none is given.

    Any problem is reported to the user with ``st.error``.

    Returns:
        The loaded frame, or ``None`` when nothing usable could be read.
    """
    if uploaded_file is None:
        try:
            return pd.read_csv('example.csv')  # Load example CSV
        except FileNotFoundError:
            st.error("Example CSV file not found. Please upload your own CSV or Excel file.")
        except pd.errors.EmptyDataError:
            st.error("Example CSV file is empty or invalid.")
        return None

    # Compare case-insensitively so "DATA.CSV" is accepted as well.
    file_name = uploaded_file.name.lower()
    try:
        if file_name.endswith('.csv'):
            data = pd.read_csv(uploaded_file)
        elif file_name.endswith(('.xlsx', '.xls')):
            data = pd.read_excel(uploaded_file)
        else:
            st.error("Error in file format. Please ensure the file is a valid CSV or Excel.")
            return None
    except pd.errors.EmptyDataError:
        st.error("The uploaded file is empty or contains no readable data.")
        return None
    except ValueError:
        st.error("Error in file format. Please ensure the file is a valid CSV or Excel.")
        return None
    except Exception as e:  # UI boundary: surface the problem instead of a traceback.
        st.error(f"An error occurred: {e}")
        return None

    if data.empty:
        st.error("Uploaded file is empty. Please upload a valid CSV or Excel file.")
        return None
    return data


def _apply_missing_strategy(data: pd.DataFrame, strategy: str) -> pd.DataFrame:
    """Return a new frame with ``strategy`` applied; ``data`` itself is not modified."""
    if strategy == "Drop Missing Values":
        return data.dropna()
    if strategy == "Fill with Mean":
        # numeric_only: a mean over text columns raises on recent pandas.
        return data.fillna(data.mean(numeric_only=True))
    if strategy == "Fill with Median":
        return data.fillna(data.median(numeric_only=True))
    if strategy == "Fill with Mode":
        filled = data.copy()
        for column in filled.select_dtypes(include=['object', 'category']).columns:
            modes = filled[column].mode()
            if not modes.empty:  # an all-NaN column has no mode to fill with
                # Assign back rather than fillna(inplace=True) on a column
                # selection, which does not reliably update the parent frame.
                filled[column] = filled[column].fillna(modes.iloc[0])
        return filled
    return data


def _build_model(model_option: str):
    """Create an unfitted estimator for one of ``MODEL_OPTIONS``."""
    if model_option == "Linear Regression":
        return LinearRegression()
    if model_option == "Random Forest Regression":
        return RandomForestRegressor(random_state=42)
    if model_option == "Lasso Regression":
        return Lasso()
    raise ValueError(f"Unknown model option: {model_option}")


def _show_figure(fig) -> None:
    """Render a matplotlib figure and close it so figures do not pile up across reruns."""
    st.pyplot(fig)
    plt.close(fig)


# Main function
def main():
    """Render the whole app: data source, preview, cleaning, EDA and modelling."""
    st.set_page_config(page_title="Data Automation-Machine Learning")
    st.title("Machine Learning")

    for key in _STATE_KEYS:
        if key not in st.session_state:
            st.session_state[key] = None

    with st.expander("1: Add Your Data Source"):
        uploaded_file = st.file_uploader(
            "Upload your CSV or Excel file", type=["csv", "xlsx", "xls"]
        )

        if uploaded_file is None:
            source_key = ("example.csv", None)
        else:
            source_key = (uploaded_file.name, uploaded_file.size)

        # Only (re)read the file when the source changed.  A failed load leaves
        # data_source at None so the next run tries again and repeats the error.
        if st.session_state.data_source != source_key:
            loaded = _load_data(uploaded_file)
            st.session_state.data = loaded
            st.session_state.data_source = source_key if loaded is not None else None
            for key in _MODEL_STATE_KEYS:
                st.session_state[key] = None

        raw_data = st.session_state.data
        if raw_data is not None:
            if uploaded_file is None:
                st.info("Loaded example.csv")
            else:
                st.success("File uploaded successfully!")

    if raw_data is None:
        # Without data every later step would fail on an undefined frame.
        st.warning("No data available. Add a data source in step 1 to continue.")
        st.stop()

    data = raw_data

    with st.expander("2: DataSet Preview"):
        st.write("Data Overview")
        st.dataframe(data.head())
        st.write("Data Description")
        st.write(data.describe())
        st.write("Missing Values")
        st.write(data.isnull().sum())
        st.write("Data Types")
        st.write(data.dtypes)

    with st.expander("3: Data Cleaning"):
        st.write("Data Summary Before Cleaning")
        st.write(data.describe())
        st.write("Missing Values Before Cleaning:")
        st.write(data.isnull().sum())

        if st.checkbox("Show Missing Values Heatmap"):
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
            ax.set_title("Missing Values Heatmap")
            _show_figure(fig)

        if st.checkbox("Remove Duplicate Rows"):
            rows_before = data.shape[0]
            data = data.drop_duplicates()
            st.success(f"Removed {rows_before - data.shape[0]} duplicate rows.")

        missing_strategy = st.selectbox(
            "Choose a strategy for handling missing values",
            options=MISSING_STRATEGIES,
        )

        # A button is True only during the run triggered by its click, so the
        # choice is remembered and re-applied on every later run; otherwise the
        # cleaning would silently vanish as soon as another widget is touched.
        if st.button("Apply Missing Value Strategy"):
            st.session_state.applied_strategy = missing_strategy

        applied_strategy = st.session_state.applied_strategy
        if applied_strategy is not None:
            data = _apply_missing_strategy(data, applied_strategy)
            notify = st.info if applied_strategy == "Do Nothing" else st.success
            notify(_STRATEGY_MESSAGES[applied_strategy])

        st.write("Data Summary After Cleaning")
        st.write(data.describe())
        st.write("Missing Values After Cleaning:")
        st.write(data.isnull().sum())

    numeric_columns = data.select_dtypes(include='number').columns

    with st.expander('4: EDA'):
        st.write("Correlation Matrix")
        # Correlation is only defined for numeric columns.
        correlation_matrix = data[numeric_columns].corr()
        if correlation_matrix.empty:
            st.info("No numeric columns available for a correlation matrix.")
        else:
            fig = ff.create_annotated_heatmap(
                z=correlation_matrix.round(2).values,
                x=list(correlation_matrix.columns),
                y=list(correlation_matrix.index),
            )
            fig.update_layout(
                title="Correlation Matrix",
                xaxis_title="Features",
                yaxis_title="Features",
                width=700,
                height=500,
            )
            st.plotly_chart(fig)

        if st.checkbox("Show Distribution Plots for Numeric Features"):
            for column in numeric_columns:
                fig, ax = plt.subplots(figsize=(8, 4))
                sns.histplot(data[column], bins=30, kde=True, ax=ax)
                ax.set_title(f'Distribution of {column}')
                _show_figure(fig)

        if st.checkbox("Show Boxplots for Numeric Features"):
            for column in numeric_columns:
                fig, ax = plt.subplots(figsize=(8, 4))
                sns.boxplot(x=data[column], ax=ax)
                ax.set_title(f'Boxplot of {column}')
                _show_figure(fig)

    with st.expander("5: Feature Engineering"):
        target_column = st.selectbox("Select the target variable", options=data.columns)
        feature_columns = st.multiselect(
            "Select features", options=data.columns.drop(target_column)
        )

    with st.expander("6: Modelling"):
        model_option = st.selectbox("Select Regression Model", options=MODEL_OPTIONS)
        # Kept outside the button branch: a widget created inside
        # `if st.button(...)` disappears on the next run, so its value could
        # never be edited.
        model_name = st.text_input('Enter model name', 'my_model')

        if st.button("Train Model (Without Hyperparameter Tuning)"):
            if not feature_columns:
                st.warning("Please select at least one feature in step 5.")
            else:
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        data[feature_columns],
                        data[target_column],
                        test_size=0.2,
                        random_state=42,
                    )
                    model = _build_model(model_option)
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    mse = mean_squared_error(y_test, y_pred)
                    r2 = r2_score(y_test, y_pred)
                except (ValueError, TypeError) as exc:
                    # Typically NaNs, non-numeric columns or too few rows.
                    st.error(f"Model training failed: {exc}")
                else:
                    # Serialise in memory rather than writing a user-named file
                    # into the server's working directory.
                    buffer = io.BytesIO()
                    joblib.dump(model, buffer)
                    st.session_state.model_bytes = buffer.getvalue()
                    st.session_state.model_file_name = (
                        f"{model_name.strip() or 'my_model'}.pkl"
                    )
                    st.session_state.model_label = model_option
                    st.session_state.model_plot = (y_test.reset_index(drop=True), y_pred)
                    st.session_state.model_metrics = (mse, r2)
                    st.success("Model trained successfully!")

        # Everything below is drawn from session state so it survives reruns.
        if st.session_state.model_bytes is not None:
            st.download_button(
                label="Download Model",
                data=st.session_state.model_bytes,
                file_name=st.session_state.model_file_name,
                mime="application/octet-stream",
            )

        if st.session_state.model_plot is not None:
            y_test, y_pred = st.session_state.model_plot
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(y_test, label="True Values", color="blue", linestyle="--")
            ax.plot(y_pred, label="Predicted Values", color="orange")
            # Use the label of the model that was trained, not the current selection.
            ax.set_title(f'{st.session_state.model_label}: True Values vs Predictions')
            ax.set_xlabel('Index')
            ax.set_ylabel('Values')
            ax.legend()
            _show_figure(fig)

        if st.session_state.model_metrics is not None:
            mse, r2 = st.session_state.model_metrics
            st.success(f"Mean Squared Error: {mse:.2f}")
            st.success(f"R^2 Score: {r2:.2f}")

    with st.expander("7: HyperParameter"):
        if feature_columns:
            hyperparam_model_option = st.selectbox(
                "Select Model for Hyperparameter Tuning", options=MODEL_OPTIONS
            )
            param_grid = _PARAM_GRIDS[hyperparam_model_option]

            if st.button("Train Model with Hyperparameter Tuning"):
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        data[feature_columns],
                        data[target_column],
                        test_size=0.2,
                        random_state=42,
                    )
                    grid_search = GridSearchCV(
                        _build_model(hyperparam_model_option), param_grid, cv=5
                    )
                    grid_search.fit(X_train, y_train)
                    test_r2 = grid_search.score(X_test, y_test)
                except (ValueError, TypeError) as exc:
                    st.error(f"Hyperparameter tuning failed: {exc}")
                else:
                    st.success(f"Best Hyperparameters: {grid_search.best_params_}")
                    st.success(f"R^2 Score on held-out test set: {test_r2:.2f}")


# Run the application
if __name__ == '__main__':
    main()