File size: 10,396 Bytes
6c88ced
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63a5421
8ff1bfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c88ced
 
bd4548a
 
 
 
 
 
 
 
 
6c88ced
 
8ff1bfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c88ced
8ff1bfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c88ced
 
8ff1bfe
 
 
 
 
 
6c88ced
8ff1bfe
 
 
6c88ced
8ff1bfe
 
 
 
 
 
 
 
 
6c88ced
8ff1bfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c88ced
 
8ff1bfe
 
6c88ced
8ff1bfe
 
6c88ced
8ff1bfe
 
 
 
 
 
 
 
 
 
6c88ced
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import plotly.express as px
import plotly.figure_factory as ff

# Labels offered by both model selectors; kept in one place so the
# "Modelling" and "HyperParameter" sections cannot drift apart.
_MODEL_OPTIONS = ["Linear Regression", "Random Forest Regression", "Lasso Regression"]

# Search space handed to GridSearchCV for each model label.
_PARAM_GRIDS = {
    "Linear Regression": {'fit_intercept': [True, False]},
    "Random Forest Regression": {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
    },
    "Lasso Regression": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 5000, 10000]},
}

# Message shown once a missing-value strategy has been applied.
_MISSING_VALUE_MESSAGES = {
    "Drop Missing Values": "Dropped rows with missing values.",
    "Fill with Mean": "Filled missing values with the mean.",
    "Fill with Median": "Filled missing values with the median.",
    "Fill with Mode": "Filled missing values with the mode for categorical columns.",
    "Do Nothing": "No changes made to missing values.",
}


def _load_data(uploaded_file):
    """Return the DataFrame to work on, or None when nothing usable was loaded.

    With no upload, ``example.csv`` is read from the working directory.
    Every failure is reported through ``st.error`` and yields ``None`` so the
    caller never touches an unbound ``data`` variable.
    """
    if uploaded_file is None:
        try:
            data = pd.read_csv('example.csv')  # Load example CSV
        except FileNotFoundError:
            st.error("Example CSV file not found. Please upload your own CSV or Excel file.")
            return None
        except pd.errors.EmptyDataError:
            st.error("Example CSV file is empty or invalid.")
            return None
        st.info("Loaded example.csv")
        return data

    # Compare extensions case-insensitively so "DATA.CSV" is accepted too.
    file_name = uploaded_file.name.lower()
    try:
        if file_name.endswith('.csv'):
            data = pd.read_csv(uploaded_file)
        elif file_name.endswith(('.xlsx', '.xls')):
            data = pd.read_excel(uploaded_file)
        else:
            st.error("Error in file format. Please ensure the file is a valid CSV or Excel.")
            return None
    except pd.errors.EmptyDataError:
        # Must precede ValueError: EmptyDataError is a ValueError subclass.
        st.error("The uploaded file is empty or contains no readable data.")
        return None
    except ValueError:
        st.error("Error in file format. Please ensure the file is a valid CSV or Excel.")
        return None
    except Exception as e:  # UI boundary: report instead of crashing the app
        st.error(f"An error occurred: {e}")
        return None

    # Check if the file has content
    if data.empty:
        st.error("Uploaded file is empty. Please upload a valid CSV or Excel file.")
        return None

    st.success("File uploaded successfully!")
    return data


def _apply_missing_strategy(data, strategy):
    """Return a copy of *data* with the chosen missing-value strategy applied.

    Mean and median are computed over numeric columns only; the mode is
    filled for object-dtype columns only. Unknown strategies (and
    "Do Nothing") return *data* unchanged.
    """
    if strategy == "Drop Missing Values":
        return data.dropna()
    if strategy == "Fill with Mean":
        return data.fillna(data.mean(numeric_only=True))
    if strategy == "Fill with Median":
        return data.fillna(data.median(numeric_only=True))
    if strategy == "Fill with Mode":
        filled = data.copy()
        for column in filled.select_dtypes(include=['object']).columns:
            modes = filled[column].mode()
            # An all-NaN column has no mode; leave it untouched.
            if not modes.empty:
                filled[column] = filled[column].fillna(modes.iloc[0])
        return filled
    return data


def _build_model(model_label):
    """Return a fresh, unfitted estimator for one of the _MODEL_OPTIONS labels."""
    if model_label == "Linear Regression":
        return LinearRegression()
    if model_label == "Random Forest Regression":
        return RandomForestRegressor(random_state=42)
    if model_label == "Lasso Regression":
        return Lasso()
    raise KeyError(f"Unknown model: {model_label}")


def _safe_model_file_name(model_name):
    """Turn a user-typed model name into a plain ``<name>.pkl`` file name.

    Path separators and other unusual characters are replaced so the name
    cannot point outside the working directory.
    """
    import re

    stem = re.sub(r"[^\w.-]", "_", model_name).strip(".")
    return f"{stem or 'my_model'}.pkl"


def _show_figure(fig):
    """Render a Matplotlib figure and close it so figures do not pile up across reruns."""
    st.pyplot(fig)
    plt.close(fig)


def main():
    """Run the Streamlit regression workflow.

    The page walks through seven expanders: load data, preview, cleaning,
    EDA, feature selection, model training and hyperparameter search.
    Results that must survive a Streamlit rerun (applied cleaning strategy,
    trained-model plot, metrics and download payload) are kept in
    ``st.session_state``.
    """
    st.set_page_config(page_title="Data Automation-Machine Learning")
    st.title("Machine Learning")

    with st.expander("1: Add Your Data Source"):
        uploaded_file = st.file_uploader(
            "Upload your CSV or Excel file- Only numbers", type=["csv", "xlsx", "xls"]
        )
        data = _load_data(uploaded_file)

    # Nothing below can work without data; the error was already shown.
    if data is None:
        st.stop()

    with st.expander("2: DataSet Preview"):
        st.write("Data Overview")
        st.dataframe(data.head())
        st.write("Data Description")
        st.write(data.describe())
        st.write("Missing Values")
        st.write(data.isnull().sum())
        st.write("Data Types")
        st.write(data.dtypes)

    with st.expander("3: Data Cleaning"):
        st.write("Data Summary Before Cleaning")
        st.write(data.describe())
        st.write("Missing Values Before Cleaning:")
        st.write(data.isnull().sum())

        if st.checkbox("Show Missing Values Heatmap"):
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
            ax.set_title("Missing Values Heatmap")
            _show_figure(fig)

        if st.checkbox("Remove Duplicate Rows"):
            rows_before = data.shape[0]
            data = data.drop_duplicates()
            st.success(f"Removed {rows_before - data.shape[0]} duplicate rows.")

        missing_strategy = st.selectbox(
            "Choose a strategy for handling missing values",
            options=["Drop Missing Values", "Fill with Mean", "Fill with Median", "Fill with Mode", "Do Nothing"]
        )

        # A button is only True during the rerun triggered by its click, and
        # the data is reloaded on every rerun. Remember the chosen strategy so
        # the cleaning is still in effect when a model is trained later.
        if st.button("Apply Missing Value Strategy"):
            st.session_state.applied_missing_strategy = missing_strategy

        applied_strategy = st.session_state.get("applied_missing_strategy")
        if applied_strategy is not None:
            data = _apply_missing_strategy(data, applied_strategy)
            message = _MISSING_VALUE_MESSAGES.get(applied_strategy, "")
            if applied_strategy == "Do Nothing":
                st.info(message)
            else:
                st.success(message)

        st.write("Data Summary After Cleaning")
        st.write(data.describe())
        st.write("Missing Values After Cleaning:")
        st.write(data.isnull().sum())

    with st.expander('4: EDA'):
        st.write("Correlation Matrix")
        # Correlation is only defined for numeric columns; text columns
        # would make DataFrame.corr raise on recent pandas versions.
        numeric_data = data.select_dtypes(include="number")
        if numeric_data.shape[1] == 0:
            st.info("No numeric columns available for a correlation matrix.")
        else:
            correlation_matrix = numeric_data.corr()
            fig = ff.create_annotated_heatmap(
                z=correlation_matrix.values,
                x=list(correlation_matrix.columns),
                y=list(correlation_matrix.index),
            )
            fig.update_layout(
                title="Correlation Matrix",
                xaxis_title="Features",
                yaxis_title="Features",
                width=700,
                height=500,
            )
            st.plotly_chart(fig)

        if st.checkbox("Show Distribution Plots for Numeric Features"):
            for column in numeric_data.columns:
                fig, ax = plt.subplots(figsize=(8, 4))
                sns.histplot(data[column], bins=30, kde=True, ax=ax)
                ax.set_title(f'Distribution of {column}')
                _show_figure(fig)

        if st.checkbox("Show Boxplots for Numeric Features"):
            for column in numeric_data.columns:
                fig, ax = plt.subplots(figsize=(8, 4))
                sns.boxplot(x=data[column], ax=ax)
                ax.set_title(f'Boxplot of {column}')
                _show_figure(fig)

    with st.expander("5: Feature Engineering"):
        target_column = st.selectbox("Select the target variable", options=data.columns)
        feature_columns = st.multiselect("Select features", options=data.columns.drop(target_column))

    with st.expander("6: Modelling"):
        for state_key in ('model_plot', 'model_metrics', 'model_title', 'model_download'):
            if state_key not in st.session_state:
                st.session_state[state_key] = None

        model_option = st.selectbox("Select Regression Model", options=_MODEL_OPTIONS)
        # Rendered outside the button branch so the typed name is available
        # on the rerun in which the button reports a click.
        model_name = st.text_input('Enter model name', 'my_model')

        if st.button("Train Model (Without Hyperparameter Tuning)"):
            if not feature_columns:
                st.warning("Select at least one feature in step 5 before training.")
            else:
                model = _build_model(model_option)
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        data[feature_columns], data[target_column], test_size=0.2, random_state=42
                    )
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                except ValueError as error:
                    # Raised by scikit-learn for non-numeric or missing values
                    # and for datasets too small to split.
                    st.error(f"Model training failed: {error}")
                else:
                    model_file_path = _safe_model_file_name(model_name)
                    joblib.dump(model, model_file_path)
                    with open(model_file_path, "rb") as f:
                        st.session_state.model_download = (model_file_path, f.read())
                    st.success("Model saved successfully!")

                    mse = mean_squared_error(y_test, y_pred)
                    r2 = r2_score(y_test, y_pred)

                    st.session_state.model_plot = (y_test.reset_index(drop=True), y_pred)
                    st.session_state.model_metrics = (mse, r2)
                    # Remember which model produced the plot; the selectbox
                    # may show a different model on a later rerun.
                    st.session_state.model_title = model_option

        if st.session_state.model_download is not None:
            download_name, download_payload = st.session_state.model_download
            st.download_button(
                label="Download Model",
                data=download_payload,
                file_name=download_name,
                mime="application/octet-stream"
            )

        if st.session_state.model_plot is not None:
            y_test, y_pred = st.session_state.model_plot
            trained_label = st.session_state.model_title or model_option
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(y_test, label="True Values", color="blue", linestyle="--")
            ax.plot(y_pred, label="Predicted Values", color="orange")
            ax.set_title(f'{trained_label}: True Values vs Predictions')
            ax.set_xlabel('Index')
            ax.set_ylabel('Values')
            ax.legend()
            _show_figure(fig)

            if st.session_state.model_metrics is not None:
                mse, r2 = st.session_state.model_metrics
                st.success(f"Mean Squared Error: {mse:.2f}")
                st.success(f"R^2 Score: {r2:.2f}")

    with st.expander("7: HyperParameter"):
        if not feature_columns:
            st.info("Select at least one feature in step 5 to enable hyperparameter tuning.")
        else:
            hyperparam_model_option = st.selectbox(
                "Select Model for Hyperparameter Tuning", options=_MODEL_OPTIONS
            )
            param_grid = _PARAM_GRIDS[hyperparam_model_option]

            if st.button("Train Model with Hyperparameter Tuning"):
                grid_search = GridSearchCV(_build_model(hyperparam_model_option), param_grid, cv=5)
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        data[feature_columns], data[target_column], test_size=0.2, random_state=42
                    )
                    grid_search.fit(X_train, y_train)
                except ValueError as error:
                    # Also covers training sets with fewer rows than CV folds.
                    st.error(f"Hyperparameter tuning failed: {error}")
                else:
                    best_params = grid_search.best_params_
                    st.success(f"Best Hyperparameters: {best_params}")

# Script entry point: start the app only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()