diff --git "a/notebook.ipynb" "b/notebook.ipynb" new file mode 100644--- /dev/null +++ "b/notebook.ipynb" @@ -0,0 +1,18475 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Iris Flower Species Prediction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hide_input": true + }, + "source": [ + "### 1.1 Business Understanding / Project Objective" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The objective of the project is to build a machine learning model that predicts the species of an iris flower when given the lengths and widths of the flower's sepals and petals.\n", + "\n", + "This challenge is part of the requirements for the Slightly Techie community." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Data Understanding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 150 rows and 5 columns. The columns in the dataset are described below:\n", + "\n", + "- *sepal_length*: the length of the flower's sepals\n", + "- *sepal_width*: the width of the flower's sepals\n", + "- *petal_length*: the length of the flower's petals\n", + "- *petal_width*: the width of the flower's petals\n", + "- *species*: the species of the flower" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.0 Toolbox Loading" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "hide_input": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading complete. 
Warnings hidden.\n" + ] + } + ], + "source": [ + "# Data Manipulation\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# Visualization\n", + "import matplotlib.pyplot as plt\n", + "import plotly.express as px\n", + "import seaborn as sns\n", + "\n", + "# Warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\") # Hiding the warnings\n", + "\n", + "# Modelling\n", + "from sklearn import metrics\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import *\n", + "from sklearn.model_selection import *\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "import xgboost as xgb\n", + "from xgboost import *\n", + "import lightgbm as lgb\n", + "from catboost import CatBoostClassifier\n", + "\n", + "# Additional libraries\n", + "import sweetviz as sv\n", + "import os\n", + "import pickle\n", + "\n", + "\n", + "print(\"Loading complete.\", \"Warnings hidden.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Removing the restriction on columns to display\n", + "pd.set_option(\"display.max_columns\", None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.0 Data Exploration" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | sepal_length | \n", + "sepal_width | \n", + "petal_length | \n", + "petal_width | \n", + "species | \n", + "
---|---|---|---|---|---|
0 | \n", + "5.1 | \n", + "3.5 | \n", + "1.4 | \n", + "0.2 | \n", + "Iris-setosa | \n", + "
1 | \n", + "4.9 | \n", + "3.0 | \n", + "1.4 | \n", + "0.2 | \n", + "Iris-setosa | \n", + "
2 | \n", + "4.7 | \n", + "3.2 | \n", + "1.3 | \n", + "0.2 | \n", + "Iris-setosa | \n", + "
3 | \n", + "4.6 | \n", + "3.1 | \n", + "1.5 | \n", + "0.2 | \n", + "Iris-setosa | \n", + "
4 | \n", + "5.0 | \n", + "3.6 | \n", + "1.4 | \n", + "0.2 | \n", + "Iris-setosa | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
145 | \n", + "6.7 | \n", + "3.0 | \n", + "5.2 | \n", + "2.3 | \n", + "Iris-virginica | \n", + "
146 | \n", + "6.3 | \n", + "2.5 | \n", + "5.0 | \n", + "1.9 | \n", + "Iris-virginica | \n", + "
147 | \n", + "6.5 | \n", + "3.0 | \n", + "5.2 | \n", + "2.0 | \n", + "Iris-virginica | \n", + "
148 | \n", + "6.2 | \n", + "3.4 | \n", + "5.4 | \n", + "2.3 | \n", + "Iris-virginica | \n", + "
149 | \n", + "5.9 | \n", + "3.0 | \n", + "5.1 | \n", + "1.8 | \n", + "Iris-virginica | \n", + "
150 rows × 5 columns
\n", + "\n", + " | sepal_length | \n", + "sepal_width | \n", + "petal_length | \n", + "petal_width | \n", + "species | \n", + "
---|---|---|---|---|---|
34 | \n", + "4.9 | \n", + "3.1 | \n", + "1.5 | \n", + "0.1 | \n", + "Iris-setosa | \n", + "
37 | \n", + "4.9 | \n", + "3.1 | \n", + "1.5 | \n", + "0.1 | \n", + "Iris-setosa | \n", + "
142 | \n", + "5.8 | \n", + "2.7 | \n", + "5.1 | \n", + "1.9 | \n", + "Iris-virginica | \n", + "
\n", + " | sepal_length | \n", + "sepal_width | \n", + "petal_length | \n", + "petal_width | \n", + "
---|---|---|---|---|
count | \n", + "150.000000 | \n", + "150.000000 | \n", + "150.000000 | \n", + "150.000000 | \n", + "
mean | \n", + "5.843333 | \n", + "3.054000 | \n", + "3.758667 | \n", + "1.198667 | \n", + "
std | \n", + "0.828066 | \n", + "0.433594 | \n", + "1.764420 | \n", + "0.763161 | \n", + "
min | \n", + "4.300000 | \n", + "2.000000 | \n", + "1.000000 | \n", + "0.100000 | \n", + "
25% | \n", + "5.100000 | \n", + "2.800000 | \n", + "1.600000 | \n", + "0.300000 | \n", + "
50% | \n", + "5.800000 | \n", + "3.000000 | \n", + "4.350000 | \n", + "1.300000 | \n", + "
75% | \n", + "6.400000 | \n", + "3.300000 | \n", + "5.100000 | \n", + "1.800000 | \n", + "
max | \n", + "7.900000 | \n", + "4.400000 | \n", + "6.900000 | \n", + "2.500000 | \n", + "
\n", + " | sepal_length | \n", + "sepal_width | \n", + "petal_length | \n", + "petal_width | \n", + "species | \n", + "
---|---|---|---|---|---|
0 | \n", + "5.1 | \n", + "3.5 | \n", + "1.4 | \n", + "0.2 | \n", + "0 | \n", + "
1 | \n", + "4.9 | \n", + "3.0 | \n", + "1.4 | \n", + "0.2 | \n", + "0 | \n", + "
2 | \n", + "4.7 | \n", + "3.2 | \n", + "1.3 | \n", + "0.2 | \n", + "0 | \n", + "
3 | \n", + "4.6 | \n", + "3.1 | \n", + "1.5 | \n", + "0.2 | \n", + "0 | \n", + "
4 | \n", + "5.0 | \n", + "3.6 | \n", + "1.4 | \n", + "0.2 | \n", + "0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
145 | \n", + "6.7 | \n", + "3.0 | \n", + "5.2 | \n", + "2.3 | \n", + "2 | \n", + "
146 | \n", + "6.3 | \n", + "2.5 | \n", + "5.0 | \n", + "1.9 | \n", + "2 | \n", + "
147 | \n", + "6.5 | \n", + "3.0 | \n", + "5.2 | \n", + "2.0 | \n", + "2 | \n", + "
148 | \n", + "6.2 | \n", + "3.4 | \n", + "5.4 | \n", + "2.3 | \n", + "2 | \n", + "
149 | \n", + "5.9 | \n", + "3.0 | \n", + "5.1 | \n", + "1.8 | \n", + "2 | \n", + "
150 rows × 5 columns
\n", + "\n", + " | precision | \n", + "recall | \n", + "f1_weighted | \n", + "accuracy | \n", + "
---|---|---|---|---|
model | \n", + "\n", + " | \n", + " | \n", + " | \n", + " |
Decision Tree | \n", + "0.978947 | \n", + "0.977778 | \n", + "0.977595 | \n", + "0.977778 | \n", + "
XGBoost | \n", + "0.978947 | \n", + "0.977778 | \n", + "0.977595 | \n", + "0.977778 | \n", + "
LightGBM | \n", + "0.978947 | \n", + "0.977778 | \n", + "0.977595 | \n", + "0.977778 | \n", + "
Logistic Regression | \n", + "0.955556 | \n", + "0.955556 | \n", + "0.955556 | \n", + "0.955556 | \n", + "
Random Forest | \n", + "0.955556 | \n", + "0.955556 | \n", + "0.955556 | \n", + "0.955556 | \n", + "
CatBoost | \n", + "0.955556 | \n", + "0.955556 | \n", + "0.955556 | \n", + "0.955556 | \n", + "
RandomizedSearchCV(cv=15,\n", + " estimator=XGBClassifier(base_score=None, booster=None,\n", + " callbacks=None,\n", + " colsample_bylevel=None,\n", + " colsample_bynode=None,\n", + " colsample_bytree=None,\n", + " early_stopping_rounds=None,\n", + " enable_categorical=False,\n", + " eval_metric=None, feature_types=None,\n", + " gamma=None, gpu_id=None,\n", + " grow_policy=None,\n", + " importance_type=None,\n", + " interaction_constraints=None,\n", + " learning_rat...\n", + " monotone_constraints=None,\n", + " n_estimators=100, n_jobs=None,\n", + " num_parallel_tree=None,\n", + " objective='multi:softprob',\n", + " predictor=None, ...),\n", + " n_iter=30, n_jobs=-1,\n", + " param_distributions={'booster': ['gbtree', 'gblinear',\n", + " 'dart'],\n", + " 'colsample_bytree': [0.1, 0.3, 0.5,\n", + " 0.7],\n", + " 'learning_rate': [0.1, 0.3, 0.5, 0.7,\n", + " 1.0],\n", + " 'max_depth': [5, 10, 15, 20, 25, 30,\n", + " 35],\n", + " 'n_estimators': [5, 10, 20, 50, 80,\n", + " 100]},\n", + " random_state=24)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=15,\n", + " estimator=XGBClassifier(base_score=None, booster=None,\n", + " callbacks=None,\n", + " colsample_bylevel=None,\n", + " colsample_bynode=None,\n", + " colsample_bytree=None,\n", + " early_stopping_rounds=None,\n", + " enable_categorical=False,\n", + " eval_metric=None, feature_types=None,\n", + " gamma=None, gpu_id=None,\n", + " grow_policy=None,\n", + " importance_type=None,\n", + " interaction_constraints=None,\n", + " learning_rat...\n", + " monotone_constraints=None,\n", + " n_estimators=100, n_jobs=None,\n", + " num_parallel_tree=None,\n", + " objective='multi:softprob',\n", + " predictor=None, ...),\n", + " n_iter=30, n_jobs=-1,\n", + " param_distributions={'booster': ['gbtree', 'gblinear',\n", + " 'dart'],\n", + " 'colsample_bytree': [0.1, 0.3, 0.5,\n", + " 0.7],\n", + " 'learning_rate': [0.1, 0.3, 0.5, 0.7,\n", + " 1.0],\n", + " 'max_depth': [5, 10, 15, 20, 25, 30,\n", + " 35],\n", + " 'n_estimators': [5, 10, 20, 50, 80,\n", + " 100]},\n", + " random_state=24)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " n_estimators=100, n_jobs=None, num_parallel_tree=None,\n", + " objective='multi:softprob', predictor=None, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " n_estimators=100, n_jobs=None, num_parallel_tree=None,\n", + " objective='multi:softprob', predictor=None, ...)