{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Iris Flower Species Prediction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.0 Introduction" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true }, "source": [ "### 1.1 Business Understanding / Project Objective" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The objective of the project is to build a machine learning model that predicts the species of an iris flower when given the lengths and widths of the flower's sepals and petals.\n", "\n", "This challenge is part of the requirements for the Slightly Techie community." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2 Data Understanding" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset contains 150 rows and 5 columns. The columns in the dataset are described below:\n", "\n", "- *sepal_length*: the length of the flower's sepals\n", "- *sepal_width*: the width of the flower's sepals\n", "- *petal_length*: the length of the flower's petals\n", "- *petal_width*: the width of the flower's petals\n", "- *species*: the species of the flower" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.0 Toolbox Loading" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "hide_input": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading complete. 
Warnings hidden.\n" ] } ], "source": [ "# Data Manipulation\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# Visualization\n", "import matplotlib.pyplot as plt\n", "import plotly.express as px\n", "import seaborn as sns\n", "\n", "# Warnings\n", "import warnings\n", "warnings.filterwarnings(\"ignore\") # Hiding the warnings\n", "\n", "# Modelling\n", "from sklearn import metrics\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import *\n", "from sklearn.model_selection import *\n", "from sklearn.preprocessing import MinMaxScaler\n", "from sklearn.tree import DecisionTreeClassifier\n", "import xgboost as xgb\n", "from xgboost import *\n", "import lightgbm as lgb\n", "from catboost import CatBoostClassifier\n", "\n", "# Additional libraries\n", "import sweetviz as sv\n", "import os\n", "import pickle\n", "\n", "\n", "print(\"Loading complete.\", \"Warnings hidden.\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Removing the restriction on columns to display\n", "pd.set_option(\"display.max_columns\", None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.0 Data Exploration" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | sepal_length | \n", "sepal_width | \n", "petal_length | \n", "petal_width | \n", "species | \n", "
---|---|---|---|---|---|
0 | \n", "5.1 | \n", "3.5 | \n", "1.4 | \n", "0.2 | \n", "Iris-setosa | \n", "
1 | \n", "4.9 | \n", "3.0 | \n", "1.4 | \n", "0.2 | \n", "Iris-setosa | \n", "
2 | \n", "4.7 | \n", "3.2 | \n", "1.3 | \n", "0.2 | \n", "Iris-setosa | \n", "
3 | \n", "4.6 | \n", "3.1 | \n", "1.5 | \n", "0.2 | \n", "Iris-setosa | \n", "
4 | \n", "5.0 | \n", "3.6 | \n", "1.4 | \n", "0.2 | \n", "Iris-setosa | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
145 | \n", "6.7 | \n", "3.0 | \n", "5.2 | \n", "2.3 | \n", "Iris-virginica | \n", "
146 | \n", "6.3 | \n", "2.5 | \n", "5.0 | \n", "1.9 | \n", "Iris-virginica | \n", "
147 | \n", "6.5 | \n", "3.0 | \n", "5.2 | \n", "2.0 | \n", "Iris-virginica | \n", "
148 | \n", "6.2 | \n", "3.4 | \n", "5.4 | \n", "2.3 | \n", "Iris-virginica | \n", "
149 | \n", "5.9 | \n", "3.0 | \n", "5.1 | \n", "1.8 | \n", "Iris-virginica | \n", "
150 rows × 5 columns
\n", "\n", " | sepal_length | \n", "sepal_width | \n", "petal_length | \n", "petal_width | \n", "species | \n", "
---|---|---|---|---|---|
34 | \n", "4.9 | \n", "3.1 | \n", "1.5 | \n", "0.1 | \n", "Iris-setosa | \n", "
37 | \n", "4.9 | \n", "3.1 | \n", "1.5 | \n", "0.1 | \n", "Iris-setosa | \n", "
142 | \n", "5.8 | \n", "2.7 | \n", "5.1 | \n", "1.9 | \n", "Iris-virginica | \n", "
\n", " | sepal_length | \n", "sepal_width | \n", "petal_length | \n", "petal_width | \n", "
---|---|---|---|---|
count | \n", "150.000000 | \n", "150.000000 | \n", "150.000000 | \n", "150.000000 | \n", "
mean | \n", "5.843333 | \n", "3.054000 | \n", "3.758667 | \n", "1.198667 | \n", "
std | \n", "0.828066 | \n", "0.433594 | \n", "1.764420 | \n", "0.763161 | \n", "
min | \n", "4.300000 | \n", "2.000000 | \n", "1.000000 | \n", "0.100000 | \n", "
25% | \n", "5.100000 | \n", "2.800000 | \n", "1.600000 | \n", "0.300000 | \n", "
50% | \n", "5.800000 | \n", "3.000000 | \n", "4.350000 | \n", "1.300000 | \n", "
75% | \n", "6.400000 | \n", "3.300000 | \n", "5.100000 | \n", "1.800000 | \n", "
max | \n", "7.900000 | \n", "4.400000 | \n", "6.900000 | \n", "2.500000 | \n", "
\n", " | sepal_length | \n", "sepal_width | \n", "petal_length | \n", "petal_width | \n", "species | \n", "
---|---|---|---|---|---|
0 | \n", "5.1 | \n", "3.5 | \n", "1.4 | \n", "0.2 | \n", "0 | \n", "
1 | \n", "4.9 | \n", "3.0 | \n", "1.4 | \n", "0.2 | \n", "0 | \n", "
2 | \n", "4.7 | \n", "3.2 | \n", "1.3 | \n", "0.2 | \n", "0 | \n", "
3 | \n", "4.6 | \n", "3.1 | \n", "1.5 | \n", "0.2 | \n", "0 | \n", "
4 | \n", "5.0 | \n", "3.6 | \n", "1.4 | \n", "0.2 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
145 | \n", "6.7 | \n", "3.0 | \n", "5.2 | \n", "2.3 | \n", "2 | \n", "
146 | \n", "6.3 | \n", "2.5 | \n", "5.0 | \n", "1.9 | \n", "2 | \n", "
147 | \n", "6.5 | \n", "3.0 | \n", "5.2 | \n", "2.0 | \n", "2 | \n", "
148 | \n", "6.2 | \n", "3.4 | \n", "5.4 | \n", "2.3 | \n", "2 | \n", "
149 | \n", "5.9 | \n", "3.0 | \n", "5.1 | \n", "1.8 | \n", "2 | \n", "
150 rows × 5 columns
\n", "\n", " | precision | \n", "recall | \n", "f1_weighted | \n", "accuracy | \n", "
---|---|---|---|---|
model | \n", "\n", " | \n", " | \n", " | \n", " |
Decision Tree | \n", "0.978947 | \n", "0.977778 | \n", "0.977595 | \n", "0.977778 | \n", "
XGBoost | \n", "0.978947 | \n", "0.977778 | \n", "0.977595 | \n", "0.977778 | \n", "
LightGBM | \n", "0.978947 | \n", "0.977778 | \n", "0.977595 | \n", "0.977778 | \n", "
Logistic Regression | \n", "0.955556 | \n", "0.955556 | \n", "0.955556 | \n", "0.955556 | \n", "
Random Forest | \n", "0.955556 | \n", "0.955556 | \n", "0.955556 | \n", "0.955556 | \n", "
CatBoost | \n", "0.955556 | \n", "0.955556 | \n", "0.955556 | \n", "0.955556 | \n", "
RandomizedSearchCV(cv=15,\n", " estimator=XGBClassifier(base_score=None, booster=None,\n", " callbacks=None,\n", " colsample_bylevel=None,\n", " colsample_bynode=None,\n", " colsample_bytree=None,\n", " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None,\n", " grow_policy=None,\n", " importance_type=None,\n", " interaction_constraints=None,\n", " learning_rat...\n", " monotone_constraints=None,\n", " n_estimators=100, n_jobs=None,\n", " num_parallel_tree=None,\n", " objective='multi:softprob',\n", " predictor=None, ...),\n", " n_iter=30, n_jobs=-1,\n", " param_distributions={'booster': ['gbtree', 'gblinear',\n", " 'dart'],\n", " 'colsample_bytree': [0.1, 0.3, 0.5,\n", " 0.7],\n", " 'learning_rate': [0.1, 0.3, 0.5, 0.7,\n", " 1.0],\n", " 'max_depth': [5, 10, 15, 20, 25, 30,\n", " 35],\n", " 'n_estimators': [5, 10, 20, 50, 80,\n", " 100]},\n", " random_state=24)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=15,\n", " estimator=XGBClassifier(base_score=None, booster=None,\n", " callbacks=None,\n", " colsample_bylevel=None,\n", " colsample_bynode=None,\n", " colsample_bytree=None,\n", " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None,\n", " grow_policy=None,\n", " importance_type=None,\n", " interaction_constraints=None,\n", " learning_rat...\n", " monotone_constraints=None,\n", " n_estimators=100, n_jobs=None,\n", " num_parallel_tree=None,\n", " objective='multi:softprob',\n", " predictor=None, ...),\n", " n_iter=30, n_jobs=-1,\n", " param_distributions={'booster': ['gbtree', 'gblinear',\n", " 'dart'],\n", " 'colsample_bytree': [0.1, 0.3, 0.5,\n", " 0.7],\n", " 'learning_rate': [0.1, 0.3, 0.5, 0.7,\n", " 1.0],\n", " 'max_depth': [5, 10, 15, 20, 25, 30,\n", " 35],\n", " 'n_estimators': [5, 10, 20, 50, 80,\n", " 100]},\n", " random_state=24)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=None, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=None, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=100, n_jobs=None, num_parallel_tree=None,\n", " objective='multi:softprob', predictor=None, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=None, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=None, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=100, n_jobs=None, num_parallel_tree=None,\n", " objective='multi:softprob', predictor=None, ...)