{ "cells": [ { "cell_type": "markdown", "id": "5e4ce795", "metadata": { "papermill": { "duration": 0.033432, "end_time": "2022-05-11T11:03:04.321959", "exception": false, "start_time": "2022-05-11T11:03:04.288527", "status": "completed" }, "tags": [] }, "source": [ "Problem statement\n", "\n", "The May edition of the 2022 Tabular Playground series is a binary classification problem that includes a number of different feature interactions. This competition is an opportunity to explore various methods for identifying and exploiting these feature interactions." ] }, { "cell_type": "markdown", "id": "af70a983", "metadata": { "papermill": { "duration": 0.032628, "end_time": "2022-05-11T11:03:04.386417", "exception": false, "start_time": "2022-05-11T11:03:04.353789", "status": "completed" }, "tags": [] }, "source": [ "Import libraries" ] }, { "cell_type": "code", "execution_count": 1, "id": "a8932043", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:04.453540Z", "iopub.status.busy": "2022-05-11T11:03:04.452968Z", "iopub.status.idle": "2022-05-11T11:03:05.559143Z", "shell.execute_reply": "2022-05-11T11:03:05.558125Z" }, "papermill": { "duration": 1.143269, "end_time": "2022-05-11T11:03:05.561715", "exception": false, "start_time": "2022-05-11T11:03:04.418446", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 5, "id": "66d2b4ae", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:05.627534Z", "iopub.status.busy": "2022-05-11T11:03:05.627253Z", "iopub.status.idle": "2022-05-11T11:03:05.633896Z", "shell.execute_reply": "2022-05-11T11:03:05.633045Z" }, "papermill": { "duration": 0.043253, "end_time": "2022-05-11T11:03:05.637363", "exception": false, "start_time": "2022-05-11T11:03:05.594110", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "../coal-price-data/binary_classification/sample_submission.csv\n", "../coal-price-data/binary_classification/train.csv\n", "../coal-price-data/binary_classification/test.csv\n" ] } ], "source": [ "import os\n", "\n", "path = \"../coal-price-data/binary_classification\"\n", "for dirname, _, filenames in os.walk(path):\n", " for filename in filenames:\n", " print(os.path.join(dirname, filename))" ] }, { "cell_type": "markdown", "id": "be1e26a5", "metadata": { "papermill": { "duration": 0.035619, "end_time": "2022-05-11T11:03:05.708148", "exception": false, "start_time": "2022-05-11T11:03:05.672529", "status": "completed" }, "tags": [] }, "source": [ "Read files" ] }, { "cell_type": "code", "execution_count": 6, "id": "1c56f757", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:05.776270Z", "iopub.status.busy": "2022-05-11T11:03:05.775581Z", "iopub.status.idle": "2022-05-11T11:03:21.346102Z", "shell.execute_reply": "2022-05-11T11:03:21.345294Z" }, "papermill": { "duration": 15.60683, "end_time": "2022-05-11T11:03:21.348390", "exception": false, "start_time": "2022-05-11T11:03:05.741560", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train = pd.read_csv(f\"{path}/train.csv\")\n", "test = pd.read_csv(f\"{path}/test.csv\")\n", "submission = pd.read_csv(f\"{path}/sample_submission.csv\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "7dc3af51", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:21.416502Z", "iopub.status.busy": "2022-05-11T11:03:21.416126Z", "iopub.status.idle": "2022-05-11T11:03:21.717153Z", "shell.execute_reply": "2022-05-11T11:03:21.716495Z" }, "papermill": { "duration": 0.338273, "end_time": "2022-05-11T11:03:21.719304", "exception": false, "start_time": "2022-05-11T11:03:21.381031", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idf_00f_01f_02f_03f_04f_05f_06f_07f_08...f_22f_23f_24f_25f_26f_27f_28f_29f_30target
00-1.3732460.238887-0.2433760.567405-0.6477150.8393260.11313315...-2.5407390.766952-2.730628-0.2081771.363402ABABDADBAB67.609153000
111.697021-1.710322-2.230332-0.5456611.113173-1.5521750.44782513...2.278315-0.633658-1.217077-3.782194-0.058316ACACCADCEB377.096415001
221.6817260.616746-1.0276890.810492-0.6090860.113965-0.70866010...-1.385775-0.520558-0.0091212.788536-3.703488AAAEABCKAD-195.599702021
33-0.118172-0.587835-0.8046382.0868220.371005-0.128831-0.28257532...0.572594-1.6532131.686035-2.533098-0.608601BDBBAACBCB210.826205001
441.148481-0.176567-0.664871-1.1013430.4678750.5001170.40751533...-3.912929-1.4303662.127649-3.3067844.371371BDBCBBCHFE-217.211798011
..................................................................
8999958999951.380145-0.0388840.5971110.8545600.684301-1.0586181.31069921...-1.5947440.5220190.8330472.7141251.290094BABBCBBBED455.033851021
899996899996-1.3697890.0448410.0154580.376565-0.380529-0.830815-1.79845841...2.413899-0.674942-0.412111-0.030436-3.144047BBBGBBDQBE134.703577010
8999978999971.386201-0.9611500.725994-0.1328440.873911-0.245339-1.04578600...-0.151930-4.560773-1.2491541.7935352.253696AEBEDBBHBA-99.536313010
899998899998-1.590572-0.509938-1.715397-0.2499881.3599331.650808-0.05859202...2.4236702.1100080.561271-2.1496101.019982ADBAAADDAE47.823039120
899999899999-0.636210-0.425986-1.826699-0.5987971.589577-0.482298-0.21409371...1.3406963.7623511.797137-0.4128372.090440BCAACADSCE-44.559296021
\n", "

900000 rows × 33 columns

\n", "
" ], "text/plain": [ " id f_00 f_01 f_02 f_03 f_04 f_05 \\\n", "0 0 -1.373246 0.238887 -0.243376 0.567405 -0.647715 0.839326 \n", "1 1 1.697021 -1.710322 -2.230332 -0.545661 1.113173 -1.552175 \n", "2 2 1.681726 0.616746 -1.027689 0.810492 -0.609086 0.113965 \n", "3 3 -0.118172 -0.587835 -0.804638 2.086822 0.371005 -0.128831 \n", "4 4 1.148481 -0.176567 -0.664871 -1.101343 0.467875 0.500117 \n", "... ... ... ... ... ... ... ... \n", "899995 899995 1.380145 -0.038884 0.597111 0.854560 0.684301 -1.058618 \n", "899996 899996 -1.369789 0.044841 0.015458 0.376565 -0.380529 -0.830815 \n", "899997 899997 1.386201 -0.961150 0.725994 -0.132844 0.873911 -0.245339 \n", "899998 899998 -1.590572 -0.509938 -1.715397 -0.249988 1.359933 1.650808 \n", "899999 899999 -0.636210 -0.425986 -1.826699 -0.598797 1.589577 -0.482298 \n", "\n", " f_06 f_07 f_08 ... f_22 f_23 f_24 f_25 \\\n", "0 0.113133 1 5 ... -2.540739 0.766952 -2.730628 -0.208177 \n", "1 0.447825 1 3 ... 2.278315 -0.633658 -1.217077 -3.782194 \n", "2 -0.708660 1 0 ... -1.385775 -0.520558 -0.009121 2.788536 \n", "3 -0.282575 3 2 ... 0.572594 -1.653213 1.686035 -2.533098 \n", "4 0.407515 3 3 ... -3.912929 -1.430366 2.127649 -3.306784 \n", "... ... ... ... ... ... ... ... ... \n", "899995 1.310699 2 1 ... -1.594744 0.522019 0.833047 2.714125 \n", "899996 -1.798458 4 1 ... 2.413899 -0.674942 -0.412111 -0.030436 \n", "899997 -1.045786 0 0 ... -0.151930 -4.560773 -1.249154 1.793535 \n", "899998 -0.058592 0 2 ... 2.423670 2.110008 0.561271 -2.149610 \n", "899999 -0.214093 7 1 ... 1.340696 3.762351 1.797137 -0.412837 \n", "\n", " f_26 f_27 f_28 f_29 f_30 target \n", "0 1.363402 ABABDADBAB 67.609153 0 0 0 \n", "1 -0.058316 ACACCADCEB 377.096415 0 0 1 \n", "2 -3.703488 AAAEABCKAD -195.599702 0 2 1 \n", "3 -0.608601 BDBBAACBCB 210.826205 0 0 1 \n", "4 4.371371 BDBCBBCHFE -217.211798 0 1 1 \n", "... ... ... ... ... ... ... 
\n", "899995 1.290094 BABBCBBBED 455.033851 0 2 1 \n", "899996 -3.144047 BBBGBBDQBE 134.703577 0 1 0 \n", "899997 2.253696 AEBEDBBHBA -99.536313 0 1 0 \n", "899998 1.019982 ADBAAADDAE 47.823039 1 2 0 \n", "899999 2.090440 BCAACADSCE -44.559296 0 2 1 \n", "\n", "[900000 rows x 33 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train" ] }, { "cell_type": "code", "execution_count": 5, "id": "56d9b5e5", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:21.789492Z", "iopub.status.busy": "2022-05-11T11:03:21.788702Z", "iopub.status.idle": "2022-05-11T11:03:22.024768Z", "shell.execute_reply": "2022-05-11T11:03:22.023935Z" }, "papermill": { "duration": 0.272034, "end_time": "2022-05-11T11:03:22.026588", "exception": false, "start_time": "2022-05-11T11:03:21.754554", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idf_00f_01f_02f_03f_04f_05f_06f_07f_08...f_21f_22f_23f_24f_25f_26f_27f_28f_29f_30
09000000.4425170.174380-0.9998160.7627410.186778-1.0747750.50188866...-1.006400-1.193879-2.435736-2.427430-1.9668875.734205BAAABADLAC99.47841900
1900001-0.605598-0.3057150.627667-0.578898-1.7509311.355550-0.19091113...2.3824050.1494421.883322-2.848714-0.7251553.194219AFABBAEGCB-65.99382510
29000020.3039902.4451100.2465150.8182480.359731-1.3318451.35862233...-7.0260981.312277-5.1571921.7140050.5850320.066898BBACABBKEE-87.40562201
39000030.1540530.260126-1.367092-0.093175-1.111034-0.9484811.11922000...-0.594532-3.9394751.754570-2.364007-1.0033203.893099AEBEAACQCC-281.29346000
4900004-1.651904-0.424266-0.667356-0.322124-0.0894620.1817051.78498322...0.084906-0.985736-0.130467-3.5578931.2106871.861884AEBBBBDABF25.62941502
..................................................................
69999515999950.6401100.897808-0.5239561.563760-0.092281-0.6108670.53542601...2.6040481.1228670.5181101.2438370.5751110.076372BCBCEBHMCD204.18653900
6999961599996-0.191771-0.035246-0.1185330.5847502.1269770.568659-0.05266343...3.0298571.384682-1.1357402.982713-1.5117602.225218BAABCADQFC-97.69459102
6999971599997-0.331704-0.328845-1.1855031.022128-0.483099-0.107146-0.96828111...4.021273-1.8452661.096011-2.734508-4.885955-2.248739AAAJCBGQBA130.62274510
6999981599998-2.031073-1.2383980.964699-1.0459500.9060640.634301-0.70747451...1.453864-1.6966061.0189951.973697-0.353068-3.333449BCBBCABNDE-364.62514800
6999991599999-0.085906-0.0021242.2273750.2171453.179153-1.6601880.89198903...-3.549082-4.325318-5.0172210.251268-3.236026-0.362070AFBEBACHFF-155.41734201
\n", "

700000 rows × 32 columns

\n", "
" ], "text/plain": [ " id f_00 f_01 f_02 f_03 f_04 f_05 \\\n", "0 900000 0.442517 0.174380 -0.999816 0.762741 0.186778 -1.074775 \n", "1 900001 -0.605598 -0.305715 0.627667 -0.578898 -1.750931 1.355550 \n", "2 900002 0.303990 2.445110 0.246515 0.818248 0.359731 -1.331845 \n", "3 900003 0.154053 0.260126 -1.367092 -0.093175 -1.111034 -0.948481 \n", "4 900004 -1.651904 -0.424266 -0.667356 -0.322124 -0.089462 0.181705 \n", "... ... ... ... ... ... ... ... \n", "699995 1599995 0.640110 0.897808 -0.523956 1.563760 -0.092281 -0.610867 \n", "699996 1599996 -0.191771 -0.035246 -0.118533 0.584750 2.126977 0.568659 \n", "699997 1599997 -0.331704 -0.328845 -1.185503 1.022128 -0.483099 -0.107146 \n", "699998 1599998 -2.031073 -1.238398 0.964699 -1.045950 0.906064 0.634301 \n", "699999 1599999 -0.085906 -0.002124 2.227375 0.217145 3.179153 -1.660188 \n", "\n", " f_06 f_07 f_08 ... f_21 f_22 f_23 f_24 \\\n", "0 0.501888 6 6 ... -1.006400 -1.193879 -2.435736 -2.427430 \n", "1 -0.190911 1 3 ... 2.382405 0.149442 1.883322 -2.848714 \n", "2 1.358622 3 3 ... -7.026098 1.312277 -5.157192 1.714005 \n", "3 1.119220 0 0 ... -0.594532 -3.939475 1.754570 -2.364007 \n", "4 1.784983 2 2 ... 0.084906 -0.985736 -0.130467 -3.557893 \n", "... ... ... ... ... ... ... ... ... \n", "699995 0.535426 0 1 ... 2.604048 1.122867 0.518110 1.243837 \n", "699996 -0.052663 4 3 ... 3.029857 1.384682 -1.135740 2.982713 \n", "699997 -0.968281 1 1 ... 4.021273 -1.845266 1.096011 -2.734508 \n", "699998 -0.707474 5 1 ... 1.453864 -1.696606 1.018995 1.973697 \n", "699999 0.891989 0 3 ... -3.549082 -4.325318 -5.017221 0.251268 \n", "\n", " f_25 f_26 f_27 f_28 f_29 f_30 \n", "0 -1.966887 5.734205 BAAABADLAC 99.478419 0 0 \n", "1 -0.725155 3.194219 AFABBAEGCB -65.993825 1 0 \n", "2 0.585032 0.066898 BBACABBKEE -87.405622 0 1 \n", "3 -1.003320 3.893099 AEBEAACQCC -281.293460 0 0 \n", "4 1.210687 1.861884 AEBBBBDABF 25.629415 0 2 \n", "... ... ... ... ... ... ... 
\n", "699995 0.575111 0.076372 BCBCEBHMCD 204.186539 0 0 \n", "699996 -1.511760 2.225218 BAABCADQFC -97.694591 0 2 \n", "699997 -4.885955 -2.248739 AAAJCBGQBA 130.622745 1 0 \n", "699998 -0.353068 -3.333449 BCBBCABNDE -364.625148 0 0 \n", "699999 -3.236026 -0.362070 AFBEBACHFF -155.417342 0 1 \n", "\n", "[700000 rows x 32 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test" ] }, { "cell_type": "code", "execution_count": 6, "id": "d6f2b103", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:22.097149Z", "iopub.status.busy": "2022-05-11T11:03:22.096385Z", "iopub.status.idle": "2022-05-11T11:03:22.108954Z", "shell.execute_reply": "2022-05-11T11:03:22.107976Z" }, "papermill": { "duration": 0.049962, "end_time": "2022-05-11T11:03:22.110994", "exception": false, "start_time": "2022-05-11T11:03:22.061032", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtarget
09000000.5
19000010.5
29000020.5
39000030.5
49000040.5
.........
69999515999950.5
69999615999960.5
69999715999970.5
69999815999980.5
69999915999990.5
\n", "

700000 rows × 2 columns

\n", "
" ], "text/plain": [ " id target\n", "0 900000 0.5\n", "1 900001 0.5\n", "2 900002 0.5\n", "3 900003 0.5\n", "4 900004 0.5\n", "... ... ...\n", "699995 1599995 0.5\n", "699996 1599996 0.5\n", "699997 1599997 0.5\n", "699998 1599998 0.5\n", "699999 1599999 0.5\n", "\n", "[700000 rows x 2 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission" ] }, { "cell_type": "markdown", "id": "52d8140f", "metadata": { "papermill": { "duration": 0.035034, "end_time": "2022-05-11T11:03:22.180689", "exception": false, "start_time": "2022-05-11T11:03:22.145655", "status": "completed" }, "tags": [] }, "source": [ "Analyse" ] }, { "cell_type": "code", "execution_count": 7, "id": "68db965c", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:22.251536Z", "iopub.status.busy": "2022-05-11T11:03:22.251122Z", "iopub.status.idle": "2022-05-11T11:03:22.429558Z", "shell.execute_reply": "2022-05-11T11:03:22.428055Z" }, "papermill": { "duration": 0.216562, "end_time": "2022-05-11T11:03:22.431705", "exception": false, "start_time": "2022-05-11T11:03:22.215143", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 900000 entries, 0 to 899999\n", "Data columns (total 33 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 900000 non-null int64 \n", " 1 f_00 900000 non-null float64\n", " 2 f_01 900000 non-null float64\n", " 3 f_02 900000 non-null float64\n", " 4 f_03 900000 non-null float64\n", " 5 f_04 900000 non-null float64\n", " 6 f_05 900000 non-null float64\n", " 7 f_06 900000 non-null float64\n", " 8 f_07 900000 non-null int64 \n", " 9 f_08 900000 non-null int64 \n", " 10 f_09 900000 non-null int64 \n", " 11 f_10 900000 non-null int64 \n", " 12 f_11 900000 non-null int64 \n", " 13 f_12 900000 non-null int64 \n", " 14 f_13 900000 non-null int64 \n", " 15 f_14 900000 non-null int64 \n", " 16 f_15 900000 
non-null int64 \n", " 17 f_16 900000 non-null int64 \n", " 18 f_17 900000 non-null int64 \n", " 19 f_18 900000 non-null int64 \n", " 20 f_19 900000 non-null float64\n", " 21 f_20 900000 non-null float64\n", " 22 f_21 900000 non-null float64\n", " 23 f_22 900000 non-null float64\n", " 24 f_23 900000 non-null float64\n", " 25 f_24 900000 non-null float64\n", " 26 f_25 900000 non-null float64\n", " 27 f_26 900000 non-null float64\n", " 28 f_27 900000 non-null object \n", " 29 f_28 900000 non-null float64\n", " 30 f_29 900000 non-null int64 \n", " 31 f_30 900000 non-null int64 \n", " 32 target 900000 non-null int64 \n", "dtypes: float64(16), int64(16), object(1)\n", "memory usage: 226.6+ MB\n" ] } ], "source": [ "train.info()" ] }, { "cell_type": "code", "execution_count": 8, "id": "ac090171", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:22.505768Z", "iopub.status.busy": "2022-05-11T11:03:22.505490Z", "iopub.status.idle": "2022-05-11T11:03:23.698698Z", "shell.execute_reply": "2022-05-11T11:03:23.697778Z" }, "papermill": { "duration": 1.231653, "end_time": "2022-05-11T11:03:23.700854", "exception": false, "start_time": "2022-05-11T11:03:22.469201", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idf_00f_01f_02f_03f_04f_05f_06f_07f_08...f_21f_22f_23f_24f_25f_26f_28f_29f_30target
count900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000...900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000900000.000000
mean449999.500000-0.0002860.0011650.001174-0.001368-0.0005710.000284-0.0007092.0314602.057998...-0.156307-0.009273-0.369459-0.3427380.1765490.357591-0.3808760.3456611.0026540.486488
std259807.7654730.9988880.9991931.0005141.0001751.0001670.9998750.9999421.6561721.590955...2.4847062.4507972.4534052.3869412.4169592.476020238.7730540.4755840.8189890.499818
min0.000000-4.599856-4.682199-4.642676-4.658816-4.748501-4.750214-4.8429190.0000000.000000...-13.310146-11.853530-12.301097-11.416189-11.918306-14.300577-1229.7530520.0000000.0000000.000000
25%224999.750000-0.675490-0.675162-0.674369-0.676114-0.675909-0.673437-0.6748761.0000001.000000...-1.820063-1.645585-2.019739-1.955956-1.440424-1.261598-159.4274180.0000000.0000000.000000
50%449999.5000000.0011440.0020140.002218-0.002227-0.001662-0.000438-0.0014922.0000002.000000...-0.1526680.030850-0.390966-0.3407460.1609120.404212-0.5198080.0000001.0000000.000000
75%674999.2500000.6743370.6750210.6775050.6725440.6737890.6750280.6747493.0000003.000000...1.5070711.6616761.2554081.2666731.7959282.028219158.9873571.0000002.0000001.000000
max899999.0000004.7493014.8156994.9619824.4549204.9489834.9718814.82266815.00000016.000000...14.45542611.34408012.24710012.38984412.52917912.9130411229.5625771.0000002.0000001.000000
\n", "

8 rows × 32 columns

\n", "
" ], "text/plain": [ " id f_00 f_01 f_02 \\\n", "count 900000.000000 900000.000000 900000.000000 900000.000000 \n", "mean 449999.500000 -0.000286 0.001165 0.001174 \n", "std 259807.765473 0.998888 0.999193 1.000514 \n", "min 0.000000 -4.599856 -4.682199 -4.642676 \n", "25% 224999.750000 -0.675490 -0.675162 -0.674369 \n", "50% 449999.500000 0.001144 0.002014 0.002218 \n", "75% 674999.250000 0.674337 0.675021 0.677505 \n", "max 899999.000000 4.749301 4.815699 4.961982 \n", "\n", " f_03 f_04 f_05 f_06 \\\n", "count 900000.000000 900000.000000 900000.000000 900000.000000 \n", "mean -0.001368 -0.000571 0.000284 -0.000709 \n", "std 1.000175 1.000167 0.999875 0.999942 \n", "min -4.658816 -4.748501 -4.750214 -4.842919 \n", "25% -0.676114 -0.675909 -0.673437 -0.674876 \n", "50% -0.002227 -0.001662 -0.000438 -0.001492 \n", "75% 0.672544 0.673789 0.675028 0.674749 \n", "max 4.454920 4.948983 4.971881 4.822668 \n", "\n", " f_07 f_08 ... f_21 f_22 \\\n", "count 900000.000000 900000.000000 ... 900000.000000 900000.000000 \n", "mean 2.031460 2.057998 ... -0.156307 -0.009273 \n", "std 1.656172 1.590955 ... 2.484706 2.450797 \n", "min 0.000000 0.000000 ... -13.310146 -11.853530 \n", "25% 1.000000 1.000000 ... -1.820063 -1.645585 \n", "50% 2.000000 2.000000 ... -0.152668 0.030850 \n", "75% 3.000000 3.000000 ... 1.507071 1.661676 \n", "max 15.000000 16.000000 ... 
14.455426 11.344080 \n", "\n", " f_23 f_24 f_25 f_26 \\\n", "count 900000.000000 900000.000000 900000.000000 900000.000000 \n", "mean -0.369459 -0.342738 0.176549 0.357591 \n", "std 2.453405 2.386941 2.416959 2.476020 \n", "min -12.301097 -11.416189 -11.918306 -14.300577 \n", "25% -2.019739 -1.955956 -1.440424 -1.261598 \n", "50% -0.390966 -0.340746 0.160912 0.404212 \n", "75% 1.255408 1.266673 1.795928 2.028219 \n", "max 12.247100 12.389844 12.529179 12.913041 \n", "\n", " f_28 f_29 f_30 target \n", "count 900000.000000 900000.000000 900000.000000 900000.000000 \n", "mean -0.380876 0.345661 1.002654 0.486488 \n", "std 238.773054 0.475584 0.818989 0.499818 \n", "min -1229.753052 0.000000 0.000000 0.000000 \n", "25% -159.427418 0.000000 0.000000 0.000000 \n", "50% -0.519808 0.000000 1.000000 0.000000 \n", "75% 158.987357 1.000000 2.000000 1.000000 \n", "max 1229.562577 1.000000 2.000000 1.000000 \n", "\n", "[8 rows x 32 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.describe()" ] }, { "cell_type": "markdown", "id": "56e154be", "metadata": { "papermill": { "duration": 0.035465, "end_time": "2022-05-11T11:03:23.773992", "exception": false, "start_time": "2022-05-11T11:03:23.738527", "status": "completed" }, "tags": [] }, "source": [ "Analyse target" ] }, { "cell_type": "code", "execution_count": 9, "id": "8ff4e8b1", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:23.846653Z", "iopub.status.busy": "2022-05-11T11:03:23.846387Z", "iopub.status.idle": "2022-05-11T11:03:26.891259Z", "shell.execute_reply": "2022-05-11T11:03:26.890221Z" }, "papermill": { "duration": 3.083778, "end_time": "2022-05-11T11:03:26.893405", "exception": false, "start_time": "2022-05-11T11:03:23.809627", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.7/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a 
deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", " warnings.warn(msg, FutureWarning)\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "sns.distplot(train['target'])" ] }, { "cell_type": "markdown", "id": "89084ac0", "metadata": { "papermill": { "duration": 0.038491, "end_time": "2022-05-11T11:03:26.971134", "exception": false, "start_time": "2022-05-11T11:03:26.932643", "status": "completed" }, "tags": [] }, "source": [ "Define target" ] }, { "cell_type": "code", "execution_count": 10, "id": "fb918316", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:27.049570Z", "iopub.status.busy": "2022-05-11T11:03:27.049053Z", "iopub.status.idle": "2022-05-11T11:03:27.055599Z", "shell.execute_reply": "2022-05-11T11:03:27.054810Z" }, "papermill": { "duration": 0.049053, "end_time": "2022-05-11T11:03:27.057883", "exception": false, "start_time": "2022-05-11T11:03:27.008830", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 1\n", "2 1\n", "3 1\n", "4 1\n", " ..\n", "899995 1\n", "899996 0\n", "899997 0\n", "899998 0\n", "899999 1\n", "Name: target, Length: 900000, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target = train['target']\n", "target" ] }, { "cell_type": "markdown", "id": "d388de97", "metadata": { "papermill": { "duration": 0.037479, "end_time": "2022-05-11T11:03:27.134251", "exception": false, "start_time": "2022-05-11T11:03:27.096772", "status": "completed" }, "tags": [] }, "source": [ "Combine train and test" ] }, { "cell_type": "code", "execution_count": 11, "id": "06fe1a2d", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:27.213073Z", "iopub.status.busy": "2022-05-11T11:03:27.212416Z", "iopub.status.idle": "2022-05-11T11:03:28.484296Z", "shell.execute_reply": "2022-05-11T11:03:28.483417Z" }, "papermill": { "duration": 1.313462, "end_time": "2022-05-11T11:03:28.486506", "exception": false, "start_time": "2022-05-11T11:03:27.173044", "status": 
"completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
f_00f_01f_02f_03f_04f_05f_06f_07f_08f_09...f_20f_21f_22f_23f_24f_25f_26f_28f_29f_30
0-1.3732460.238887-0.2433760.567405-0.6477150.8393260.113133151...-0.9197173.058541-2.5407390.766952-2.730628-0.2081771.36340267.60915300
11.697021-1.710322-2.230332-0.5456611.113173-1.5521750.447825134...-1.0754342.1790502.278315-0.633658-1.217077-3.782194-0.058316377.09641500
21.6817260.616746-1.0276890.810492-0.6090860.113965-0.708660102...-3.485342-0.784235-1.385775-0.520558-0.0091212.788536-3.703488-195.59970202
3-0.118172-0.587835-0.8046382.0868220.371005-0.128831-0.282575321...-2.100177-2.3438190.572594-1.6532131.686035-2.533098-0.608601210.82620500
41.148481-0.176567-0.664871-1.1013430.4678750.5001170.407515330...0.6050331.133665-3.912929-1.4303662.127649-3.3067844.371371-217.21179801
..................................................................
6999950.6401100.897808-0.5239561.563760-0.092281-0.6108670.535426016...-2.1851902.6040481.1228670.5181101.2438370.5751110.076372204.18653900
699996-0.191771-0.035246-0.1185330.5847502.1269770.568659-0.052663434...-0.2395523.0298571.384682-1.1357402.982713-1.5117602.225218-97.69459102
699997-0.331704-0.328845-1.1855031.022128-0.483099-0.107146-0.968281112...-0.9226264.021273-1.8452661.096011-2.734508-4.885955-2.248739130.62274510
699998-2.031073-1.2383980.964699-1.0459500.9060640.634301-0.707474511...-3.0799961.453864-1.6966061.0189951.973697-0.353068-3.333449-364.62514800
699999-0.085906-0.0021242.2273750.2171453.179153-1.6601880.891989034...-2.128546-3.549082-4.325318-5.0172210.251268-3.236026-0.362070-155.41734201
\n", "

1600000 rows × 30 columns

\n", "
" ], "text/plain": [ " f_00 f_01 f_02 f_03 f_04 f_05 f_06 \\\n", "0 -1.373246 0.238887 -0.243376 0.567405 -0.647715 0.839326 0.113133 \n", "1 1.697021 -1.710322 -2.230332 -0.545661 1.113173 -1.552175 0.447825 \n", "2 1.681726 0.616746 -1.027689 0.810492 -0.609086 0.113965 -0.708660 \n", "3 -0.118172 -0.587835 -0.804638 2.086822 0.371005 -0.128831 -0.282575 \n", "4 1.148481 -0.176567 -0.664871 -1.101343 0.467875 0.500117 0.407515 \n", "... ... ... ... ... ... ... ... \n", "699995 0.640110 0.897808 -0.523956 1.563760 -0.092281 -0.610867 0.535426 \n", "699996 -0.191771 -0.035246 -0.118533 0.584750 2.126977 0.568659 -0.052663 \n", "699997 -0.331704 -0.328845 -1.185503 1.022128 -0.483099 -0.107146 -0.968281 \n", "699998 -2.031073 -1.238398 0.964699 -1.045950 0.906064 0.634301 -0.707474 \n", "699999 -0.085906 -0.002124 2.227375 0.217145 3.179153 -1.660188 0.891989 \n", "\n", " f_07 f_08 f_09 ... f_20 f_21 f_22 f_23 \\\n", "0 1 5 1 ... -0.919717 3.058541 -2.540739 0.766952 \n", "1 1 3 4 ... -1.075434 2.179050 2.278315 -0.633658 \n", "2 1 0 2 ... -3.485342 -0.784235 -1.385775 -0.520558 \n", "3 3 2 1 ... -2.100177 -2.343819 0.572594 -1.653213 \n", "4 3 3 0 ... 0.605033 1.133665 -3.912929 -1.430366 \n", "... ... ... ... ... ... ... ... ... \n", "699995 0 1 6 ... -2.185190 2.604048 1.122867 0.518110 \n", "699996 4 3 4 ... -0.239552 3.029857 1.384682 -1.135740 \n", "699997 1 1 2 ... -0.922626 4.021273 -1.845266 1.096011 \n", "699998 5 1 1 ... -3.079996 1.453864 -1.696606 1.018995 \n", "699999 0 3 4 ... -2.128546 -3.549082 -4.325318 -5.017221 \n", "\n", " f_24 f_25 f_26 f_28 f_29 f_30 \n", "0 -2.730628 -0.208177 1.363402 67.609153 0 0 \n", "1 -1.217077 -3.782194 -0.058316 377.096415 0 0 \n", "2 -0.009121 2.788536 -3.703488 -195.599702 0 2 \n", "3 1.686035 -2.533098 -0.608601 210.826205 0 0 \n", "4 2.127649 -3.306784 4.371371 -217.211798 0 1 \n", "... ... ... ... ... ... ... 
\n", "699995 1.243837 0.575111 0.076372 204.186539 0 0 \n", "699996 2.982713 -1.511760 2.225218 -97.694591 0 2 \n", "699997 -2.734508 -4.885955 -2.248739 130.622745 1 0 \n", "699998 1.973697 -0.353068 -3.333449 -364.625148 0 0 \n", "699999 0.251268 -3.236026 -0.362070 -155.417342 0 1 \n", "\n", "[1600000 rows x 30 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combi = train.drop(['target'], axis=1).append(test)\n", "combi = combi.drop(['id', 'f_27'], axis=1)\n", "combi" ] }, { "cell_type": "markdown", "id": "aa697977", "metadata": { "papermill": { "duration": 0.039281, "end_time": "2022-05-11T11:03:28.565424", "exception": false, "start_time": "2022-05-11T11:03:28.526143", "status": "completed" }, "tags": [] }, "source": [ "Heatmap" ] }, { "cell_type": "code", "execution_count": 12, "id": "0e948e76", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:28.645338Z", "iopub.status.busy": "2022-05-11T11:03:28.645031Z", "iopub.status.idle": "2022-05-11T11:03:33.606584Z", "shell.execute_reply": "2022-05-11T11:03:33.605587Z" }, "papermill": { "duration": 5.004322, "end_time": "2022-05-11T11:03:33.608988", "exception": false, "start_time": "2022-05-11T11:03:28.604666", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "corr = combi.corr()\n", "f, ax = plt.subplots(figsize=(12, 9))\n", "sns.heatmap(corr, vmax=.8, square=True);" ] }, { "cell_type": "code", "execution_count": 13, "id": "ba6957d3", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:33.692201Z", "iopub.status.busy": "2022-05-11T11:03:33.691895Z", "iopub.status.idle": "2022-05-11T11:03:33.715345Z", "shell.execute_reply": "2022-05-11T11:03:33.714306Z" }, "papermill": { "duration": 0.06877, "end_time": "2022-05-11T11:03:33.718205", "exception": false, "start_time": "2022-05-11T11:03:33.649435", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " f_00 f_01 f_02 f_03 f_04 f_05 f_06 \\\n", "f_00 1.000000 -0.000373 -0.000155 -0.001402 -0.000391 -0.000088 -0.001257 \n", "f_01 -0.000373 1.000000 0.001066 0.000231 -0.001221 0.000028 -0.000963 \n", "f_02 -0.000155 0.001066 1.000000 -0.000307 0.000768 -0.000394 0.000661 \n", "f_03 -0.001402 0.000231 -0.000307 1.000000 0.000483 0.001895 -0.000237 \n", "f_04 -0.000391 -0.001221 0.000768 0.000483 1.000000 0.000198 -0.000347 \n", "f_05 -0.000088 0.000028 -0.000394 0.001895 0.000198 1.000000 0.001312 \n", "f_06 -0.001257 -0.000963 0.000661 -0.000237 -0.000347 0.001312 1.000000 \n", "f_07 0.000268 -0.001054 0.000925 0.000644 0.000317 0.000282 -0.000048 \n", "f_08 -0.000945 -0.000522 -0.000284 -0.000512 0.000721 -0.000602 0.000229 \n", "f_09 0.000175 -0.000363 0.000052 0.000678 0.000846 0.000830 0.000046 \n", "f_10 0.000347 0.000363 0.000580 -0.000808 -0.002530 -0.000209 -0.000022 \n", "f_11 0.001084 0.000897 -0.000922 0.000700 0.001119 -0.000094 -0.000209 \n", "f_12 0.000300 -0.000526 0.001041 0.000740 0.001370 0.001240 0.000957 \n", "f_13 0.001492 0.000129 -0.001560 -0.001747 -0.000624 0.000479 0.000286 \n", "f_14 0.001599 0.000820 0.001278 -0.000388 0.000224 0.000056 -0.000079 \n", "f_15 -0.000687 -0.000014 
0.001568 -0.001294 0.000272 0.000727 0.000612 \n", "f_16 0.000163 -0.000759 -0.000873 0.000486 -0.000711 -0.000264 -0.000025 \n", "f_17 -0.000202 -0.000345 0.000392 0.001029 -0.000427 0.000930 0.000511 \n", "f_18 -0.000226 -0.000116 -0.000205 0.000235 0.001201 0.000152 0.000143 \n", "f_19 -0.000310 -0.000941 -0.000755 0.000652 -0.000401 0.000635 0.000661 \n", "f_20 -0.000489 0.001159 0.000051 0.000124 0.000223 0.000278 -0.000120 \n", "f_21 0.000768 -0.000546 -0.000196 0.000156 0.000082 0.000051 -0.000620 \n", "f_22 -0.001141 -0.000207 -0.000636 0.000725 0.000627 0.000474 -0.000243 \n", "f_23 0.000197 -0.000510 0.000123 0.000361 -0.001822 -0.000784 -0.001081 \n", "f_24 0.001522 -0.000183 -0.000447 0.001403 0.000876 -0.000349 -0.001327 \n", "f_25 0.000285 0.000097 -0.000667 0.001246 -0.001457 0.000513 0.001279 \n", "f_26 -0.000172 0.000420 0.000305 -0.000089 -0.000656 0.000433 0.000436 \n", "f_28 0.188822 0.194247 0.208815 0.328754 0.169644 0.296369 0.165917 \n", "f_29 -0.000032 -0.000232 -0.000377 0.000161 0.000141 -0.000608 0.000858 \n", "f_30 -0.000781 0.000817 -0.000630 -0.000484 -0.000664 -0.000723 0.000495 \n", "\n", " f_07 f_08 f_09 ... f_20 f_21 f_22 \\\n", "f_00 0.000268 -0.000945 0.000175 ... -0.000489 0.000768 -0.001141 \n", "f_01 -0.001054 -0.000522 -0.000363 ... 0.001159 -0.000546 -0.000207 \n", "f_02 0.000925 -0.000284 0.000052 ... 0.000051 -0.000196 -0.000636 \n", "f_03 0.000644 -0.000512 0.000678 ... 0.000124 0.000156 0.000725 \n", "f_04 0.000317 0.000721 0.000846 ... 0.000223 0.000082 0.000627 \n", "f_05 0.000282 -0.000602 0.000830 ... 0.000278 0.000051 0.000474 \n", "f_06 -0.000048 0.000229 0.000046 ... -0.000120 -0.000620 -0.000243 \n", "f_07 1.000000 0.113097 0.006419 ... -0.002478 0.001269 0.000464 \n", "f_08 0.113097 1.000000 -0.067602 ... -0.004145 0.004364 -0.001753 \n", "f_09 0.006419 -0.067602 1.000000 ... -0.004613 0.005410 -0.001343 \n", "f_10 -0.089812 -0.085315 -0.050998 ... 
0.003826 -0.003314 0.000885 \n", "f_11 -0.134704 -0.102443 0.007286 ... 0.004513 -0.005434 0.002350 \n", "f_12 0.093957 0.011274 0.035231 ... -0.000851 -0.000410 -0.001399 \n", "f_13 0.059271 0.038777 -0.047612 ... 0.002026 -0.001728 0.000846 \n", "f_14 -0.061668 -0.043743 0.032393 ... 0.001488 -0.002593 -0.001102 \n", "f_15 0.055368 0.010021 -0.000042 ... -0.002765 0.003701 -0.001018 \n", "f_16 0.062274 0.049848 -0.019949 ... 0.001838 -0.001320 0.001152 \n", "f_17 -0.149490 -0.049167 0.005606 ... 0.001073 0.000038 -0.000294 \n", "f_18 0.026738 0.066641 -0.051665 ... -0.001864 0.000212 0.000678 \n", "f_19 -0.003340 -0.005489 -0.010161 ... -0.081401 0.027210 -0.070250 \n", "f_20 -0.002478 -0.004145 -0.004613 ... 1.000000 -0.012057 -0.063454 \n", "f_21 0.001269 0.004364 0.005410 ... -0.012057 1.000000 -0.155678 \n", "f_22 0.000464 -0.001753 -0.001343 ... -0.063454 -0.155678 1.000000 \n", "f_23 0.005402 0.002331 0.008653 ... -0.062111 0.116442 -0.088164 \n", "f_24 0.002533 0.003782 0.009826 ... 0.087198 0.054088 -0.016601 \n", "f_25 -0.001417 -0.002369 -0.005436 ... -0.062476 -0.084967 0.149289 \n", "f_26 -0.003336 -0.004363 -0.008274 ... 0.116582 0.139453 -0.035476 \n", "f_28 0.000336 -0.000379 -0.000692 ... 0.000399 0.000206 -0.000204 \n", "f_29 -0.048202 0.065008 -0.103734 ... -0.003260 0.003343 0.001211 \n", "f_30 0.000243 0.000014 -0.001397 ... 
0.072440 -0.154615 0.315087 \n", "\n", " f_23 f_24 f_25 f_26 f_28 f_29 f_30 \n", "f_00 0.000197 0.001522 0.000285 -0.000172 0.188822 -0.000032 -0.000781 \n", "f_01 -0.000510 -0.000183 0.000097 0.000420 0.194247 -0.000232 0.000817 \n", "f_02 0.000123 -0.000447 -0.000667 0.000305 0.208815 -0.000377 -0.000630 \n", "f_03 0.000361 0.001403 0.001246 -0.000089 0.328754 0.000161 -0.000484 \n", "f_04 -0.001822 0.000876 -0.001457 -0.000656 0.169644 0.000141 -0.000664 \n", "f_05 -0.000784 -0.000349 0.000513 0.000433 0.296369 -0.000608 -0.000723 \n", "f_06 -0.001081 -0.001327 0.001279 0.000436 0.165917 0.000858 0.000495 \n", "f_07 0.005402 0.002533 -0.001417 -0.003336 0.000336 -0.048202 0.000243 \n", "f_08 0.002331 0.003782 -0.002369 -0.004363 -0.000379 0.065008 0.000014 \n", "f_09 0.008653 0.009826 -0.005436 -0.008274 -0.000692 -0.103734 -0.001397 \n", "f_10 -0.003461 -0.003243 0.001780 0.004973 -0.000685 0.119974 -0.001564 \n", "f_11 -0.003118 -0.007147 0.002607 0.006271 0.000484 0.079092 -0.003122 \n", "f_12 0.003382 0.000881 -0.000297 -0.000923 0.000998 -0.111824 -0.001839 \n", "f_13 -0.002761 -0.002755 0.000574 0.003935 -0.000507 0.006873 -0.000301 \n", "f_14 -0.003426 -0.001403 -0.000042 0.003581 0.001091 -0.038155 -0.001668 \n", "f_15 0.005291 0.005004 -0.001737 -0.004642 -0.000461 -0.001241 0.000669 \n", "f_16 -0.002896 -0.003837 0.002054 0.004904 -0.001156 0.090912 0.000364 \n", "f_17 -0.001100 -0.001235 -0.000385 0.002392 0.000479 -0.077153 0.000416 \n", "f_18 0.001505 0.002620 -0.000008 -0.001464 -0.000377 0.158233 0.000114 \n", "f_19 -0.057858 -0.101625 -0.020306 -0.014043 -0.000726 -0.008606 0.096469 \n", "f_20 -0.062111 0.087198 -0.062476 0.116582 0.000399 -0.003260 0.072440 \n", "f_21 0.116442 0.054088 -0.084967 0.139453 0.000206 0.003343 -0.154615 \n", "f_22 -0.088164 -0.016601 0.149289 -0.035476 -0.000204 0.001211 0.315087 \n", "f_23 1.000000 0.011719 -0.231012 0.039106 -0.001049 0.009140 -0.171359 \n", "f_24 0.011719 1.000000 -0.086888 0.013721 0.000375 
0.009089 -0.083660 \n", "f_25 -0.231012 -0.086888 1.000000 0.010003 0.000412 -0.004410 0.178792 \n", "f_26 0.039106 0.013721 0.010003 1.000000 -0.000673 -0.008045 -0.022952 \n", "f_28 -0.001049 0.000375 0.000412 -0.000673 1.000000 -0.000940 -0.000932 \n", "f_29 0.009140 0.009089 -0.004410 -0.008045 -0.000940 1.000000 -0.000315 \n", "f_30 -0.171359 -0.083660 0.178792 -0.022952 -0.000932 -0.000315 1.000000 \n", "\n", "[30 rows x 30 columns]\n" ] } ], "source": [ "print(corr)" ] }, { "cell_type": "markdown", "id": "513955a1", "metadata": { "papermill": { "duration": 0.040925, "end_time": "2022-05-11T11:03:33.801214", "exception": false, "start_time": "2022-05-11T11:03:33.760289", "status": "completed" }, "tags": [] }, "source": [ "Remove columns that have a high correlation" ] }, { "cell_type": "code", "execution_count": 14, "id": "830ae1b5", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:33.885247Z", "iopub.status.busy": "2022-05-11T11:03:33.884695Z", "iopub.status.idle": "2022-05-11T11:03:34.454640Z", "shell.execute_reply": "2022-05-11T11:03:34.453834Z" }, "papermill": { "duration": 0.614369, "end_time": "2022-05-11T11:03:34.456908", "exception": false, "start_time": "2022-05-11T11:03:33.842539", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
f_00f_01f_02f_03f_04f_05f_06f_07f_08f_09...f_20f_21f_22f_23f_24f_25f_26f_28f_29f_30
0-1.3732460.238887-0.2433760.567405-0.6477150.8393260.113133151...-0.9197173.058541-2.5407390.766952-2.730628-0.2081771.36340267.60915300
11.697021-1.710322-2.230332-0.5456611.113173-1.5521750.447825134...-1.0754342.1790502.278315-0.633658-1.217077-3.782194-0.058316377.09641500
21.6817260.616746-1.0276890.810492-0.6090860.113965-0.708660102...-3.485342-0.784235-1.385775-0.520558-0.0091212.788536-3.703488-195.59970202
3-0.118172-0.587835-0.8046382.0868220.371005-0.128831-0.282575321...-2.100177-2.3438190.572594-1.6532131.686035-2.533098-0.608601210.82620500
41.148481-0.176567-0.664871-1.1013430.4678750.5001170.407515330...0.6050331.133665-3.912929-1.4303662.127649-3.3067844.371371-217.21179801
..................................................................
6999950.6401100.897808-0.5239561.563760-0.092281-0.6108670.535426016...-2.1851902.6040481.1228670.5181101.2438370.5751110.076372204.18653900
699996-0.191771-0.035246-0.1185330.5847502.1269770.568659-0.052663434...-0.2395523.0298571.384682-1.1357402.982713-1.5117602.225218-97.69459102
699997-0.331704-0.328845-1.1855031.022128-0.483099-0.107146-0.968281112...-0.9226264.021273-1.8452661.096011-2.734508-4.885955-2.248739130.62274510
699998-2.031073-1.2383980.964699-1.0459500.9060640.634301-0.707474511...-3.0799961.453864-1.6966061.0189951.973697-0.353068-3.333449-364.62514800
699999-0.085906-0.0021242.2273750.2171453.179153-1.6601880.891989034...-2.128546-3.549082-4.325318-5.0172210.251268-3.236026-0.362070-155.41734201
\n", "

1600000 rows × 30 columns

\n", "
" ], "text/plain": [ " f_00 f_01 f_02 f_03 f_04 f_05 f_06 \\\n", "0 -1.373246 0.238887 -0.243376 0.567405 -0.647715 0.839326 0.113133 \n", "1 1.697021 -1.710322 -2.230332 -0.545661 1.113173 -1.552175 0.447825 \n", "2 1.681726 0.616746 -1.027689 0.810492 -0.609086 0.113965 -0.708660 \n", "3 -0.118172 -0.587835 -0.804638 2.086822 0.371005 -0.128831 -0.282575 \n", "4 1.148481 -0.176567 -0.664871 -1.101343 0.467875 0.500117 0.407515 \n", "... ... ... ... ... ... ... ... \n", "699995 0.640110 0.897808 -0.523956 1.563760 -0.092281 -0.610867 0.535426 \n", "699996 -0.191771 -0.035246 -0.118533 0.584750 2.126977 0.568659 -0.052663 \n", "699997 -0.331704 -0.328845 -1.185503 1.022128 -0.483099 -0.107146 -0.968281 \n", "699998 -2.031073 -1.238398 0.964699 -1.045950 0.906064 0.634301 -0.707474 \n", "699999 -0.085906 -0.002124 2.227375 0.217145 3.179153 -1.660188 0.891989 \n", "\n", " f_07 f_08 f_09 ... f_20 f_21 f_22 f_23 \\\n", "0 1 5 1 ... -0.919717 3.058541 -2.540739 0.766952 \n", "1 1 3 4 ... -1.075434 2.179050 2.278315 -0.633658 \n", "2 1 0 2 ... -3.485342 -0.784235 -1.385775 -0.520558 \n", "3 3 2 1 ... -2.100177 -2.343819 0.572594 -1.653213 \n", "4 3 3 0 ... 0.605033 1.133665 -3.912929 -1.430366 \n", "... ... ... ... ... ... ... ... ... \n", "699995 0 1 6 ... -2.185190 2.604048 1.122867 0.518110 \n", "699996 4 3 4 ... -0.239552 3.029857 1.384682 -1.135740 \n", "699997 1 1 2 ... -0.922626 4.021273 -1.845266 1.096011 \n", "699998 5 1 1 ... -3.079996 1.453864 -1.696606 1.018995 \n", "699999 0 3 4 ... -2.128546 -3.549082 -4.325318 -5.017221 \n", "\n", " f_24 f_25 f_26 f_28 f_29 f_30 \n", "0 -2.730628 -0.208177 1.363402 67.609153 0 0 \n", "1 -1.217077 -3.782194 -0.058316 377.096415 0 0 \n", "2 -0.009121 2.788536 -3.703488 -195.599702 0 2 \n", "3 1.686035 -2.533098 -0.608601 210.826205 0 0 \n", "4 2.127649 -3.306784 4.371371 -217.211798 0 1 \n", "... ... ... ... ... ... ... 
\n", "699995 1.243837 0.575111 0.076372 204.186539 0 0 \n", "699996 2.982713 -1.511760 2.225218 -97.694591 0 2 \n", "699997 -2.734508 -4.885955 -2.248739 130.622745 1 0 \n", "699998 1.973697 -0.353068 -3.333449 -364.625148 0 0 \n", "699999 0.251268 -3.236026 -0.362070 -155.417342 0 1 \n", "\n", "[1600000 rows x 30 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = np.full((corr.shape[0],), True, dtype=bool)\n", "for i in range(corr.shape[0]):\n", " for j in range(i+1, corr.shape[0]):\n", " if corr.iloc[i,j] >= 0.80:\n", " if columns[j]:\n", " columns[j] = False\n", "selected_columns = combi.columns[columns]\n", "combi = combi[selected_columns]\n", "combi" ] }, { "cell_type": "markdown", "id": "8a8e289f", "metadata": { "papermill": { "duration": 0.041725, "end_time": "2022-05-11T11:03:34.540522", "exception": false, "start_time": "2022-05-11T11:03:34.498797", "status": "completed" }, "tags": [] }, "source": [ "Scale data" ] }, { "cell_type": "code", "execution_count": 15, "id": "d42bf25d", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:34.627208Z", "iopub.status.busy": "2022-05-11T11:03:34.626677Z", "iopub.status.idle": "2022-05-11T11:03:35.523105Z", "shell.execute_reply": "2022-05-11T11:03:35.522255Z" }, "papermill": { "duration": 0.942607, "end_time": "2022-05-11T11:03:35.525139", "exception": false, "start_time": "2022-05-11T11:03:34.582532", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
f_00f_01f_02f_03f_04f_05f_06f_07f_08f_09...f_20f_21f_22f_23f_24f_25f_26f_28f_29f_30
00.3152380.5300250.4580380.5467490.4228710.5913720.5127520.06250.31250.0625...0.4547610.5895320.4014550.5323430.3759160.5071060.5755930.5275300.00.0
10.6098900.3298680.2511640.4303040.6044530.3548990.5473800.06250.18750.2500...0.4479120.5578560.6091940.4752870.4383860.3688030.5233510.6533730.00.0
20.6084230.5688260.3763790.5721800.4268540.5196480.4277300.06250.00000.1250...0.3419030.4511310.4512430.4798940.4882440.6230690.3894040.4205050.01.0
30.4356870.4451320.3996020.7057050.5279210.4956400.4718120.18750.12500.0625...0.4028350.3949610.5356640.4337540.5582100.4171390.5031300.5857640.00.0
40.5572470.4873640.4141540.3721700.5379100.5578310.5432090.18750.18750.0000...0.5218330.5202060.3423030.4428320.5764370.3872000.6861250.4117170.00.5
..................................................................
6999950.5084590.5976870.4288250.6509840.4801470.4479760.5564430.00000.06250.3750...0.3990950.5731630.5593850.5222060.5399580.5374160.5283000.5830640.00.0
6999960.4286240.5018750.4710360.5485640.7089960.5646080.4955990.25000.18750.2500...0.4846810.5884990.5706710.4548340.6117290.4566610.6072620.4603140.01.0
6999970.4151950.4717270.3599480.5943210.4398460.4977840.4008690.06250.06250.1250...0.4546330.6242050.4314350.5457470.3757560.3260910.4428610.5531521.00.0
6999980.2521070.3783280.5838180.3779650.5830960.5710990.4278520.31250.06250.0625...0.3597340.5317380.4378440.5426100.5700830.5014990.4030010.3517760.00.0
6999990.4387840.5052770.7152830.5101060.8174960.3442190.5933330.00000.18750.2500...0.4015870.3515530.3245250.2967170.4989910.3899380.5121890.4368430.00.5
\n", "

1600000 rows × 30 columns

\n", "
" ], "text/plain": [ " f_00 f_01 f_02 f_03 f_04 f_05 f_06 \\\n", "0 0.315238 0.530025 0.458038 0.546749 0.422871 0.591372 0.512752 \n", "1 0.609890 0.329868 0.251164 0.430304 0.604453 0.354899 0.547380 \n", "2 0.608423 0.568826 0.376379 0.572180 0.426854 0.519648 0.427730 \n", "3 0.435687 0.445132 0.399602 0.705705 0.527921 0.495640 0.471812 \n", "4 0.557247 0.487364 0.414154 0.372170 0.537910 0.557831 0.543209 \n", "... ... ... ... ... ... ... ... \n", "699995 0.508459 0.597687 0.428825 0.650984 0.480147 0.447976 0.556443 \n", "699996 0.428624 0.501875 0.471036 0.548564 0.708996 0.564608 0.495599 \n", "699997 0.415195 0.471727 0.359948 0.594321 0.439846 0.497784 0.400869 \n", "699998 0.252107 0.378328 0.583818 0.377965 0.583096 0.571099 0.427852 \n", "699999 0.438784 0.505277 0.715283 0.510106 0.817496 0.344219 0.593333 \n", "\n", " f_07 f_08 f_09 ... f_20 f_21 f_22 f_23 \\\n", "0 0.0625 0.3125 0.0625 ... 0.454761 0.589532 0.401455 0.532343 \n", "1 0.0625 0.1875 0.2500 ... 0.447912 0.557856 0.609194 0.475287 \n", "2 0.0625 0.0000 0.1250 ... 0.341903 0.451131 0.451243 0.479894 \n", "3 0.1875 0.1250 0.0625 ... 0.402835 0.394961 0.535664 0.433754 \n", "4 0.1875 0.1875 0.0000 ... 0.521833 0.520206 0.342303 0.442832 \n", "... ... ... ... ... ... ... ... ... \n", "699995 0.0000 0.0625 0.3750 ... 0.399095 0.573163 0.559385 0.522206 \n", "699996 0.2500 0.1875 0.2500 ... 0.484681 0.588499 0.570671 0.454834 \n", "699997 0.0625 0.0625 0.1250 ... 0.454633 0.624205 0.431435 0.545747 \n", "699998 0.3125 0.0625 0.0625 ... 0.359734 0.531738 0.437844 0.542610 \n", "699999 0.0000 0.1875 0.2500 ... 0.401587 0.351553 0.324525 0.296717 \n", "\n", " f_24 f_25 f_26 f_28 f_29 f_30 \n", "0 0.375916 0.507106 0.575593 0.527530 0.0 0.0 \n", "1 0.438386 0.368803 0.523351 0.653373 0.0 0.0 \n", "2 0.488244 0.623069 0.389404 0.420505 0.0 1.0 \n", "3 0.558210 0.417139 0.503130 0.585764 0.0 0.0 \n", "4 0.576437 0.387200 0.686125 0.411717 0.0 0.5 \n", "... ... ... ... ... ... ... 
\n", "699995 0.539958 0.537416 0.528300 0.583064 0.0 0.0 \n", "699996 0.611729 0.456661 0.607262 0.460314 0.0 1.0 \n", "699997 0.375756 0.326091 0.442861 0.553152 1.0 0.0 \n", "699998 0.570083 0.501499 0.403001 0.351776 0.0 0.0 \n", "699999 0.498991 0.389938 0.512189 0.436843 0.0 0.5 \n", "\n", "[1600000 rows x 30 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combi = (combi - combi.min()) / (combi.max() - combi.min())\n", "combi" ] }, { "cell_type": "markdown", "id": "3d80cfe3", "metadata": { "papermill": { "duration": 0.042442, "end_time": "2022-05-11T11:03:35.610293", "exception": false, "start_time": "2022-05-11T11:03:35.567851", "status": "completed" }, "tags": [] }, "source": [ "Define X and y" ] }, { "cell_type": "code", "execution_count": 16, "id": "4d4770db", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:35.697945Z", "iopub.status.busy": "2022-05-11T11:03:35.697596Z", "iopub.status.idle": "2022-05-11T11:03:35.702099Z", "shell.execute_reply": "2022-05-11T11:03:35.701135Z" }, "papermill": { "duration": 0.051241, "end_time": "2022-05-11T11:03:35.704229", "exception": false, "start_time": "2022-05-11T11:03:35.652988", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "y = target\n", "X = combi[: len(train)]\n", "X_test = combi[len(train) :]" ] }, { "cell_type": "markdown", "id": "5d8bb6c5", "metadata": { "papermill": { "duration": 0.043146, "end_time": "2022-05-11T11:03:35.790146", "exception": false, "start_time": "2022-05-11T11:03:35.747000", "status": "completed" }, "tags": [] }, "source": [ "Split dataset for training and validation" ] }, { "cell_type": "code", "execution_count": 17, "id": "3bb874fb", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:35.878461Z", "iopub.status.busy": "2022-05-11T11:03:35.878177Z", "iopub.status.idle": "2022-05-11T11:03:36.552140Z", "shell.execute_reply": "2022-05-11T11:03:36.551242Z" }, "papermill": { 
"duration": 0.721353, "end_time": "2022-05-11T11:03:36.554347", "exception": false, "start_time": "2022-05-11T11:03:35.832994", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "((810000, 30), (90000, 30), (810000,), (90000,), (700000, 30))" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)\n", "X_train.shape, X_val.shape, y_train.shape,y_val.shape, X_test.shape" ] }, { "cell_type": "markdown", "id": "383f9ee5", "metadata": { "papermill": { "duration": 0.04255, "end_time": "2022-05-11T11:03:36.640114", "exception": false, "start_time": "2022-05-11T11:03:36.597564", "status": "completed" }, "tags": [] }, "source": [ "Select model - Logistic Regression" ] }, { "cell_type": "code", "execution_count": 18, "id": "b1d9af4e", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:36.727674Z", "iopub.status.busy": "2022-05-11T11:03:36.727395Z", "iopub.status.idle": "2022-05-11T11:03:45.348105Z", "shell.execute_reply": "2022-05-11T11:03:45.346914Z" }, "papermill": { "duration": 8.667486, "end_time": "2022-05-11T11:03:45.350488", "exception": false, "start_time": "2022-05-11T11:03:36.683002", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.614883950617284\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "\n", "model = LogisticRegression(random_state=42).fit(X_train, y_train)\n", "print(model.score(X_train, y_train))" ] }, { "cell_type": "markdown", "id": "ea710141", "metadata": { "papermill": { "duration": 0.043577, "end_time": "2022-05-11T11:03:45.437486", "exception": false, "start_time": "2022-05-11T11:03:45.393909", "status": "completed" }, "tags": [] }, "source": [ "Predict on validation set" ] }, { "cell_type": "code", "execution_count": 19, 
"id": "110776e9", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:45.528247Z", "iopub.status.busy": "2022-05-11T11:03:45.527682Z", "iopub.status.idle": "2022-05-11T11:03:45.563979Z", "shell.execute_reply": "2022-05-11T11:03:45.563197Z" }, "papermill": { "duration": 0.084883, "end_time": "2022-05-11T11:03:45.567687", "exception": false, "start_time": "2022-05-11T11:03:45.482804", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6136444444444444\n" ] } ], "source": [ "y_pred = model.predict(X_val)\n", "print(model.score(X_val, y_val))" ] }, { "cell_type": "markdown", "id": "4d6493b1", "metadata": { "papermill": { "duration": 0.050645, "end_time": "2022-05-11T11:03:45.700862", "exception": false, "start_time": "2022-05-11T11:03:45.650217", "status": "completed" }, "tags": [] }, "source": [ "Confusion matrix" ] }, { "cell_type": "code", "execution_count": 20, "id": "3e1a7d9c", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:45.790395Z", "iopub.status.busy": "2022-05-11T11:03:45.790092Z", "iopub.status.idle": "2022-05-11T11:03:45.815750Z", "shell.execute_reply": "2022-05-11T11:03:45.815037Z" }, "papermill": { "duration": 0.073088, "end_time": "2022-05-11T11:03:45.817848", "exception": false, "start_time": "2022-05-11T11:03:45.744760", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[30030 16191]\n", " [18581 25198]]\n" ] } ], "source": [ "from sklearn.metrics import confusion_matrix\n", "\n", "print(confusion_matrix(y_val, y_pred))" ] }, { "cell_type": "markdown", "id": "98219d25", "metadata": { "papermill": { "duration": 0.043722, "end_time": "2022-05-11T11:03:45.906025", "exception": false, "start_time": "2022-05-11T11:03:45.862303", "status": "completed" }, "tags": [] }, "source": [ "Predict on test set" ] }, { "cell_type": "code", "execution_count": 21, "id": "e3af5809", "metadata": { "execution": { 
"iopub.execute_input": "2022-05-11T11:03:45.998329Z", "iopub.status.busy": "2022-05-11T11:03:45.997865Z", "iopub.status.idle": "2022-05-11T11:03:46.178067Z", "shell.execute_reply": "2022-05-11T11:03:46.176967Z" }, "papermill": { "duration": 0.230467, "end_time": "2022-05-11T11:03:46.181754", "exception": false, "start_time": "2022-05-11T11:03:45.951287", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 0, ..., 1, 0, 0])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Hard class labels for the test features, cast to plain integers for the submission file.\n", "# The classifier only emits labels it saw during fitting, so the old clamp of\n", "# negative values to 0 could never trigger and has been dropped.\n", "preds = model.predict(X_test).astype(int)\n", "# NOTE(review): if the competition is scored with a rank-based metric such as ROC AUC,\n", "# submit model.predict_proba(X_test)[:, 1] instead of hard labels - confirm the metric.\n", "preds" ] }, { "cell_type": "markdown", "id": "542f7a58", "metadata": { "papermill": { "duration": 0.051137, "end_time": "2022-05-11T11:03:46.316333", "exception": false, "start_time": "2022-05-11T11:03:46.265196", "status": "completed" }, "tags": [] }, "source": [ "Submit" ] }, { "cell_type": "code", "execution_count": 22, "id": "d9c151cc", "metadata": { "execution": { "iopub.execute_input": "2022-05-11T11:03:46.407139Z", "iopub.status.busy": "2022-05-11T11:03:46.406885Z", "iopub.status.idle": "2022-05-11T11:03:47.731695Z", "shell.execute_reply": "2022-05-11T11:03:47.730883Z" }, "papermill": { "duration": 1.372749, "end_time": "2022-05-11T11:03:47.733818", "exception": false, "start_time": "2022-05-11T11:03:46.361069", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtarget
09000000
19000011
29000020
39000030
49000041
.........
69999515999951
69999615999961
69999715999971
69999815999980
69999915999990
\n", "

700000 rows × 2 columns

\n", "
" ], "text/plain": [ " id target\n", "0 900000 0\n", "1 900001 1\n", "2 900002 0\n", "3 900003 0\n", "4 900004 1\n", "... ... ...\n", "699995 1599995 1\n", "699996 1599996 1\n", "699997 1599997 1\n", "699998 1599998 0\n", "699999 1599999 0\n", "\n", "[700000 rows x 2 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bracket assignment always writes the column; attribute assignment only works\n", "# when the column already exists and would not create one otherwise.\n", "submission[\"target\"] = preds\n", "submission.to_csv('submission.csv', index=False)\n", "\n", "# Read the file back so the displayed frame is exactly what was written to disk.\n", "submission = pd.read_csv(\"submission.csv\")\n", "submission" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.1" }, "papermill": { "default_parameters": {}, "duration": 54.233478, "end_time": "2022-05-11T11:03:48.600102", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2022-05-11T11:02:54.366624", "version": "2.3.4" } }, "nbformat": 4, "nbformat_minor": 5 }