{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "QQHZHevuXdEy" }, "source": [ "# **QSAR Model Building of Acetylcholinesterase Inhibitors**\n", "\n", "Chanin Nantasenamat\n", "\n", "*Data Professor YouTube channel, http://youtube.com/dataprofessor*" ] }, { "cell_type": "markdown", "metadata": { "id": "g1qtHa0zXfWM" }, "source": [ "# Read in data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "9MdfbvFKXtXq" }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 439 }, "id": "nerGP0fCXfgP", "outputId": "b639892c-8f7b-4b24-d02c-628c9f1a1460" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PubchemFP0PubchemFP1PubchemFP2PubchemFP3PubchemFP4PubchemFP5PubchemFP6PubchemFP7PubchemFP8PubchemFP9...PubchemFP872PubchemFP873PubchemFP874PubchemFP875PubchemFP876PubchemFP877PubchemFP878PubchemFP879PubchemFP880pIC50
01110000001...0000000006.124939
11110000001...0000000007.000000
21110000001...0000000004.301030
31100000001...0000000006.522879
41100000001...0000000006.096910
..................................................................
46901111000001...0000000005.612610
46911111000001...0000000005.595166
46921111000001...0000000005.419075
46931111000001...0000000005.460924
46941111000001...0000000005.555955
\n", "

4695 rows × 882 columns

\n", "
" ], "text/plain": [ " PubchemFP0 PubchemFP1 PubchemFP2 PubchemFP3 PubchemFP4 PubchemFP5 \\\n", "0 1 1 1 0 0 0 \n", "1 1 1 1 0 0 0 \n", "2 1 1 1 0 0 0 \n", "3 1 1 0 0 0 0 \n", "4 1 1 0 0 0 0 \n", "... ... ... ... ... ... ... \n", "4690 1 1 1 1 0 0 \n", "4691 1 1 1 1 0 0 \n", "4692 1 1 1 1 0 0 \n", "4693 1 1 1 1 0 0 \n", "4694 1 1 1 1 0 0 \n", "\n", " PubchemFP6 PubchemFP7 PubchemFP8 PubchemFP9 ... PubchemFP872 \\\n", "0 0 0 0 1 ... 0 \n", "1 0 0 0 1 ... 0 \n", "2 0 0 0 1 ... 0 \n", "3 0 0 0 1 ... 0 \n", "4 0 0 0 1 ... 0 \n", "... ... ... ... ... ... ... \n", "4690 0 0 0 1 ... 0 \n", "4691 0 0 0 1 ... 0 \n", "4692 0 0 0 1 ... 0 \n", "4693 0 0 0 1 ... 0 \n", "4694 0 0 0 1 ... 0 \n", "\n", " PubchemFP873 PubchemFP874 PubchemFP875 PubchemFP876 PubchemFP877 \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 \n", "... ... ... ... ... ... \n", "4690 0 0 0 0 0 \n", "4691 0 0 0 0 0 \n", "4692 0 0 0 0 0 \n", "4693 0 0 0 0 0 \n", "4694 0 0 0 0 0 \n", "\n", " PubchemFP878 PubchemFP879 PubchemFP880 pIC50 \n", "0 0 0 0 6.124939 \n", "1 0 0 0 7.000000 \n", "2 0 0 0 4.301030 \n", "3 0 0 0 6.522879 \n", "4 0 0 0 6.096910 \n", "... ... ... ... ... \n", "4690 0 0 0 5.612610 \n", "4691 0 0 0 5.595166 \n", "4692 0 0 0 5.419075 \n", "4693 0 0 0 5.460924 \n", "4694 0 0 0 5.555955 \n", "\n", "[4695 rows x 882 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the fingerprint/pIC50 table straight from the GitHub-hosted CSV.\n", "dataset_url = 'https://github.com/dataprofessor/data/raw/master/acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv'\n", "dataset = pd.read_csv(filepath_or_buffer=dataset_url)\n", "dataset" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 439 }, "id": "tgFxx8m_YEUy", "outputId": "47903560-3aa4-497f-85b4-27b1361b200a" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PubchemFP0PubchemFP1PubchemFP2PubchemFP3PubchemFP4PubchemFP5PubchemFP6PubchemFP7PubchemFP8PubchemFP9...PubchemFP871PubchemFP872PubchemFP873PubchemFP874PubchemFP875PubchemFP876PubchemFP877PubchemFP878PubchemFP879PubchemFP880
01110000001...0000000000
11110000001...0000000000
21110000001...0000000000
31100000001...0000000000
41100000001...0000000000
..................................................................
46901111000001...0000000000
46911111000001...0000000000
46921111000001...0000000000
46931111000001...0000000000
46941111000001...0000000000
\n", "

4695 rows × 881 columns

\n", "
" ], "text/plain": [ " PubchemFP0 PubchemFP1 PubchemFP2 PubchemFP3 PubchemFP4 PubchemFP5 \\\n", "0 1 1 1 0 0 0 \n", "1 1 1 1 0 0 0 \n", "2 1 1 1 0 0 0 \n", "3 1 1 0 0 0 0 \n", "4 1 1 0 0 0 0 \n", "... ... ... ... ... ... ... \n", "4690 1 1 1 1 0 0 \n", "4691 1 1 1 1 0 0 \n", "4692 1 1 1 1 0 0 \n", "4693 1 1 1 1 0 0 \n", "4694 1 1 1 1 0 0 \n", "\n", " PubchemFP6 PubchemFP7 PubchemFP8 PubchemFP9 ... PubchemFP871 \\\n", "0 0 0 0 1 ... 0 \n", "1 0 0 0 1 ... 0 \n", "2 0 0 0 1 ... 0 \n", "3 0 0 0 1 ... 0 \n", "4 0 0 0 1 ... 0 \n", "... ... ... ... ... ... ... \n", "4690 0 0 0 1 ... 0 \n", "4691 0 0 0 1 ... 0 \n", "4692 0 0 0 1 ... 0 \n", "4693 0 0 0 1 ... 0 \n", "4694 0 0 0 1 ... 0 \n", "\n", " PubchemFP872 PubchemFP873 PubchemFP874 PubchemFP875 PubchemFP876 \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 \n", "... ... ... ... ... ... \n", "4690 0 0 0 0 0 \n", "4691 0 0 0 0 0 \n", "4692 0 0 0 0 0 \n", "4693 0 0 0 0 0 \n", "4694 0 0 0 0 0 \n", "\n", " PubchemFP877 PubchemFP878 PubchemFP879 PubchemFP880 \n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "... ... ... ... ... \n", "4690 0 0 0 0 \n", "4691 0 0 0 0 \n", "4692 0 0 0 0 \n", "4693 0 0 0 0 \n", "4694 0 0 0 0 \n", "\n", "[4695 rows x 881 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Feature matrix: every column except the pIC50 target.\n", "X = dataset.drop(columns=['pIC50'])\n", "X" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JDwxgKHqYmD4", "outputId": "472cd19d-7dab-4f16-b03d-fab52d3fc782" }, "outputs": [ { "data": { "text/plain": [ "0 6.124939\n", "1 7.000000\n", "2 4.301030\n", "3 6.522879\n", "4 6.096910\n", " ... 
\n", "4690 5.612610\n", "4691 5.595166\n", "4692 5.419075\n", "4693 5.460924\n", "4694 5.555955\n", "Name: pIC50, Length: 4695, dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select the target by name rather than by position so it always matches the\n", "# column excluded from X, even if the CSV column order changes.\n", "Y = dataset['pIC50']\n", "Y" ] }, { "cell_type": "markdown", "metadata": { "id": "AQ9E0xUY_o_M" }, "source": [ "# Remove low variance features" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Qkgj-lsG_wOJ", "outputId": "d3d11b21-fdf3-4cfb-e0c8-0f2b5a1d779b" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PubchemFP3PubchemFP12PubchemFP13PubchemFP15PubchemFP16PubchemFP18PubchemFP19PubchemFP20PubchemFP37PubchemFP143...PubchemFP758PubchemFP761PubchemFP776PubchemFP777PubchemFP797PubchemFP798PubchemFP818PubchemFP819PubchemFP821PubchemFP824
00101110011...0000000000
10101011101...0000000000
20101110011...0000000000
30101110011...0000000000
40001111001...0001000000
..................................................................
46901101011010...0001100100
46911101011010...0001100100
46921101011000...0001100100
46931101011100...0001100100
46941111011000...0011100100
\n", "

4695 rows × 218 columns

\n", "
" ], "text/plain": [ " PubchemFP3 PubchemFP12 PubchemFP13 PubchemFP15 PubchemFP16 \\\n", "0 0 1 0 1 1 \n", "1 0 1 0 1 0 \n", "2 0 1 0 1 1 \n", "3 0 1 0 1 1 \n", "4 0 0 0 1 1 \n", "... ... ... ... ... ... \n", "4690 1 1 0 1 0 \n", "4691 1 1 0 1 0 \n", "4692 1 1 0 1 0 \n", "4693 1 1 0 1 0 \n", "4694 1 1 1 1 0 \n", "\n", " PubchemFP18 PubchemFP19 PubchemFP20 PubchemFP37 PubchemFP143 ... \\\n", "0 1 0 0 1 1 ... \n", "1 1 1 1 0 1 ... \n", "2 1 0 0 1 1 ... \n", "3 1 0 0 1 1 ... \n", "4 1 1 0 0 1 ... \n", "... ... ... ... ... ... ... \n", "4690 1 1 0 1 0 ... \n", "4691 1 1 0 1 0 ... \n", "4692 1 1 0 0 0 ... \n", "4693 1 1 1 0 0 ... \n", "4694 1 1 0 0 0 ... \n", "\n", " PubchemFP758 PubchemFP761 PubchemFP776 PubchemFP777 PubchemFP797 \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 1 0 \n", "... ... ... ... ... ... \n", "4690 0 0 0 1 1 \n", "4691 0 0 0 1 1 \n", "4692 0 0 0 1 1 \n", "4693 0 0 0 1 1 \n", "4694 0 0 1 1 1 \n", "\n", " PubchemFP798 PubchemFP818 PubchemFP819 PubchemFP821 PubchemFP824 \n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 \n", "... ... ... ... ... ... 
\n", "4690 0 0 1 0 0 \n", "4691 0 0 1 0 0 \n", "4692 0 0 1 0 0 \n", "4693 0 0 1 0 0 \n", "4694 0 0 1 0 0 \n", "\n", "[4695 rows x 218 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_selection import VarianceThreshold\n", "\n", "def remove_low_variance(input_data, threshold=0.1):\n", "    \"\"\"Drop low-variance columns from a DataFrame.\n", "\n", "    Fits scikit-learn's ``VarianceThreshold`` on ``input_data`` and returns a\n", "    new DataFrame holding only the columns the selector keeps, in their\n", "    original order.\n", "\n", "    Parameters\n", "    ----------\n", "    input_data : pandas.DataFrame\n", "        Feature table to filter; it is not modified.\n", "    threshold : float, default 0.1\n", "        Variance cut-off passed to ``VarianceThreshold``.\n", "\n", "    Returns\n", "    -------\n", "    pandas.DataFrame\n", "        The retained columns of ``input_data``.\n", "    \"\"\"\n", "    selector = VarianceThreshold(threshold=threshold)\n", "    selector.fit(input_data)\n", "    # Select by the boolean support mask (i.e. by position). Re-selecting by\n", "    # label would also pull in low-variance columns that share a name with a\n", "    # kept column whenever the column labels are not unique.\n", "    return input_data.loc[:, selector.get_support()]\n", "\n", "X = remove_low_variance(X, threshold=0.1)\n", "X" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "X.to_csv('descriptor_list.csv', index = False)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# In the app, use the following to get this same descriptor list\n", "# of 218 variables from the initial set of 881 variables\n", "# Xlist = list(pd.read_csv('descriptor_list.csv').columns)\n", "# X[Xlist]" ] }, { "cell_type": "markdown", "metadata": { "id": "LNohCdqQY5VZ" }, "source": [ "# Random Forest Regression Model" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "EanoyG2eX9cV" }, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.metrics import mean_squared_error, r2_score" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mLQJ2KLLY_9a", "outputId": "e0c22032-02af-40ca-c5c9-536982ec8627" }, "outputs": [ { "data": { "text/plain": [ "0.8606007951843838" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = RandomForestRegressor(n_estimators=500, random_state=42)\n", "model.fit(X, Y)\n", "# R^2 is scored on the same X, Y the model was fitted on (no hold-out set),\n", "# so it describes the training fit only.\n", "r2 = model.score(X, Y)\n", "r2" ] }, { "cell_type": "markdown", "metadata": { "id": "F5f8KGWjZRSc" }, "source": [ "## Model Prediction" ] }, { "cell_type": "code", "execution_count": 17, 
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MI3c8LB2ZCYW", "outputId": "1bc71664-c9f8-434f-a666-7f78a2d34da6" }, "outputs": [ { "data": { "text/plain": [ "array([5.97555142, 6.38304794, 4.94339087, ..., 5.802151 , 5.70372719,\n", " 5.62942524])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Y_pred = model.predict(X)\n", "Y_pred" ] }, { "cell_type": "markdown", "metadata": { "id": "fXv7bcolZqa-" }, "source": [ "## Model Performance" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6f13gYleZVKy", "outputId": "fd565d7f-26e4-45d1-89f7-55b37687746c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean squared error (MSE): 0.34\n", "Coefficient of determination (R^2): 0.86\n" ] } ], "source": [ "# Y_pred comes from model.predict(X), so both metrics are training-set values.\n", "print('Mean squared error (MSE): %.2f'\n", "      % mean_squared_error(Y, Y_pred))\n", "print('Coefficient of determination (R^2): %.2f'\n", "      % r2_score(Y, Y_pred))" ] }, { "cell_type": "markdown", "metadata": { "id": "uWvxj1iSaL3n" }, "source": [ "# Data Visualization (Experimental vs Predicted pIC50 for Training Data)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "iPcFF0MjZlh8" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Matplotlib is building the font cache; this may take a moment.\n" ] } ], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 351 }, "id": "QRNyIlGAaQQI", "outputId": "1cf12d14-4ba5-49d4-e627-902a049fba2e" }, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 0, 'Experimental pIC50')" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(5,5))\n", "plt.scatter(x=Y, y=Y_pred, c=\"#7CAE00\", alpha=0.3)\n", "\n", "# Add trendline\n", "# https://stackoverflow.com/questions/26447191/how-to-add-trendline-in-python-matplotlib-dot-scatter-graphs\n", "z = np.polyfit(Y, Y_pred, 1)\n", "p = np.poly1d(z)\n", "\n", "plt.plot(Y,p(Y),\"#F8766D\")\n", "plt.ylabel('Predicted pIC50')\n", "plt.xlabel('Experimental pIC50')" ] }, { "cell_type": "markdown", "metadata": { "id": "YzKTmvZrbFVI" }, "source": [ "# Save Model as Pickle Object" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "DzjpPyVyb8XO" }, "outputs": [], "source": [ "import pickle" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "id": "b2K9ajBaaYUk" }, "outputs": [], "source": [ "# Write through a context manager so the file handle is flushed and closed\n", "# even if serialization raises part-way through.\n", "with open('acetylcholinesterase_model.pkl', 'wb') as model_file:\n", "    pickle.dump(model, model_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ef4fyvrEb-NC" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "QSAR-web-app.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 1 }