{ "cells": [ { "cell_type": "markdown", "id": "e01e2899-35c2-4707-b271-433599ded8f6", "metadata": {}, "source": [ "# Read data" ] }, { "cell_type": "code", "execution_count": 1, "id": "1ecabbad-ed2b-48dc-ac3f-1b04e5cd9014", "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd'); }\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LabelMessage
01.0go jurong point crazy available bugis n great ...
11.0ok lar joking wif u oni
20.0free entry wkly comp win fa cup final tkts st ...
31.0u dun say early hor u c already say
41.0nah think go usf life around though
.........
85660.0abc good morning america rank number christmas...
85670.0hyperlink hyperlink hyperlink let mortgage len...
85680.0thank shopping u gift occasion free gift numbe...
85690.0famous ebay marketing e course learn sell comp...
85700.0hello chinese traditional number number f r v ...
\n", "

8571 rows × 2 columns

\n", "
" ], "text/plain": [ " Label Message\n", "0 1.0 go jurong point crazy available bugis n great ...\n", "1 1.0 ok lar joking wif u oni\n", "2 0.0 free entry wkly comp win fa cup final tkts st ...\n", "3 1.0 u dun say early hor u c already say\n", "4 1.0 nah think go usf life around though\n", "... ... ...\n", "8566 0.0 abc good morning america rank number christmas...\n", "8567 0.0 hyperlink hyperlink hyperlink let mortgage len...\n", "8568 0.0 thank shopping u gift occasion free gift numbe...\n", "8569 0.0 famous ebay marketing e course learn sell comp...\n", "8570 0.0 hello chinese traditional number number f r v ...\n", "\n", "[8571 rows x 2 columns]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Explicit import: without it this cell only runs when pyforest's lazy imports are active.\n", "import pandas as pd\n", "\n", "full_data = pd.read_csv('spam_data/full_data.csv')\n", "full_data" ] }, { "cell_type": "code", "execution_count": 2, "id": "3c2c7ed2-6d48-4bac-a9f9-a2220e67dbc2", "metadata": {}, "outputs": [], "source": [ "# Drop rows with a missing label or message before vectorizing.\n", "full_data = full_data.dropna()" ] }, { "cell_type": "code", "execution_count": 3, "id": "c537fe92-ce4c-4aac-9da9-12259d5039f7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Label 0\n", "Message 0\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "full_data.isnull().sum()" ] }, { "cell_type": "markdown", "id": "483a4715-8949-42e7-8099-4bb970289271", "metadata": {}, "source": [ "# Vectorizer" ] }, { "cell_type": "code", "execution_count": 4, "id": "4a1067a4-7dba-43e7-ac34-a503f87c29ce", "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "# Bag-of-words counts, with the vocabulary capped at 5000 terms.\n", "# NOTE(review): the vocabulary is fit on every row before the train/test split,\n", "# so the hold-out accuracy reported later can be optimistic; fit on the training\n", "# rows only if an unbiased estimate is needed.\n", "cv = CountVectorizer(max_features=5000)\n", "X = cv.fit_transform(full_data['Message']).toarray()" ] }, { "cell_type": "code", "execution_count": 5, "id": "a3941df7-ff2c-4ee5-b806-e7bcb0633274", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8561, 5000)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"X.shape" ] }, { "cell_type": "code", "execution_count": 6, "id": "119f45c9-f411-4c93-9447-a02b38724d62", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " ...,\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 7, "id": "8e97ecfd-472f-42ac-9ae5-61308d6df041", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 1.0\n", "2 0.0\n", "3 1.0\n", "4 1.0\n", " ... \n", "8566 0.0\n", "8567 0.0\n", "8568 0.0\n", "8569 0.0\n", "8570 0.0\n", "Name: Label, Length: 8561, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Target vector: the 'Label' column of the cleaned frame.\n", "y = full_data.loc[:, 'Label']\n", "y" ] }, { "cell_type": "markdown", "id": "9ef6a206-3304-413f-9d5b-ef46d8c87206", "metadata": {}, "source": [ "# Model" ] }, { "cell_type": "code", "execution_count": 8, "id": "66969958-54c1-404d-88c1-b0c694b39527", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X,\n", "    y,\n", "    test_size=0.2,\n", "    random_state=101,\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "id": "82d7b37f-ae5e-41f1-acd2-24b361dd41c4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "MultinomialNB()" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.ensemble import RandomForestClassifier\n", "\n", "# Baseline classifier trained on the training split only.\n", "spam_model = MultinomialNB()\n", "spam_model.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 10, "id": "b122e57a-a425-4b08-9bcd-b5dcedcdbc69", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "96.55575014594278" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import accuracy_score\n", "\n", "y_pred = spam_model.predict(X_test)\n", "\n", "# Hold-out accuracy as a percentage. Arguments follow the documented\n", "# (y_true, y_pred) order, matching the confusion_matrix call in the next cell.\n", "accuracy_score(y_test, y_pred) * 100" ] }, { "cell_type": "code", "execution_count": 11, "id": "19950e9e-0b0c-4956-8351-96ec8cc2ac8c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 236, 18],\n", " [ 41, 1418]], dtype=int64)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import confusion_matrix\n", "\n", "# Rows are true labels, columns are predicted labels.\n", "confusion_m = confusion_matrix(y_test, y_pred)\n", "confusion_m" ] }, { "cell_type": "markdown", "id": "72f29abb-dc37-4921-ac5c-4a40c51ac51e", "metadata": {}, "source": [ "# Final Model" ] }, { "cell_type": "code", "execution_count": 12, "id": "586531ca-0473-4a52-9037-5d386ab1eda4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "MultinomialNB()" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Refit on every row; this is the model that gets pickled below.\n", "final_model = MultinomialNB()\n", "final_model.fit(X, y)" ] }, { "cell_type": "code", "execution_count": 13, "id": "da22026b-3303-40c2-bebf-afb47bbb3274", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9684616283144493" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# In-sample accuracy: the model is scored on the same rows it was fit on,\n", "# so this is a sanity check, not an estimate of generalization.\n", "pred = final_model.predict(X)\n", "accuracy_score(y, pred)" ] }, { "cell_type": "markdown", "id": "157e41c7-e732-45b4-8075-49db21b7f852", "metadata": {}, "source": [ "# Pickling" ] }, { "cell_type": "code", "execution_count": 14, "id": "a4fcca1e-159f-43d5-90ab-522365ff5328", "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd\\nimport pickle'); }\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Explicit import: without it this cell only runs when pyforest's lazy imports are active.\n", "import pickle\n", "\n", "# Save the fitted vectorizer; the with-block closes the file so the pickle is flushed to disk.\n", "with open('pickle_files/count_vectorizer.pkl', 'wb') as vectorizer_file:\n", "    pickle.dump(cv, vectorizer_file)" ] }, { "cell_type": "code", "execution_count": 15, "id": "114f0150-63ad-43c0-a90d-3450647c50ae", "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd\\nimport pickle'); }\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Save the classifier fit on the full data set, closing the file deterministically.\n", "with open('pickle_files/spam_model.pkl', 'wb') as model_file:\n", "    pickle.dump(final_model, model_file)" ] }, { "cell_type": "code", "execution_count": null, "id": "eb2b40f9-aea0-44ae-9eec-b32c37f4831d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }