avngrstark commited on
Commit
6bad2a4
·
verified ·
1 Parent(s): 027e936

Upload 2 files

Browse files
Files changed (2) hide show
  1. data_preprocessing.ipynb +846 -0
  2. vectorizer_and_model.ipynb +1326 -0
data_preprocessing.ipynb ADDED
@@ -0,0 +1,846 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "90d1208c-18ee-43b2-aafb-c79d0b862687",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import nltk\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "\n",
14
+ "import re\n",
15
+ "from nltk.corpus import stopwords\n",
16
+ "from nltk.stem import PorterStemmer\n",
17
+ "from nltk.stem import WordNetLemmatizer\n",
18
+ "\n",
19
+ "stemmer = PorterStemmer()\n",
20
+ "lemmatizer = WordNetLemmatizer()"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "id": "583799b8-54f4-4faa-83a0-8d5da9ed6c1f",
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "data": {
31
+ "text/html": [
32
+ "<div>\n",
33
+ "<style scoped>\n",
34
+ " .dataframe tbody tr th:only-of-type {\n",
35
+ " vertical-align: middle;\n",
36
+ " }\n",
37
+ "\n",
38
+ " .dataframe tbody tr th {\n",
39
+ " vertical-align: top;\n",
40
+ " }\n",
41
+ "\n",
42
+ " .dataframe thead th {\n",
43
+ " text-align: right;\n",
44
+ " }\n",
45
+ "</style>\n",
46
+ "<table border=\"1\" class=\"dataframe\">\n",
47
+ " <thead>\n",
48
+ " <tr style=\"text-align: right;\">\n",
49
+ " <th></th>\n",
50
+ " <th>Label</th>\n",
51
+ " <th>Message</th>\n",
52
+ " </tr>\n",
53
+ " </thead>\n",
54
+ " <tbody>\n",
55
+ " </tbody>\n",
56
+ "</table>\n",
57
+ "</div>"
58
+ ],
59
+ "text/plain": [
60
+ "Empty DataFrame\n",
61
+ "Columns: [Label, Message]\n",
62
+ "Index: []"
63
+ ]
64
+ },
65
+ "execution_count": 2,
66
+ "metadata": {},
67
+ "output_type": "execute_result"
68
+ }
69
+ ],
70
+ "source": [
71
+ "full_data = pd.DataFrame({'Label':[], 'Message':[]})\n",
72
+ "full_data"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "markdown",
77
+ "id": "c7a03b9c-ae0c-49d0-b65c-73aa0b12f773",
78
+ "metadata": {},
79
+ "source": [
80
+ "# Dataset 1"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 3,
86
+ "id": "ab2c7f73-dce3-4c31-848b-741c3b68c418",
87
+ "metadata": {},
88
+ "outputs": [
89
+ {
90
+ "data": {
91
+ "text/html": [
92
+ "<div>\n",
93
+ "<style scoped>\n",
94
+ " .dataframe tbody tr th:only-of-type {\n",
95
+ " vertical-align: middle;\n",
96
+ " }\n",
97
+ "\n",
98
+ " .dataframe tbody tr th {\n",
99
+ " vertical-align: top;\n",
100
+ " }\n",
101
+ "\n",
102
+ " .dataframe thead th {\n",
103
+ " text-align: right;\n",
104
+ " }\n",
105
+ "</style>\n",
106
+ "<table border=\"1\" class=\"dataframe\">\n",
107
+ " <thead>\n",
108
+ " <tr style=\"text-align: right;\">\n",
109
+ " <th></th>\n",
110
+ " <th>v1</th>\n",
111
+ " <th>v2</th>\n",
112
+ " <th>Unnamed: 2</th>\n",
113
+ " <th>Unnamed: 3</th>\n",
114
+ " <th>Unnamed: 4</th>\n",
115
+ " </tr>\n",
116
+ " </thead>\n",
117
+ " <tbody>\n",
118
+ " <tr>\n",
119
+ " <th>0</th>\n",
120
+ " <td>ham</td>\n",
121
+ " <td>Go until jurong point, crazy.. Available only ...</td>\n",
122
+ " <td>NaN</td>\n",
123
+ " <td>NaN</td>\n",
124
+ " <td>NaN</td>\n",
125
+ " </tr>\n",
126
+ " <tr>\n",
127
+ " <th>1</th>\n",
128
+ " <td>ham</td>\n",
129
+ " <td>Ok lar... Joking wif u oni...</td>\n",
130
+ " <td>NaN</td>\n",
131
+ " <td>NaN</td>\n",
132
+ " <td>NaN</td>\n",
133
+ " </tr>\n",
134
+ " <tr>\n",
135
+ " <th>2</th>\n",
136
+ " <td>spam</td>\n",
137
+ " <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
138
+ " <td>NaN</td>\n",
139
+ " <td>NaN</td>\n",
140
+ " <td>NaN</td>\n",
141
+ " </tr>\n",
142
+ " <tr>\n",
143
+ " <th>3</th>\n",
144
+ " <td>ham</td>\n",
145
+ " <td>U dun say so early hor... U c already then say...</td>\n",
146
+ " <td>NaN</td>\n",
147
+ " <td>NaN</td>\n",
148
+ " <td>NaN</td>\n",
149
+ " </tr>\n",
150
+ " <tr>\n",
151
+ " <th>4</th>\n",
152
+ " <td>ham</td>\n",
153
+ " <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
154
+ " <td>NaN</td>\n",
155
+ " <td>NaN</td>\n",
156
+ " <td>NaN</td>\n",
157
+ " </tr>\n",
158
+ " </tbody>\n",
159
+ "</table>\n",
160
+ "</div>"
161
+ ],
162
+ "text/plain": [
163
+ " v1 v2 Unnamed: 2 \\\n",
164
+ "0 ham Go until jurong point, crazy.. Available only ... NaN \n",
165
+ "1 ham Ok lar... Joking wif u oni... NaN \n",
166
+ "2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n",
167
+ "3 ham U dun say so early hor... U c already then say... NaN \n",
168
+ "4 ham Nah I don't think he goes to usf, he lives aro... NaN \n",
169
+ "\n",
170
+ " Unnamed: 3 Unnamed: 4 \n",
171
+ "0 NaN NaN \n",
172
+ "1 NaN NaN \n",
173
+ "2 NaN NaN \n",
174
+ "3 NaN NaN \n",
175
+ "4 NaN NaN "
176
+ ]
177
+ },
178
+ "execution_count": 3,
179
+ "metadata": {},
180
+ "output_type": "execute_result"
181
+ }
182
+ ],
183
+ "source": [
184
+ "data = pd.read_csv(\"spam_data/spam_data_1.csv\", encoding='Windows-1252')\n",
185
+ "data.head()"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 4,
191
+ "id": "347db751-7bd6-4d6f-8cfd-4456a69ebc90",
192
+ "metadata": {},
193
+ "outputs": [
194
+ {
195
+ "name": "stderr",
196
+ "output_type": "stream",
197
+ "text": [
198
+ "C:\\Users\\thaku\\AppData\\Local\\Temp\\ipykernel_24436\\3848975045.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
199
+ " data['v1'] = data['v1'].replace(to_replace=['ham', 'spam'], value=[1, 0]).astype(int)\n"
200
+ ]
201
+ }
202
+ ],
203
+ "source": [
204
+ "data['v1'] = data['v1'].replace(to_replace=['ham', 'spam'], value=[1, 0]).astype(int)"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 5,
210
+ "id": "b4e18778-0900-4114-b405-92433f686d85",
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "for i in range(len(data)):\n",
215
+ " review = re.sub('[^a-zA-Z]', ' ', data['v2'][i])\n",
216
+ " review = review.lower()\n",
217
+ " review = review.split()\n",
218
+ " review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]\n",
219
+ " review = ' '.join(review)\n",
220
+ " data.loc[i, 'v2'] = review "
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": 6,
226
+ "id": "e7e5d709-fce0-459f-8df8-4daa7ec7f1e2",
227
+ "metadata": {},
228
+ "outputs": [],
229
+ "source": [
230
+ "data = data[['v1', 'v2']]\n",
231
+ "data = data.rename(columns={'v1':'Label', 'v2':'Message'})"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": 7,
237
+ "id": "d286a7a2-8bd7-4f3b-b6f1-71197c5dd234",
238
+ "metadata": {},
239
+ "outputs": [
240
+ {
241
+ "data": {
242
+ "text/html": [
243
+ "<div>\n",
244
+ "<style scoped>\n",
245
+ " .dataframe tbody tr th:only-of-type {\n",
246
+ " vertical-align: middle;\n",
247
+ " }\n",
248
+ "\n",
249
+ " .dataframe tbody tr th {\n",
250
+ " vertical-align: top;\n",
251
+ " }\n",
252
+ "\n",
253
+ " .dataframe thead th {\n",
254
+ " text-align: right;\n",
255
+ " }\n",
256
+ "</style>\n",
257
+ "<table border=\"1\" class=\"dataframe\">\n",
258
+ " <thead>\n",
259
+ " <tr style=\"text-align: right;\">\n",
260
+ " <th></th>\n",
261
+ " <th>Label</th>\n",
262
+ " <th>Message</th>\n",
263
+ " </tr>\n",
264
+ " </thead>\n",
265
+ " <tbody>\n",
266
+ " <tr>\n",
267
+ " <th>0</th>\n",
268
+ " <td>1</td>\n",
269
+ " <td>go jurong point crazy available bugis n great ...</td>\n",
270
+ " </tr>\n",
271
+ " <tr>\n",
272
+ " <th>1</th>\n",
273
+ " <td>1</td>\n",
274
+ " <td>ok lar joking wif u oni</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>2</th>\n",
278
+ " <td>0</td>\n",
279
+ " <td>free entry wkly comp win fa cup final tkts st ...</td>\n",
280
+ " </tr>\n",
281
+ " <tr>\n",
282
+ " <th>3</th>\n",
283
+ " <td>1</td>\n",
284
+ " <td>u dun say early hor u c already say</td>\n",
285
+ " </tr>\n",
286
+ " <tr>\n",
287
+ " <th>4</th>\n",
288
+ " <td>1</td>\n",
289
+ " <td>nah think go usf life around though</td>\n",
290
+ " </tr>\n",
291
+ " <tr>\n",
292
+ " <th>...</th>\n",
293
+ " <td>...</td>\n",
294
+ " <td>...</td>\n",
295
+ " </tr>\n",
296
+ " <tr>\n",
297
+ " <th>5567</th>\n",
298
+ " <td>0</td>\n",
299
+ " <td>nd time tried contact u u pound prize claim ea...</td>\n",
300
+ " </tr>\n",
301
+ " <tr>\n",
302
+ " <th>5568</th>\n",
303
+ " <td>1</td>\n",
304
+ " <td>b going esplanade fr home</td>\n",
305
+ " </tr>\n",
306
+ " <tr>\n",
307
+ " <th>5569</th>\n",
308
+ " <td>1</td>\n",
309
+ " <td>pity mood suggestion</td>\n",
310
+ " </tr>\n",
311
+ " <tr>\n",
312
+ " <th>5570</th>\n",
313
+ " <td>1</td>\n",
314
+ " <td>guy bitching acted like interested buying some...</td>\n",
315
+ " </tr>\n",
316
+ " <tr>\n",
317
+ " <th>5571</th>\n",
318
+ " <td>1</td>\n",
319
+ " <td>rofl true name</td>\n",
320
+ " </tr>\n",
321
+ " </tbody>\n",
322
+ "</table>\n",
323
+ "<p>5572 rows × 2 columns</p>\n",
324
+ "</div>"
325
+ ],
326
+ "text/plain": [
327
+ " Label Message\n",
328
+ "0 1 go jurong point crazy available bugis n great ...\n",
329
+ "1 1 ok lar joking wif u oni\n",
330
+ "2 0 free entry wkly comp win fa cup final tkts st ...\n",
331
+ "3 1 u dun say early hor u c already say\n",
332
+ "4 1 nah think go usf life around though\n",
333
+ "... ... ...\n",
334
+ "5567 0 nd time tried contact u u pound prize claim ea...\n",
335
+ "5568 1 b going esplanade fr home\n",
336
+ "5569 1 pity mood suggestion\n",
337
+ "5570 1 guy bitching acted like interested buying some...\n",
338
+ "5571 1 rofl true name\n",
339
+ "\n",
340
+ "[5572 rows x 2 columns]"
341
+ ]
342
+ },
343
+ "execution_count": 7,
344
+ "metadata": {},
345
+ "output_type": "execute_result"
346
+ }
347
+ ],
348
+ "source": [
349
+ "data"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 8,
355
+ "id": "d337a825-5af8-40b5-88b1-f6ec7ca5bca2",
356
+ "metadata": {},
357
+ "outputs": [
358
+ {
359
+ "data": {
360
+ "text/html": [
361
+ "<div>\n",
362
+ "<style scoped>\n",
363
+ " .dataframe tbody tr th:only-of-type {\n",
364
+ " vertical-align: middle;\n",
365
+ " }\n",
366
+ "\n",
367
+ " .dataframe tbody tr th {\n",
368
+ " vertical-align: top;\n",
369
+ " }\n",
370
+ "\n",
371
+ " .dataframe thead th {\n",
372
+ " text-align: right;\n",
373
+ " }\n",
374
+ "</style>\n",
375
+ "<table border=\"1\" class=\"dataframe\">\n",
376
+ " <thead>\n",
377
+ " <tr style=\"text-align: right;\">\n",
378
+ " <th></th>\n",
379
+ " <th>Label</th>\n",
380
+ " <th>Message</th>\n",
381
+ " </tr>\n",
382
+ " </thead>\n",
383
+ " <tbody>\n",
384
+ " <tr>\n",
385
+ " <th>0</th>\n",
386
+ " <td>1.0</td>\n",
387
+ " <td>go jurong point crazy available bugis n great ...</td>\n",
388
+ " </tr>\n",
389
+ " <tr>\n",
390
+ " <th>1</th>\n",
391
+ " <td>1.0</td>\n",
392
+ " <td>ok lar joking wif u oni</td>\n",
393
+ " </tr>\n",
394
+ " <tr>\n",
395
+ " <th>2</th>\n",
396
+ " <td>0.0</td>\n",
397
+ " <td>free entry wkly comp win fa cup final tkts st ...</td>\n",
398
+ " </tr>\n",
399
+ " <tr>\n",
400
+ " <th>3</th>\n",
401
+ " <td>1.0</td>\n",
402
+ " <td>u dun say early hor u c already say</td>\n",
403
+ " </tr>\n",
404
+ " <tr>\n",
405
+ " <th>4</th>\n",
406
+ " <td>1.0</td>\n",
407
+ " <td>nah think go usf life around though</td>\n",
408
+ " </tr>\n",
409
+ " <tr>\n",
410
+ " <th>...</th>\n",
411
+ " <td>...</td>\n",
412
+ " <td>...</td>\n",
413
+ " </tr>\n",
414
+ " <tr>\n",
415
+ " <th>5567</th>\n",
416
+ " <td>0.0</td>\n",
417
+ " <td>nd time tried contact u u pound prize claim ea...</td>\n",
418
+ " </tr>\n",
419
+ " <tr>\n",
420
+ " <th>5568</th>\n",
421
+ " <td>1.0</td>\n",
422
+ " <td>b going esplanade fr home</td>\n",
423
+ " </tr>\n",
424
+ " <tr>\n",
425
+ " <th>5569</th>\n",
426
+ " <td>1.0</td>\n",
427
+ " <td>pity mood suggestion</td>\n",
428
+ " </tr>\n",
429
+ " <tr>\n",
430
+ " <th>5570</th>\n",
431
+ " <td>1.0</td>\n",
432
+ " <td>guy bitching acted like interested buying some...</td>\n",
433
+ " </tr>\n",
434
+ " <tr>\n",
435
+ " <th>5571</th>\n",
436
+ " <td>1.0</td>\n",
437
+ " <td>rofl true name</td>\n",
438
+ " </tr>\n",
439
+ " </tbody>\n",
440
+ "</table>\n",
441
+ "<p>5572 rows × 2 columns</p>\n",
442
+ "</div>"
443
+ ],
444
+ "text/plain": [
445
+ " Label Message\n",
446
+ "0 1.0 go jurong point crazy available bugis n great ...\n",
447
+ "1 1.0 ok lar joking wif u oni\n",
448
+ "2 0.0 free entry wkly comp win fa cup final tkts st ...\n",
449
+ "3 1.0 u dun say early hor u c already say\n",
450
+ "4 1.0 nah think go usf life around though\n",
451
+ "... ... ...\n",
452
+ "5567 0.0 nd time tried contact u u pound prize claim ea...\n",
453
+ "5568 1.0 b going esplanade fr home\n",
454
+ "5569 1.0 pity mood suggestion\n",
455
+ "5570 1.0 guy bitching acted like interested buying some...\n",
456
+ "5571 1.0 rofl true name\n",
457
+ "\n",
458
+ "[5572 rows x 2 columns]"
459
+ ]
460
+ },
461
+ "execution_count": 8,
462
+ "metadata": {},
463
+ "output_type": "execute_result"
464
+ }
465
+ ],
466
+ "source": [
467
+ "full_data = pd.concat([full_data, data], ignore_index=True)\n",
468
+ "full_data"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "markdown",
473
+ "id": "e903b4ef-f47c-4df6-9775-f920b9a91ad1",
474
+ "metadata": {},
475
+ "source": [
476
+ "# Dataset 2"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "code",
481
+ "execution_count": 9,
482
+ "id": "ea1cbd53-160c-481e-b911-7c03f672de9b",
483
+ "metadata": {},
484
+ "outputs": [
485
+ {
486
+ "data": {
487
+ "text/html": [
488
+ "<div>\n",
489
+ "<style scoped>\n",
490
+ " .dataframe tbody tr th:only-of-type {\n",
491
+ " vertical-align: middle;\n",
492
+ " }\n",
493
+ "\n",
494
+ " .dataframe tbody tr th {\n",
495
+ " vertical-align: top;\n",
496
+ " }\n",
497
+ "\n",
498
+ " .dataframe thead th {\n",
499
+ " text-align: right;\n",
500
+ " }\n",
501
+ "</style>\n",
502
+ "<table border=\"1\" class=\"dataframe\">\n",
503
+ " <thead>\n",
504
+ " <tr style=\"text-align: right;\">\n",
505
+ " <th></th>\n",
506
+ " <th>email</th>\n",
507
+ " <th>label</th>\n",
508
+ " </tr>\n",
509
+ " </thead>\n",
510
+ " <tbody>\n",
511
+ " <tr>\n",
512
+ " <th>0</th>\n",
513
+ " <td>date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...</td>\n",
514
+ " <td>0</td>\n",
515
+ " </tr>\n",
516
+ " <tr>\n",
517
+ " <th>1</th>\n",
518
+ " <td>martin a posted tassos papadopoulos the greek ...</td>\n",
519
+ " <td>0</td>\n",
520
+ " </tr>\n",
521
+ " <tr>\n",
522
+ " <th>2</th>\n",
523
+ " <td>man threatens explosion in moscow thursday aug...</td>\n",
524
+ " <td>0</td>\n",
525
+ " </tr>\n",
526
+ " <tr>\n",
527
+ " <th>3</th>\n",
528
+ " <td>klez the virus that won t die already the most...</td>\n",
529
+ " <td>0</td>\n",
530
+ " </tr>\n",
531
+ " <tr>\n",
532
+ " <th>4</th>\n",
533
+ " <td>in adding cream to spaghetti carbonara which ...</td>\n",
534
+ " <td>0</td>\n",
535
+ " </tr>\n",
536
+ " <tr>\n",
537
+ " <th>...</th>\n",
538
+ " <td>...</td>\n",
539
+ " <td>...</td>\n",
540
+ " </tr>\n",
541
+ " <tr>\n",
542
+ " <th>2995</th>\n",
543
+ " <td>abc s good morning america ranks it the NUMBE...</td>\n",
544
+ " <td>1</td>\n",
545
+ " </tr>\n",
546
+ " <tr>\n",
547
+ " <th>2996</th>\n",
548
+ " <td>hyperlink hyperlink hyperlink let mortgage le...</td>\n",
549
+ " <td>1</td>\n",
550
+ " </tr>\n",
551
+ " <tr>\n",
552
+ " <th>2997</th>\n",
553
+ " <td>thank you for shopping with us gifts for all ...</td>\n",
554
+ " <td>1</td>\n",
555
+ " </tr>\n",
556
+ " <tr>\n",
557
+ " <th>2998</th>\n",
558
+ " <td>the famous ebay marketing e course learn to s...</td>\n",
559
+ " <td>1</td>\n",
560
+ " </tr>\n",
561
+ " <tr>\n",
562
+ " <th>2999</th>\n",
563
+ " <td>hello this is chinese traditional 子 件 NUMBER世...</td>\n",
564
+ " <td>1</td>\n",
565
+ " </tr>\n",
566
+ " </tbody>\n",
567
+ "</table>\n",
568
+ "<p>3000 rows × 2 columns</p>\n",
569
+ "</div>"
570
+ ],
571
+ "text/plain": [
572
+ " email label\n",
573
+ "0 date wed NUMBER aug NUMBER NUMBER NUMBER NUMB... 0\n",
574
+ "1 martin a posted tassos papadopoulos the greek ... 0\n",
575
+ "2 man threatens explosion in moscow thursday aug... 0\n",
576
+ "3 klez the virus that won t die already the most... 0\n",
577
+ "4 in adding cream to spaghetti carbonara which ... 0\n",
578
+ "... ... ...\n",
579
+ "2995 abc s good morning america ranks it the NUMBE... 1\n",
580
+ "2996 hyperlink hyperlink hyperlink let mortgage le... 1\n",
581
+ "2997 thank you for shopping with us gifts for all ... 1\n",
582
+ "2998 the famous ebay marketing e course learn to s... 1\n",
583
+ "2999 hello this is chinese traditional 子 件 NUMBER世... 1\n",
584
+ "\n",
585
+ "[3000 rows x 2 columns]"
586
+ ]
587
+ },
588
+ "execution_count": 9,
589
+ "metadata": {},
590
+ "output_type": "execute_result"
591
+ }
592
+ ],
593
+ "source": [
594
+ "data = pd.read_csv('spam_data/spam_data_2.csv')\n",
595
+ "data"
596
+ ]
597
+ },
598
+ {
599
+ "cell_type": "code",
600
+ "execution_count": 10,
601
+ "id": "25352c21-ce85-4b17-90da-48fb4a959844",
602
+ "metadata": {},
603
+ "outputs": [],
604
+ "source": [
605
+ "data = data.dropna()\n",
606
+ "data = data.reset_index(drop=True)"
607
+ ]
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": 11,
612
+ "id": "7b07e8f2-be13-4d87-a5fd-eecaf2408f61",
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": [
616
+ "data['label'] = data['label'].replace(to_replace=[0, 1], value=[1, 0]).astype(int)"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 12,
622
+ "id": "aadb5b45-e08f-456c-940b-457936bf49f0",
623
+ "metadata": {
624
+ "scrolled": true
625
+ },
626
+ "outputs": [
627
+ {
628
+ "name": "stdout",
629
+ "output_type": "stream",
630
+ "text": [
631
+ "messages processed : 0\n",
632
+ "messages processed : 100\n",
633
+ "messages processed : 200\n",
634
+ "messages processed : 300\n",
635
+ "messages processed : 400\n",
636
+ "messages processed : 500\n",
637
+ "messages processed : 600\n",
638
+ "messages processed : 700\n",
639
+ "messages processed : 800\n",
640
+ "messages processed : 900\n",
641
+ "messages processed : 1000\n",
642
+ "messages processed : 1100\n",
643
+ "messages processed : 1200\n",
644
+ "messages processed : 1300\n",
645
+ "messages processed : 1400\n",
646
+ "messages processed : 1500\n",
647
+ "messages processed : 1600\n",
648
+ "messages processed : 1700\n",
649
+ "messages processed : 1800\n",
650
+ "messages processed : 1900\n",
651
+ "messages processed : 2000\n",
652
+ "messages processed : 2100\n",
653
+ "messages processed : 2200\n",
654
+ "messages processed : 2300\n",
655
+ "messages processed : 2400\n",
656
+ "messages processed : 2500\n",
657
+ "messages processed : 2600\n",
658
+ "messages processed : 2700\n",
659
+ "messages processed : 2800\n",
660
+ "messages processed : 2900\n"
661
+ ]
662
+ }
663
+ ],
664
+ "source": [
665
+ "for i in range(len(data)):\n",
666
+ " review = re.sub('[^a-zA-Z]', ' ', data['email'][i])\n",
667
+ " review = review.lower()\n",
668
+ " review = review.split()\n",
669
+ " review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]\n",
670
+ " review = ' '.join(review)\n",
671
+ " data.loc[i, 'email'] = review\n",
672
+ " if i%100==0:\n",
673
+ " print('messages processed :' ,i)"
674
+ ]
675
+ },
676
+ {
677
+ "cell_type": "code",
678
+ "execution_count": 13,
679
+ "id": "fbe8e35f-979c-4a93-abab-a2dd7be2eee5",
680
+ "metadata": {},
681
+ "outputs": [],
682
+ "source": [
683
+ "data = data[['label', 'email']]\n",
684
+ "data = data.rename(columns={'label':'Label', 'email':'Message'})"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 14,
690
+ "id": "14abf300-2450-43e8-8473-2d80ba810889",
691
+ "metadata": {},
692
+ "outputs": [
693
+ {
694
+ "data": {
695
+ "text/html": [
696
+ "<div>\n",
697
+ "<style scoped>\n",
698
+ " .dataframe tbody tr th:only-of-type {\n",
699
+ " vertical-align: middle;\n",
700
+ " }\n",
701
+ "\n",
702
+ " .dataframe tbody tr th {\n",
703
+ " vertical-align: top;\n",
704
+ " }\n",
705
+ "\n",
706
+ " .dataframe thead th {\n",
707
+ " text-align: right;\n",
708
+ " }\n",
709
+ "</style>\n",
710
+ "<table border=\"1\" class=\"dataframe\">\n",
711
+ " <thead>\n",
712
+ " <tr style=\"text-align: right;\">\n",
713
+ " <th></th>\n",
714
+ " <th>Label</th>\n",
715
+ " <th>Message</th>\n",
716
+ " </tr>\n",
717
+ " </thead>\n",
718
+ " <tbody>\n",
719
+ " <tr>\n",
720
+ " <th>0</th>\n",
721
+ " <td>1.0</td>\n",
722
+ " <td>go jurong point crazy available bugis n great ...</td>\n",
723
+ " </tr>\n",
724
+ " <tr>\n",
725
+ " <th>1</th>\n",
726
+ " <td>1.0</td>\n",
727
+ " <td>ok lar joking wif u oni</td>\n",
728
+ " </tr>\n",
729
+ " <tr>\n",
730
+ " <th>2</th>\n",
731
+ " <td>0.0</td>\n",
732
+ " <td>free entry wkly comp win fa cup final tkts st ...</td>\n",
733
+ " </tr>\n",
734
+ " <tr>\n",
735
+ " <th>3</th>\n",
736
+ " <td>1.0</td>\n",
737
+ " <td>u dun say early hor u c already say</td>\n",
738
+ " </tr>\n",
739
+ " <tr>\n",
740
+ " <th>4</th>\n",
741
+ " <td>1.0</td>\n",
742
+ " <td>nah think go usf life around though</td>\n",
743
+ " </tr>\n",
744
+ " <tr>\n",
745
+ " <th>...</th>\n",
746
+ " <td>...</td>\n",
747
+ " <td>...</td>\n",
748
+ " </tr>\n",
749
+ " <tr>\n",
750
+ " <th>8566</th>\n",
751
+ " <td>0.0</td>\n",
752
+ " <td>abc good morning america rank number christmas...</td>\n",
753
+ " </tr>\n",
754
+ " <tr>\n",
755
+ " <th>8567</th>\n",
756
+ " <td>0.0</td>\n",
757
+ " <td>hyperlink hyperlink hyperlink let mortgage len...</td>\n",
758
+ " </tr>\n",
759
+ " <tr>\n",
760
+ " <th>8568</th>\n",
761
+ " <td>0.0</td>\n",
762
+ " <td>thank shopping u gift occasion free gift numbe...</td>\n",
763
+ " </tr>\n",
764
+ " <tr>\n",
765
+ " <th>8569</th>\n",
766
+ " <td>0.0</td>\n",
767
+ " <td>famous ebay marketing e course learn sell comp...</td>\n",
768
+ " </tr>\n",
769
+ " <tr>\n",
770
+ " <th>8570</th>\n",
771
+ " <td>0.0</td>\n",
772
+ " <td>hello chinese traditional number number f r v ...</td>\n",
773
+ " </tr>\n",
774
+ " </tbody>\n",
775
+ "</table>\n",
776
+ "<p>8571 rows × 2 columns</p>\n",
777
+ "</div>"
778
+ ],
779
+ "text/plain": [
780
+ " Label Message\n",
781
+ "0 1.0 go jurong point crazy available bugis n great ...\n",
782
+ "1 1.0 ok lar joking wif u oni\n",
783
+ "2 0.0 free entry wkly comp win fa cup final tkts st ...\n",
784
+ "3 1.0 u dun say early hor u c already say\n",
785
+ "4 1.0 nah think go usf life around though\n",
786
+ "... ... ...\n",
787
+ "8566 0.0 abc good morning america rank number christmas...\n",
788
+ "8567 0.0 hyperlink hyperlink hyperlink let mortgage len...\n",
789
+ "8568 0.0 thank shopping u gift occasion free gift numbe...\n",
790
+ "8569 0.0 famous ebay marketing e course learn sell comp...\n",
791
+ "8570 0.0 hello chinese traditional number number f r v ...\n",
792
+ "\n",
793
+ "[8571 rows x 2 columns]"
794
+ ]
795
+ },
796
+ "execution_count": 14,
797
+ "metadata": {},
798
+ "output_type": "execute_result"
799
+ }
800
+ ],
801
+ "source": [
802
+ "full_data = pd.concat([full_data, data], ignore_index=True)\n",
803
+ "full_data"
804
+ ]
805
+ },
806
+ {
807
+ "cell_type": "code",
808
+ "execution_count": 15,
809
+ "id": "bf09a641-c36f-434d-8d5a-3fd62ae1dac8",
810
+ "metadata": {},
811
+ "outputs": [],
812
+ "source": [
813
+ "full_data.to_csv('spam_data/full_data.csv', index=False)"
814
+ ]
815
+ },
816
+ {
817
+ "cell_type": "code",
818
+ "execution_count": null,
819
+ "id": "5f84c43d-2915-4d99-b1c5-85cb65113c8c",
820
+ "metadata": {},
821
+ "outputs": [],
822
+ "source": []
823
+ }
824
+ ],
825
+ "metadata": {
826
+ "kernelspec": {
827
+ "display_name": "Python 3 (ipykernel)",
828
+ "language": "python",
829
+ "name": "python3"
830
+ },
831
+ "language_info": {
832
+ "codemirror_mode": {
833
+ "name": "ipython",
834
+ "version": 3
835
+ },
836
+ "file_extension": ".py",
837
+ "mimetype": "text/x-python",
838
+ "name": "python",
839
+ "nbconvert_exporter": "python",
840
+ "pygments_lexer": "ipython3",
841
+ "version": "3.12.4"
842
+ }
843
+ },
844
+ "nbformat": 4,
845
+ "nbformat_minor": 5
846
+ }
vectorizer_and_model.ipynb ADDED
@@ -0,0 +1,1326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "e01e2899-35c2-4707-b271-433599ded8f6",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Read data"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "1ecabbad-ed2b-48dc-ac3f-1b04e5cd9014",
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "data": {
19
+ "application/javascript": [
20
+ "\n",
21
+ " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd'); }\n",
22
+ " "
23
+ ],
24
+ "text/plain": [
25
+ "<IPython.core.display.Javascript object>"
26
+ ]
27
+ },
28
+ "metadata": {},
29
+ "output_type": "display_data"
30
+ },
31
+ {
32
+ "data": {
33
+ "text/html": [
34
+ "<div>\n",
35
+ "<style scoped>\n",
36
+ " .dataframe tbody tr th:only-of-type {\n",
37
+ " vertical-align: middle;\n",
38
+ " }\n",
39
+ "\n",
40
+ " .dataframe tbody tr th {\n",
41
+ " vertical-align: top;\n",
42
+ " }\n",
43
+ "\n",
44
+ " .dataframe thead th {\n",
45
+ " text-align: right;\n",
46
+ " }\n",
47
+ "</style>\n",
48
+ "<table border=\"1\" class=\"dataframe\">\n",
49
+ " <thead>\n",
50
+ " <tr style=\"text-align: right;\">\n",
51
+ " <th></th>\n",
52
+ " <th>Label</th>\n",
53
+ " <th>Message</th>\n",
54
+ " </tr>\n",
55
+ " </thead>\n",
56
+ " <tbody>\n",
57
+ " <tr>\n",
58
+ " <th>0</th>\n",
59
+ " <td>1.0</td>\n",
60
+ " <td>go jurong point crazy available bugis n great ...</td>\n",
61
+ " </tr>\n",
62
+ " <tr>\n",
63
+ " <th>1</th>\n",
64
+ " <td>1.0</td>\n",
65
+ " <td>ok lar joking wif u oni</td>\n",
66
+ " </tr>\n",
67
+ " <tr>\n",
68
+ " <th>2</th>\n",
69
+ " <td>0.0</td>\n",
70
+ " <td>free entry wkly comp win fa cup final tkts st ...</td>\n",
71
+ " </tr>\n",
72
+ " <tr>\n",
73
+ " <th>3</th>\n",
74
+ " <td>1.0</td>\n",
75
+ " <td>u dun say early hor u c already say</td>\n",
76
+ " </tr>\n",
77
+ " <tr>\n",
78
+ " <th>4</th>\n",
79
+ " <td>1.0</td>\n",
80
+ " <td>nah think go usf life around though</td>\n",
81
+ " </tr>\n",
82
+ " <tr>\n",
83
+ " <th>...</th>\n",
84
+ " <td>...</td>\n",
85
+ " <td>...</td>\n",
86
+ " </tr>\n",
87
+ " <tr>\n",
88
+ " <th>8566</th>\n",
89
+ " <td>0.0</td>\n",
90
+ " <td>abc good morning america rank number christmas...</td>\n",
91
+ " </tr>\n",
92
+ " <tr>\n",
93
+ " <th>8567</th>\n",
94
+ " <td>0.0</td>\n",
95
+ " <td>hyperlink hyperlink hyperlink let mortgage len...</td>\n",
96
+ " </tr>\n",
97
+ " <tr>\n",
98
+ " <th>8568</th>\n",
99
+ " <td>0.0</td>\n",
100
+ " <td>thank shopping u gift occasion free gift numbe...</td>\n",
101
+ " </tr>\n",
102
+ " <tr>\n",
103
+ " <th>8569</th>\n",
104
+ " <td>0.0</td>\n",
105
+ " <td>famous ebay marketing e course learn sell comp...</td>\n",
106
+ " </tr>\n",
107
+ " <tr>\n",
108
+ " <th>8570</th>\n",
109
+ " <td>0.0</td>\n",
110
+ " <td>hello chinese traditional number number f r v ...</td>\n",
111
+ " </tr>\n",
112
+ " </tbody>\n",
113
+ "</table>\n",
114
+ "<p>8571 rows × 2 columns</p>\n",
115
+ "</div>"
116
+ ],
117
+ "text/plain": [
118
+ " Label Message\n",
119
+ "0 1.0 go jurong point crazy available bugis n great ...\n",
120
+ "1 1.0 ok lar joking wif u oni\n",
121
+ "2 0.0 free entry wkly comp win fa cup final tkts st ...\n",
122
+ "3 1.0 u dun say early hor u c already say\n",
123
+ "4 1.0 nah think go usf life around though\n",
124
+ "... ... ...\n",
125
+ "8566 0.0 abc good morning america rank number christmas...\n",
126
+ "8567 0.0 hyperlink hyperlink hyperlink let mortgage len...\n",
127
+ "8568 0.0 thank shopping u gift occasion free gift numbe...\n",
128
+ "8569 0.0 famous ebay marketing e course learn sell comp...\n",
129
+ "8570 0.0 hello chinese traditional number number f r v ...\n",
130
+ "\n",
131
+ "[8571 rows x 2 columns]"
132
+ ]
133
+ },
134
+ "execution_count": 1,
135
+ "metadata": {},
136
+ "output_type": "execute_result"
137
+ }
138
+ ],
139
+ "source": [
140
+ "full_data = pd.read_csv('spam_data/full_data.csv')\n",
141
+ "full_data"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 2,
147
+ "id": "3c2c7ed2-6d48-4bac-a9f9-a2220e67dbc2",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "full_data = full_data.dropna()"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 3,
157
+ "id": "c537fe92-ce4c-4aac-9da9-12259d5039f7",
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "data": {
162
+ "text/plain": [
163
+ "Label 0\n",
164
+ "Message 0\n",
165
+ "dtype: int64"
166
+ ]
167
+ },
168
+ "execution_count": 3,
169
+ "metadata": {},
170
+ "output_type": "execute_result"
171
+ }
172
+ ],
173
+ "source": [
174
+ "full_data.isnull().sum()"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "markdown",
179
+ "id": "483a4715-8949-42e7-8099-4bb970289271",
180
+ "metadata": {},
181
+ "source": [
182
+ "# Vectorizer"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 4,
188
+ "id": "4a1067a4-7dba-43e7-ac34-a503f87c29ce",
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
193
+ "cv = CountVectorizer(max_features=5000)\n",
194
+ "X = cv.fit_transform(full_data['Message']).toarray()"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 5,
200
+ "id": "a3941df7-ff2c-4ee5-b806-e7bcb0633274",
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "data": {
205
+ "text/plain": [
206
+ "(8561, 5000)"
207
+ ]
208
+ },
209
+ "execution_count": 5,
210
+ "metadata": {},
211
+ "output_type": "execute_result"
212
+ }
213
+ ],
214
+ "source": [
215
+ "X.shape"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 6,
221
+ "id": "119f45c9-f411-4c93-9447-a02b38724d62",
222
+ "metadata": {},
223
+ "outputs": [
224
+ {
225
+ "data": {
226
+ "text/plain": [
227
+ "array([[0, 0, 0, ..., 0, 0, 0],\n",
228
+ " [0, 0, 0, ..., 0, 0, 0],\n",
229
+ " [0, 0, 0, ..., 0, 0, 0],\n",
230
+ " ...,\n",
231
+ " [0, 0, 0, ..., 0, 0, 0],\n",
232
+ " [0, 0, 0, ..., 0, 0, 0],\n",
233
+ " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
234
+ ]
235
+ },
236
+ "execution_count": 6,
237
+ "metadata": {},
238
+ "output_type": "execute_result"
239
+ }
240
+ ],
241
+ "source": [
242
+ "X"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 7,
248
+ "id": "8e97ecfd-472f-42ac-9ae5-61308d6df041",
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "data": {
253
+ "text/plain": [
254
+ "0 1.0\n",
255
+ "1 1.0\n",
256
+ "2 0.0\n",
257
+ "3 1.0\n",
258
+ "4 1.0\n",
259
+ " ... \n",
260
+ "8566 0.0\n",
261
+ "8567 0.0\n",
262
+ "8568 0.0\n",
263
+ "8569 0.0\n",
264
+ "8570 0.0\n",
265
+ "Name: Label, Length: 8561, dtype: float64"
266
+ ]
267
+ },
268
+ "execution_count": 7,
269
+ "metadata": {},
270
+ "output_type": "execute_result"
271
+ }
272
+ ],
273
+ "source": [
274
+ "y = full_data['Label']\n",
275
+ "y"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "markdown",
280
+ "id": "9ef6a206-3304-413f-9d5b-ef46d8c87206",
281
+ "metadata": {},
282
+ "source": [
283
+ "# Model"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": 8,
289
+ "id": "66969958-54c1-404d-88c1-b0c694b39527",
290
+ "metadata": {},
291
+ "outputs": [],
292
+ "source": [
293
+ "from sklearn.model_selection import train_test_split\n",
294
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 9,
300
+ "id": "82d7b37f-ae5e-41f1-acd2-24b361dd41c4",
301
+ "metadata": {},
302
+ "outputs": [
303
+ {
304
+ "data": {
305
+ "text/html": [
306
+ "<style>#sk-container-id-1 {\n",
307
+ " /* Definition of color scheme common for light and dark mode */\n",
308
+ " --sklearn-color-text: black;\n",
309
+ " --sklearn-color-line: gray;\n",
310
+ " /* Definition of color scheme for unfitted estimators */\n",
311
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
312
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
313
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
314
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
315
+ " /* Definition of color scheme for fitted estimators */\n",
316
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
317
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
318
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
319
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
320
+ "\n",
321
+ " /* Specific color for light theme */\n",
322
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
323
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
324
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
325
+ " --sklearn-color-icon: #696969;\n",
326
+ "\n",
327
+ " @media (prefers-color-scheme: dark) {\n",
328
+ " /* Redefinition of color scheme for dark theme */\n",
329
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
330
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
331
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
332
+ " --sklearn-color-icon: #878787;\n",
333
+ " }\n",
334
+ "}\n",
335
+ "\n",
336
+ "#sk-container-id-1 {\n",
337
+ " color: var(--sklearn-color-text);\n",
338
+ "}\n",
339
+ "\n",
340
+ "#sk-container-id-1 pre {\n",
341
+ " padding: 0;\n",
342
+ "}\n",
343
+ "\n",
344
+ "#sk-container-id-1 input.sk-hidden--visually {\n",
345
+ " border: 0;\n",
346
+ " clip: rect(1px 1px 1px 1px);\n",
347
+ " clip: rect(1px, 1px, 1px, 1px);\n",
348
+ " height: 1px;\n",
349
+ " margin: -1px;\n",
350
+ " overflow: hidden;\n",
351
+ " padding: 0;\n",
352
+ " position: absolute;\n",
353
+ " width: 1px;\n",
354
+ "}\n",
355
+ "\n",
356
+ "#sk-container-id-1 div.sk-dashed-wrapped {\n",
357
+ " border: 1px dashed var(--sklearn-color-line);\n",
358
+ " margin: 0 0.4em 0.5em 0.4em;\n",
359
+ " box-sizing: border-box;\n",
360
+ " padding-bottom: 0.4em;\n",
361
+ " background-color: var(--sklearn-color-background);\n",
362
+ "}\n",
363
+ "\n",
364
+ "#sk-container-id-1 div.sk-container {\n",
365
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
366
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
367
+ " so we also need the `!important` here to be able to override the\n",
368
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
369
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
370
+ " display: inline-block !important;\n",
371
+ " position: relative;\n",
372
+ "}\n",
373
+ "\n",
374
+ "#sk-container-id-1 div.sk-text-repr-fallback {\n",
375
+ " display: none;\n",
376
+ "}\n",
377
+ "\n",
378
+ "div.sk-parallel-item,\n",
379
+ "div.sk-serial,\n",
380
+ "div.sk-item {\n",
381
+ " /* draw centered vertical line to link estimators */\n",
382
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
383
+ " background-size: 2px 100%;\n",
384
+ " background-repeat: no-repeat;\n",
385
+ " background-position: center center;\n",
386
+ "}\n",
387
+ "\n",
388
+ "/* Parallel-specific style estimator block */\n",
389
+ "\n",
390
+ "#sk-container-id-1 div.sk-parallel-item::after {\n",
391
+ " content: \"\";\n",
392
+ " width: 100%;\n",
393
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
394
+ " flex-grow: 1;\n",
395
+ "}\n",
396
+ "\n",
397
+ "#sk-container-id-1 div.sk-parallel {\n",
398
+ " display: flex;\n",
399
+ " align-items: stretch;\n",
400
+ " justify-content: center;\n",
401
+ " background-color: var(--sklearn-color-background);\n",
402
+ " position: relative;\n",
403
+ "}\n",
404
+ "\n",
405
+ "#sk-container-id-1 div.sk-parallel-item {\n",
406
+ " display: flex;\n",
407
+ " flex-direction: column;\n",
408
+ "}\n",
409
+ "\n",
410
+ "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
411
+ " align-self: flex-end;\n",
412
+ " width: 50%;\n",
413
+ "}\n",
414
+ "\n",
415
+ "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
416
+ " align-self: flex-start;\n",
417
+ " width: 50%;\n",
418
+ "}\n",
419
+ "\n",
420
+ "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
421
+ " width: 0;\n",
422
+ "}\n",
423
+ "\n",
424
+ "/* Serial-specific style estimator block */\n",
425
+ "\n",
426
+ "#sk-container-id-1 div.sk-serial {\n",
427
+ " display: flex;\n",
428
+ " flex-direction: column;\n",
429
+ " align-items: center;\n",
430
+ " background-color: var(--sklearn-color-background);\n",
431
+ " padding-right: 1em;\n",
432
+ " padding-left: 1em;\n",
433
+ "}\n",
434
+ "\n",
435
+ "\n",
436
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
437
+ "clickable and can be expanded/collapsed.\n",
438
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
439
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
440
+ "*/\n",
441
+ "\n",
442
+ "/* Pipeline and ColumnTransformer style (default) */\n",
443
+ "\n",
444
+ "#sk-container-id-1 div.sk-toggleable {\n",
445
+ " /* Default theme specific background. It is overwritten whether we have a\n",
446
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
447
+ " background-color: var(--sklearn-color-background);\n",
448
+ "}\n",
449
+ "\n",
450
+ "/* Toggleable label */\n",
451
+ "#sk-container-id-1 label.sk-toggleable__label {\n",
452
+ " cursor: pointer;\n",
453
+ " display: block;\n",
454
+ " width: 100%;\n",
455
+ " margin-bottom: 0;\n",
456
+ " padding: 0.5em;\n",
457
+ " box-sizing: border-box;\n",
458
+ " text-align: center;\n",
459
+ "}\n",
460
+ "\n",
461
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
462
+ " /* Arrow on the left of the label */\n",
463
+ " content: \"▸\";\n",
464
+ " float: left;\n",
465
+ " margin-right: 0.25em;\n",
466
+ " color: var(--sklearn-color-icon);\n",
467
+ "}\n",
468
+ "\n",
469
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
470
+ " color: var(--sklearn-color-text);\n",
471
+ "}\n",
472
+ "\n",
473
+ "/* Toggleable content - dropdown */\n",
474
+ "\n",
475
+ "#sk-container-id-1 div.sk-toggleable__content {\n",
476
+ " max-height: 0;\n",
477
+ " max-width: 0;\n",
478
+ " overflow: hidden;\n",
479
+ " text-align: left;\n",
480
+ " /* unfitted */\n",
481
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
482
+ "}\n",
483
+ "\n",
484
+ "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
485
+ " /* fitted */\n",
486
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
487
+ "}\n",
488
+ "\n",
489
+ "#sk-container-id-1 div.sk-toggleable__content pre {\n",
490
+ " margin: 0.2em;\n",
491
+ " border-radius: 0.25em;\n",
492
+ " color: var(--sklearn-color-text);\n",
493
+ " /* unfitted */\n",
494
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
495
+ "}\n",
496
+ "\n",
497
+ "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
498
+ " /* unfitted */\n",
499
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
500
+ "}\n",
501
+ "\n",
502
+ "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
503
+ " /* Expand drop-down */\n",
504
+ " max-height: 200px;\n",
505
+ " max-width: 100%;\n",
506
+ " overflow: auto;\n",
507
+ "}\n",
508
+ "\n",
509
+ "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
510
+ " content: \"▾\";\n",
511
+ "}\n",
512
+ "\n",
513
+ "/* Pipeline/ColumnTransformer-specific style */\n",
514
+ "\n",
515
+ "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
516
+ " color: var(--sklearn-color-text);\n",
517
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
518
+ "}\n",
519
+ "\n",
520
+ "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
521
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
522
+ "}\n",
523
+ "\n",
524
+ "/* Estimator-specific style */\n",
525
+ "\n",
526
+ "/* Colorize estimator box */\n",
527
+ "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
528
+ " /* unfitted */\n",
529
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
530
+ "}\n",
531
+ "\n",
532
+ "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
533
+ " /* fitted */\n",
534
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
535
+ "}\n",
536
+ "\n",
537
+ "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
538
+ "#sk-container-id-1 div.sk-label label {\n",
539
+ " /* The background is the default theme color */\n",
540
+ " color: var(--sklearn-color-text-on-default-background);\n",
541
+ "}\n",
542
+ "\n",
543
+ "/* On hover, darken the color of the background */\n",
544
+ "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
545
+ " color: var(--sklearn-color-text);\n",
546
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
547
+ "}\n",
548
+ "\n",
549
+ "/* Label box, darken color on hover, fitted */\n",
550
+ "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
551
+ " color: var(--sklearn-color-text);\n",
552
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
553
+ "}\n",
554
+ "\n",
555
+ "/* Estimator label */\n",
556
+ "\n",
557
+ "#sk-container-id-1 div.sk-label label {\n",
558
+ " font-family: monospace;\n",
559
+ " font-weight: bold;\n",
560
+ " display: inline-block;\n",
561
+ " line-height: 1.2em;\n",
562
+ "}\n",
563
+ "\n",
564
+ "#sk-container-id-1 div.sk-label-container {\n",
565
+ " text-align: center;\n",
566
+ "}\n",
567
+ "\n",
568
+ "/* Estimator-specific */\n",
569
+ "#sk-container-id-1 div.sk-estimator {\n",
570
+ " font-family: monospace;\n",
571
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
572
+ " border-radius: 0.25em;\n",
573
+ " box-sizing: border-box;\n",
574
+ " margin-bottom: 0.5em;\n",
575
+ " /* unfitted */\n",
576
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
577
+ "}\n",
578
+ "\n",
579
+ "#sk-container-id-1 div.sk-estimator.fitted {\n",
580
+ " /* fitted */\n",
581
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
582
+ "}\n",
583
+ "\n",
584
+ "/* on hover */\n",
585
+ "#sk-container-id-1 div.sk-estimator:hover {\n",
586
+ " /* unfitted */\n",
587
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
588
+ "}\n",
589
+ "\n",
590
+ "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
591
+ " /* fitted */\n",
592
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
593
+ "}\n",
594
+ "\n",
595
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
596
+ "\n",
597
+ "/* Common style for \"i\" and \"?\" */\n",
598
+ "\n",
599
+ ".sk-estimator-doc-link,\n",
600
+ "a:link.sk-estimator-doc-link,\n",
601
+ "a:visited.sk-estimator-doc-link {\n",
602
+ " float: right;\n",
603
+ " font-size: smaller;\n",
604
+ " line-height: 1em;\n",
605
+ " font-family: monospace;\n",
606
+ " background-color: var(--sklearn-color-background);\n",
607
+ " border-radius: 1em;\n",
608
+ " height: 1em;\n",
609
+ " width: 1em;\n",
610
+ " text-decoration: none !important;\n",
611
+ " margin-left: 1ex;\n",
612
+ " /* unfitted */\n",
613
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
614
+ " color: var(--sklearn-color-unfitted-level-1);\n",
615
+ "}\n",
616
+ "\n",
617
+ ".sk-estimator-doc-link.fitted,\n",
618
+ "a:link.sk-estimator-doc-link.fitted,\n",
619
+ "a:visited.sk-estimator-doc-link.fitted {\n",
620
+ " /* fitted */\n",
621
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
622
+ " color: var(--sklearn-color-fitted-level-1);\n",
623
+ "}\n",
624
+ "\n",
625
+ "/* On hover */\n",
626
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
627
+ ".sk-estimator-doc-link:hover,\n",
628
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
629
+ ".sk-estimator-doc-link:hover {\n",
630
+ " /* unfitted */\n",
631
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
632
+ " color: var(--sklearn-color-background);\n",
633
+ " text-decoration: none;\n",
634
+ "}\n",
635
+ "\n",
636
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
637
+ ".sk-estimator-doc-link.fitted:hover,\n",
638
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
639
+ ".sk-estimator-doc-link.fitted:hover {\n",
640
+ " /* fitted */\n",
641
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
642
+ " color: var(--sklearn-color-background);\n",
643
+ " text-decoration: none;\n",
644
+ "}\n",
645
+ "\n",
646
+ "/* Span, style for the box shown on hovering the info icon */\n",
647
+ ".sk-estimator-doc-link span {\n",
648
+ " display: none;\n",
649
+ " z-index: 9999;\n",
650
+ " position: relative;\n",
651
+ " font-weight: normal;\n",
652
+ " right: .2ex;\n",
653
+ " padding: .5ex;\n",
654
+ " margin: .5ex;\n",
655
+ " width: min-content;\n",
656
+ " min-width: 20ex;\n",
657
+ " max-width: 50ex;\n",
658
+ " color: var(--sklearn-color-text);\n",
659
+ " box-shadow: 2pt 2pt 4pt #999;\n",
660
+ " /* unfitted */\n",
661
+ " background: var(--sklearn-color-unfitted-level-0);\n",
662
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
663
+ "}\n",
664
+ "\n",
665
+ ".sk-estimator-doc-link.fitted span {\n",
666
+ " /* fitted */\n",
667
+ " background: var(--sklearn-color-fitted-level-0);\n",
668
+ " border: var(--sklearn-color-fitted-level-3);\n",
669
+ "}\n",
670
+ "\n",
671
+ ".sk-estimator-doc-link:hover span {\n",
672
+ " display: block;\n",
673
+ "}\n",
674
+ "\n",
675
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
676
+ "\n",
677
+ "#sk-container-id-1 a.estimator_doc_link {\n",
678
+ " float: right;\n",
679
+ " font-size: 1rem;\n",
680
+ " line-height: 1em;\n",
681
+ " font-family: monospace;\n",
682
+ " background-color: var(--sklearn-color-background);\n",
683
+ " border-radius: 1rem;\n",
684
+ " height: 1rem;\n",
685
+ " width: 1rem;\n",
686
+ " text-decoration: none;\n",
687
+ " /* unfitted */\n",
688
+ " color: var(--sklearn-color-unfitted-level-1);\n",
689
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
690
+ "}\n",
691
+ "\n",
692
+ "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
693
+ " /* fitted */\n",
694
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
695
+ " color: var(--sklearn-color-fitted-level-1);\n",
696
+ "}\n",
697
+ "\n",
698
+ "/* On hover */\n",
699
+ "#sk-container-id-1 a.estimator_doc_link:hover {\n",
700
+ " /* unfitted */\n",
701
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
702
+ " color: var(--sklearn-color-background);\n",
703
+ " text-decoration: none;\n",
704
+ "}\n",
705
+ "\n",
706
+ "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
707
+ " /* fitted */\n",
708
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
709
+ "}\n",
710
+ "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MultinomialNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;MultinomialNB<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.naive_bayes.MultinomialNB.html\">?<span>Documentation for MultinomialNB</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>MultinomialNB()</pre></div> </div></div></div></div>"
711
+ ],
712
+ "text/plain": [
713
+ "MultinomialNB()"
714
+ ]
715
+ },
716
+ "execution_count": 9,
717
+ "metadata": {},
718
+ "output_type": "execute_result"
719
+ }
720
+ ],
721
+ "source": [
722
+ "from sklearn.naive_bayes import MultinomialNB\n",
723
+ "from sklearn.ensemble import RandomForestClassifier\n",
724
+ "\n",
725
+ "spam_model = MultinomialNB()\n",
726
+ "spam_model.fit(X_train, y_train)"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": 10,
732
+ "id": "b122e57a-a425-4b08-9bcd-b5dcedcdbc69",
733
+ "metadata": {},
734
+ "outputs": [
735
+ {
736
+ "data": {
737
+ "text/plain": [
738
+ "96.55575014594278"
739
+ ]
740
+ },
741
+ "execution_count": 10,
742
+ "metadata": {},
743
+ "output_type": "execute_result"
744
+ }
745
+ ],
746
+ "source": [
747
+ "from sklearn.metrics import accuracy_score\n",
748
+ "\n",
749
+ "y_pred = spam_model.predict(X_test)\n",
750
+ "\n",
751
+ "accuracy_score(y_pred, y_test) * 100"
752
+ ]
753
+ },
754
+ {
755
+ "cell_type": "code",
756
+ "execution_count": 11,
757
+ "id": "19950e9e-0b0c-4956-8351-96ec8cc2ac8c",
758
+ "metadata": {},
759
+ "outputs": [
760
+ {
761
+ "data": {
762
+ "text/plain": [
763
+ "array([[ 236, 18],\n",
764
+ " [ 41, 1418]], dtype=int64)"
765
+ ]
766
+ },
767
+ "execution_count": 11,
768
+ "metadata": {},
769
+ "output_type": "execute_result"
770
+ }
771
+ ],
772
+ "source": [
773
+ "from sklearn.metrics import confusion_matrix\n",
774
+ "\n",
775
+ "confusion_m = confusion_matrix(y_test, y_pred)\n",
776
+ "confusion_m"
777
+ ]
778
+ },
779
+ {
780
+ "cell_type": "markdown",
781
+ "id": "72f29abb-dc37-4921-ac5c-4a40c51ac51e",
782
+ "metadata": {},
783
+ "source": [
784
+ "# Final Model"
785
+ ]
786
+ },
787
+ {
788
+ "cell_type": "code",
789
+ "execution_count": 12,
790
+ "id": "586531ca-0473-4a52-9037-5d386ab1eda4",
791
+ "metadata": {},
792
+ "outputs": [
793
+ {
794
+ "data": {
795
+ "text/html": [
796
+ "<style>#sk-container-id-2 {\n",
797
+ " /* Definition of color scheme common for light and dark mode */\n",
798
+ " --sklearn-color-text: black;\n",
799
+ " --sklearn-color-line: gray;\n",
800
+ " /* Definition of color scheme for unfitted estimators */\n",
801
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
802
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
803
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
804
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
805
+ " /* Definition of color scheme for fitted estimators */\n",
806
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
807
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
808
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
809
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
810
+ "\n",
811
+ " /* Specific color for light theme */\n",
812
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
813
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
814
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
815
+ " --sklearn-color-icon: #696969;\n",
816
+ "\n",
817
+ " @media (prefers-color-scheme: dark) {\n",
818
+ " /* Redefinition of color scheme for dark theme */\n",
819
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
820
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
821
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
822
+ " --sklearn-color-icon: #878787;\n",
823
+ " }\n",
824
+ "}\n",
825
+ "\n",
826
+ "#sk-container-id-2 {\n",
827
+ " color: var(--sklearn-color-text);\n",
828
+ "}\n",
829
+ "\n",
830
+ "#sk-container-id-2 pre {\n",
831
+ " padding: 0;\n",
832
+ "}\n",
833
+ "\n",
834
+ "#sk-container-id-2 input.sk-hidden--visually {\n",
835
+ " border: 0;\n",
836
+ " clip: rect(1px 1px 1px 1px);\n",
837
+ " clip: rect(1px, 1px, 1px, 1px);\n",
838
+ " height: 1px;\n",
839
+ " margin: -1px;\n",
840
+ " overflow: hidden;\n",
841
+ " padding: 0;\n",
842
+ " position: absolute;\n",
843
+ " width: 1px;\n",
844
+ "}\n",
845
+ "\n",
846
+ "#sk-container-id-2 div.sk-dashed-wrapped {\n",
847
+ " border: 1px dashed var(--sklearn-color-line);\n",
848
+ " margin: 0 0.4em 0.5em 0.4em;\n",
849
+ " box-sizing: border-box;\n",
850
+ " padding-bottom: 0.4em;\n",
851
+ " background-color: var(--sklearn-color-background);\n",
852
+ "}\n",
853
+ "\n",
854
+ "#sk-container-id-2 div.sk-container {\n",
855
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
856
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
857
+ " so we also need the `!important` here to be able to override the\n",
858
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
859
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
860
+ " display: inline-block !important;\n",
861
+ " position: relative;\n",
862
+ "}\n",
863
+ "\n",
864
+ "#sk-container-id-2 div.sk-text-repr-fallback {\n",
865
+ " display: none;\n",
866
+ "}\n",
867
+ "\n",
868
+ "div.sk-parallel-item,\n",
869
+ "div.sk-serial,\n",
870
+ "div.sk-item {\n",
871
+ " /* draw centered vertical line to link estimators */\n",
872
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
873
+ " background-size: 2px 100%;\n",
874
+ " background-repeat: no-repeat;\n",
875
+ " background-position: center center;\n",
876
+ "}\n",
877
+ "\n",
878
+ "/* Parallel-specific style estimator block */\n",
879
+ "\n",
880
+ "#sk-container-id-2 div.sk-parallel-item::after {\n",
881
+ " content: \"\";\n",
882
+ " width: 100%;\n",
883
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
884
+ " flex-grow: 1;\n",
885
+ "}\n",
886
+ "\n",
887
+ "#sk-container-id-2 div.sk-parallel {\n",
888
+ " display: flex;\n",
889
+ " align-items: stretch;\n",
890
+ " justify-content: center;\n",
891
+ " background-color: var(--sklearn-color-background);\n",
892
+ " position: relative;\n",
893
+ "}\n",
894
+ "\n",
895
+ "#sk-container-id-2 div.sk-parallel-item {\n",
896
+ " display: flex;\n",
897
+ " flex-direction: column;\n",
898
+ "}\n",
899
+ "\n",
900
+ "#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
901
+ " align-self: flex-end;\n",
902
+ " width: 50%;\n",
903
+ "}\n",
904
+ "\n",
905
+ "#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
906
+ " align-self: flex-start;\n",
907
+ " width: 50%;\n",
908
+ "}\n",
909
+ "\n",
910
+ "#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
911
+ " width: 0;\n",
912
+ "}\n",
913
+ "\n",
914
+ "/* Serial-specific style estimator block */\n",
915
+ "\n",
916
+ "#sk-container-id-2 div.sk-serial {\n",
917
+ " display: flex;\n",
918
+ " flex-direction: column;\n",
919
+ " align-items: center;\n",
920
+ " background-color: var(--sklearn-color-background);\n",
921
+ " padding-right: 1em;\n",
922
+ " padding-left: 1em;\n",
923
+ "}\n",
924
+ "\n",
925
+ "\n",
926
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
927
+ "clickable and can be expanded/collapsed.\n",
928
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
929
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
930
+ "*/\n",
931
+ "\n",
932
+ "/* Pipeline and ColumnTransformer style (default) */\n",
933
+ "\n",
934
+ "#sk-container-id-2 div.sk-toggleable {\n",
935
+ " /* Default theme specific background. It is overwritten whether we have a\n",
936
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
937
+ " background-color: var(--sklearn-color-background);\n",
938
+ "}\n",
939
+ "\n",
940
+ "/* Toggleable label */\n",
941
+ "#sk-container-id-2 label.sk-toggleable__label {\n",
942
+ " cursor: pointer;\n",
943
+ " display: block;\n",
944
+ " width: 100%;\n",
945
+ " margin-bottom: 0;\n",
946
+ " padding: 0.5em;\n",
947
+ " box-sizing: border-box;\n",
948
+ " text-align: center;\n",
949
+ "}\n",
950
+ "\n",
951
+ "#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
952
+ " /* Arrow on the left of the label */\n",
953
+ " content: \"▸\";\n",
954
+ " float: left;\n",
955
+ " margin-right: 0.25em;\n",
956
+ " color: var(--sklearn-color-icon);\n",
957
+ "}\n",
958
+ "\n",
959
+ "#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
960
+ " color: var(--sklearn-color-text);\n",
961
+ "}\n",
962
+ "\n",
963
+ "/* Toggleable content - dropdown */\n",
964
+ "\n",
965
+ "#sk-container-id-2 div.sk-toggleable__content {\n",
966
+ " max-height: 0;\n",
967
+ " max-width: 0;\n",
968
+ " overflow: hidden;\n",
969
+ " text-align: left;\n",
970
+ " /* unfitted */\n",
971
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
972
+ "}\n",
973
+ "\n",
974
+ "#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
975
+ " /* fitted */\n",
976
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
977
+ "}\n",
978
+ "\n",
979
+ "#sk-container-id-2 div.sk-toggleable__content pre {\n",
980
+ " margin: 0.2em;\n",
981
+ " border-radius: 0.25em;\n",
982
+ " color: var(--sklearn-color-text);\n",
983
+ " /* unfitted */\n",
984
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
985
+ "}\n",
986
+ "\n",
987
+ "#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
988
+ " /* unfitted */\n",
989
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
990
+ "}\n",
991
+ "\n",
992
+ "#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
993
+ " /* Expand drop-down */\n",
994
+ " max-height: 200px;\n",
995
+ " max-width: 100%;\n",
996
+ " overflow: auto;\n",
997
+ "}\n",
998
+ "\n",
999
+ "#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
1000
+ " content: \"▾\";\n",
1001
+ "}\n",
1002
+ "\n",
1003
+ "/* Pipeline/ColumnTransformer-specific style */\n",
1004
+ "\n",
1005
+ "#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1006
+ " color: var(--sklearn-color-text);\n",
1007
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1008
+ "}\n",
1009
+ "\n",
1010
+ "#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1011
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1012
+ "}\n",
1013
+ "\n",
1014
+ "/* Estimator-specific style */\n",
1015
+ "\n",
1016
+ "/* Colorize estimator box */\n",
1017
+ "#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1018
+ " /* unfitted */\n",
1019
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1020
+ "}\n",
1021
+ "\n",
1022
+ "#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1023
+ " /* fitted */\n",
1024
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1025
+ "}\n",
1026
+ "\n",
1027
+ "#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
1028
+ "#sk-container-id-2 div.sk-label label {\n",
1029
+ " /* The background is the default theme color */\n",
1030
+ " color: var(--sklearn-color-text-on-default-background);\n",
1031
+ "}\n",
1032
+ "\n",
1033
+ "/* On hover, darken the color of the background */\n",
1034
+ "#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
1035
+ " color: var(--sklearn-color-text);\n",
1036
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1037
+ "}\n",
1038
+ "\n",
1039
+ "/* Label box, darken color on hover, fitted */\n",
1040
+ "#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
1041
+ " color: var(--sklearn-color-text);\n",
1042
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1043
+ "}\n",
1044
+ "\n",
1045
+ "/* Estimator label */\n",
1046
+ "\n",
1047
+ "#sk-container-id-2 div.sk-label label {\n",
1048
+ " font-family: monospace;\n",
1049
+ " font-weight: bold;\n",
1050
+ " display: inline-block;\n",
1051
+ " line-height: 1.2em;\n",
1052
+ "}\n",
1053
+ "\n",
1054
+ "#sk-container-id-2 div.sk-label-container {\n",
1055
+ " text-align: center;\n",
1056
+ "}\n",
1057
+ "\n",
1058
+ "/* Estimator-specific */\n",
1059
+ "#sk-container-id-2 div.sk-estimator {\n",
1060
+ " font-family: monospace;\n",
1061
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
1062
+ " border-radius: 0.25em;\n",
1063
+ " box-sizing: border-box;\n",
1064
+ " margin-bottom: 0.5em;\n",
1065
+ " /* unfitted */\n",
1066
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
1067
+ "}\n",
1068
+ "\n",
1069
+ "#sk-container-id-2 div.sk-estimator.fitted {\n",
1070
+ " /* fitted */\n",
1071
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
1072
+ "}\n",
1073
+ "\n",
1074
+ "/* on hover */\n",
1075
+ "#sk-container-id-2 div.sk-estimator:hover {\n",
1076
+ " /* unfitted */\n",
1077
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
1078
+ "}\n",
1079
+ "\n",
1080
+ "#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
1081
+ " /* fitted */\n",
1082
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
1083
+ "}\n",
1084
+ "\n",
1085
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
1086
+ "\n",
1087
+ "/* Common style for \"i\" and \"?\" */\n",
1088
+ "\n",
1089
+ ".sk-estimator-doc-link,\n",
1090
+ "a:link.sk-estimator-doc-link,\n",
1091
+ "a:visited.sk-estimator-doc-link {\n",
1092
+ " float: right;\n",
1093
+ " font-size: smaller;\n",
1094
+ " line-height: 1em;\n",
1095
+ " font-family: monospace;\n",
1096
+ " background-color: var(--sklearn-color-background);\n",
1097
+ " border-radius: 1em;\n",
1098
+ " height: 1em;\n",
1099
+ " width: 1em;\n",
1100
+ " text-decoration: none !important;\n",
1101
+ " margin-left: 1ex;\n",
1102
+ " /* unfitted */\n",
1103
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1104
+ " color: var(--sklearn-color-unfitted-level-1);\n",
1105
+ "}\n",
1106
+ "\n",
1107
+ ".sk-estimator-doc-link.fitted,\n",
1108
+ "a:link.sk-estimator-doc-link.fitted,\n",
1109
+ "a:visited.sk-estimator-doc-link.fitted {\n",
1110
+ " /* fitted */\n",
1111
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1112
+ " color: var(--sklearn-color-fitted-level-1);\n",
1113
+ "}\n",
1114
+ "\n",
1115
+ "/* On hover */\n",
1116
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
1117
+ ".sk-estimator-doc-link:hover,\n",
1118
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
1119
+ ".sk-estimator-doc-link:hover {\n",
1120
+ " /* unfitted */\n",
1121
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1122
+ " color: var(--sklearn-color-background);\n",
1123
+ " text-decoration: none;\n",
1124
+ "}\n",
1125
+ "\n",
1126
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
1127
+ ".sk-estimator-doc-link.fitted:hover,\n",
1128
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
1129
+ ".sk-estimator-doc-link.fitted:hover {\n",
1130
+ " /* fitted */\n",
1131
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1132
+ " color: var(--sklearn-color-background);\n",
1133
+ " text-decoration: none;\n",
1134
+ "}\n",
1135
+ "\n",
1136
+ "/* Span, style for the box shown on hovering the info icon */\n",
1137
+ ".sk-estimator-doc-link span {\n",
1138
+ " display: none;\n",
1139
+ " z-index: 9999;\n",
1140
+ " position: relative;\n",
1141
+ " font-weight: normal;\n",
1142
+ " right: .2ex;\n",
1143
+ " padding: .5ex;\n",
1144
+ " margin: .5ex;\n",
1145
+ " width: min-content;\n",
1146
+ " min-width: 20ex;\n",
1147
+ " max-width: 50ex;\n",
1148
+ " color: var(--sklearn-color-text);\n",
1149
+ " box-shadow: 2pt 2pt 4pt #999;\n",
1150
+ " /* unfitted */\n",
1151
+ " background: var(--sklearn-color-unfitted-level-0);\n",
1152
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
1153
+ "}\n",
1154
+ "\n",
1155
+ ".sk-estimator-doc-link.fitted span {\n",
1156
+ " /* fitted */\n",
1157
+ " background: var(--sklearn-color-fitted-level-0);\n",
1158
+ " border: var(--sklearn-color-fitted-level-3);\n",
1159
+ "}\n",
1160
+ "\n",
1161
+ ".sk-estimator-doc-link:hover span {\n",
1162
+ " display: block;\n",
1163
+ "}\n",
1164
+ "\n",
1165
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
1166
+ "\n",
1167
+ "#sk-container-id-2 a.estimator_doc_link {\n",
1168
+ " float: right;\n",
1169
+ " font-size: 1rem;\n",
1170
+ " line-height: 1em;\n",
1171
+ " font-family: monospace;\n",
1172
+ " background-color: var(--sklearn-color-background);\n",
1173
+ " border-radius: 1rem;\n",
1174
+ " height: 1rem;\n",
1175
+ " width: 1rem;\n",
1176
+ " text-decoration: none;\n",
1177
+ " /* unfitted */\n",
1178
+ " color: var(--sklearn-color-unfitted-level-1);\n",
1179
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1180
+ "}\n",
1181
+ "\n",
1182
+ "#sk-container-id-2 a.estimator_doc_link.fitted {\n",
1183
+ " /* fitted */\n",
1184
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1185
+ " color: var(--sklearn-color-fitted-level-1);\n",
1186
+ "}\n",
1187
+ "\n",
1188
+ "/* On hover */\n",
1189
+ "#sk-container-id-2 a.estimator_doc_link:hover {\n",
1190
+ " /* unfitted */\n",
1191
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1192
+ " color: var(--sklearn-color-background);\n",
1193
+ " text-decoration: none;\n",
1194
+ "}\n",
1195
+ "\n",
1196
+ "#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
1197
+ " /* fitted */\n",
1198
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1199
+ "}\n",
1200
+ "</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MultinomialNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;MultinomialNB<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.naive_bayes.MultinomialNB.html\">?<span>Documentation for MultinomialNB</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>MultinomialNB()</pre></div> </div></div></div></div>"
1201
+ ],
1202
+ "text/plain": [
1203
+ "MultinomialNB()"
1204
+ ]
1205
+ },
1206
+ "execution_count": 12,
1207
+ "metadata": {},
1208
+ "output_type": "execute_result"
1209
+ }
1210
+ ],
1211
+ "source": [
1212
+ "final_model = MultinomialNB()\n",
1213
+ "final_model.fit(X, y)"
1214
+ ]
1215
+ },
1216
+ {
1217
+ "cell_type": "code",
1218
+ "execution_count": 13,
1219
+ "id": "da22026b-3303-40c2-bebf-afb47bbb3274",
1220
+ "metadata": {},
1221
+ "outputs": [
1222
+ {
1223
+ "data": {
1224
+ "text/plain": [
1225
+ "0.9684616283144493"
1226
+ ]
1227
+ },
1228
+ "execution_count": 13,
1229
+ "metadata": {},
1230
+ "output_type": "execute_result"
1231
+ }
1232
+ ],
1233
+ "source": [
1234
+ "pred = final_model.predict(X)\n",
1235
+ "accuracy_score(pred, y)"
1236
+ ]
1237
+ },
1238
+ {
1239
+ "cell_type": "markdown",
1240
+ "id": "157e41c7-e732-45b4-8075-49db21b7f852",
1241
+ "metadata": {},
1242
+ "source": [
1243
+ "# Pickling"
1244
+ ]
1245
+ },
1246
+ {
1247
+ "cell_type": "code",
1248
+ "execution_count": 14,
1249
+ "id": "a4fcca1e-159f-43d5-90ab-522365ff5328",
1250
+ "metadata": {},
1251
+ "outputs": [
1252
+ {
1253
+ "data": {
1254
+ "application/javascript": [
1255
+ "\n",
1256
+ " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd\\nimport pickle'); }\n",
1257
+ " "
1258
+ ],
1259
+ "text/plain": [
1260
+ "<IPython.core.display.Javascript object>"
1261
+ ]
1262
+ },
1263
+ "metadata": {},
1264
+ "output_type": "display_data"
1265
+ }
1266
+ ],
1267
+ "source": [
1268
+ "pickle.dump(cv, open('pickle_files/count_vectorizer.pkl', 'wb')) "
1269
+ ]
1270
+ },
1271
+ {
1272
+ "cell_type": "code",
1273
+ "execution_count": 15,
1274
+ "id": "114f0150-63ad-43c0-a90d-3450647c50ae",
1275
+ "metadata": {},
1276
+ "outputs": [
1277
+ {
1278
+ "data": {
1279
+ "application/javascript": [
1280
+ "\n",
1281
+ " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd\\nimport pickle'); }\n",
1282
+ " "
1283
+ ],
1284
+ "text/plain": [
1285
+ "<IPython.core.display.Javascript object>"
1286
+ ]
1287
+ },
1288
+ "metadata": {},
1289
+ "output_type": "display_data"
1290
+ }
1291
+ ],
1292
+ "source": [
1293
+ "pickle.dump(final_model, open('pickle_files/spam_model.pkl', 'wb')) "
1294
+ ]
1295
+ },
1296
+ {
1297
+ "cell_type": "code",
1298
+ "execution_count": null,
1299
+ "id": "eb2b40f9-aea0-44ae-9eec-b32c37f4831d",
1300
+ "metadata": {},
1301
+ "outputs": [],
1302
+ "source": []
1303
+ }
1304
+ ],
1305
+ "metadata": {
1306
+ "kernelspec": {
1307
+ "display_name": "Python 3 (ipykernel)",
1308
+ "language": "python",
1309
+ "name": "python3"
1310
+ },
1311
+ "language_info": {
1312
+ "codemirror_mode": {
1313
+ "name": "ipython",
1314
+ "version": 3
1315
+ },
1316
+ "file_extension": ".py",
1317
+ "mimetype": "text/x-python",
1318
+ "name": "python",
1319
+ "nbconvert_exporter": "python",
1320
+ "pygments_lexer": "ipython3",
1321
+ "version": "3.12.4"
1322
+ }
1323
+ },
1324
+ "nbformat": 4,
1325
+ "nbformat_minor": 5
1326
+ }