{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "VIGPNlN8sqqp" }, "outputs": [], "source": [ "%%capture\n", "!pip install transformers\n", "!pip install accelerate -U\n", "!pip install --upgrade tensorflow\n", "! pip install datasets\n", "! pip install huggingface_hub\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dqgUXWuwZvJQ" }, "outputs": [], "source": [ "from google.colab import drive\n", "\n", "##mounting Google Drive so that the dataset paths below exist on a fresh runtime\n", "drive.mount(\"/content/drive\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UFP95uHeTaGK" }, "outputs": [], "source": [ "import os\n", "\n", "##project folder inside the mounted Drive; it has to live under /content/drive/MyDrive,\n", "##which is the same location the Train.csv / Test.csv paths below point to\n", "mount_point = \"/content/drive/MyDrive/deep-learning\"\n", "os.makedirs(mount_point, exist_ok=True)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9IakUnxiu_1g" }, "outputs": [], "source": [ "%%capture\n", "\n", "##for data handling\n", "import pandas as pd\n", "import numpy as np\n", "\n", "##visualizations\n", "\n", "import matplotlib.pyplot as plt\n", "import plotly.express as px\n", "import seaborn as sns\n", "\n", "##NLP\n", "\n", "import nltk\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "from wordcloud import WordCloud, STOPWORDS\n", "import re,string, unicodedata\n", "from nltk.tokenize import word_tokenize\n", "from nltk.stem import WordNetLemmatizer\n", "from string import punctuation\n", "from nltk.corpus import wordnet\n", "from collections import Counter\n", "import string\n", "nltk.download('punkt')\n", "nltk.download('wordnet')\n", "nltk.download('maxent_ne_chunker')\n", "nltk.download(\"words\")\n", "import nltk\n", "from nltk.corpus import gutenberg\n", "from nltk import FreqDist\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9VZbCA6s9pzo" }, "outputs": [], "source": [ "train_path= '/content/drive/MyDrive/deep-learning/Train.csv'\n", "test_path= '/content/drive/MyDrive/deep-learning/Test.csv'" ] }, { "cell_type": "code",
"execution_count": null, "metadata": { "id": "8Frz5b19-VVO" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NMGZ7qwxWHQm", "outputId": "c2262d8e-2385-4568-8abf-13ae5ba422d0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 62.4 ms, sys: 19 ms, total: 81.4 ms\n", "Wall time: 545 ms\n" ] } ], "source": [ "\n", "%%time\n", "df_train= pd.read_csv(train_path)\n", "df_test= pd.read_csv(test_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "EOFVlu8obzQQ", "outputId": "24839376-571a-47d7-f24c-7b764f648f85" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idsafe_textlabelagreement
0CL1KWCMYMe & The Big Homie meanboy3000 #MEANBOY #M...0.01.0
1E3303EMEI'm 100% thinking of devoting my career to pro...1.01.0
2M4IVFSMS#whatcausesautism VACCINES, DO NOT VACCINATE Y...-1.01.0
31DR6ROZ4I mean if they immunize my kid with something ...-1.01.0
4J77ENIIEThanks to <user> Catch me performing at La Nui...0.01.0
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweet_id safe_text label \\\n", "0 CL1KWCMY Me & The Big Homie meanboy3000 #MEANBOY #M... 0.0 \n", "1 E3303EME I'm 100% thinking of devoting my career to pro... 1.0 \n", "2 M4IVFSMS #whatcausesautism VACCINES, DO NOT VACCINATE Y... -1.0 \n", "3 1DR6ROZ4 I mean if they immunize my kid with something ... -1.0 \n", "4 J77ENIIE Thanks to Catch me performing at La Nui... 0.0 \n", "\n", " agreement \n", "0 1.0 \n", "1 1.0 \n", "2 1.0 \n", "3 1.0 \n", "4 1.0 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##loading the dataset\n", "\n", "df_train.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "_PHz_nKab0BH", "outputId": "f0acb4ca-3167-4a8e-cc5f-00e2d20a7d9d" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idsafe_text
000BHHHP1<user> <user> ... &amp; 4 a vaccine given 2 he...
100UNMD0EStudents starting school without whooping coug...
201AXPTJFI'm kinda over every ep of <user> being \"rippe...
301HOEQJWHow many innocent children die for lack of vac...
401JUKMAOCDC eyeing bird flu vaccine for humans, though...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweet_id safe_text\n", "0 00BHHHP1 ... & 4 a vaccine given 2 he...\n", "1 00UNMD0E Students starting school without whooping coug...\n", "2 01AXPTJF I'm kinda over every ep of being \"rippe...\n", "3 01HOEQJW How many innocent children die for lack of vac...\n", "4 01JUKMAO CDC eyeing bird flu vaccine for humans, though..." ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "XwxCmyHPc3EM" }, "source": [ "## CRISP-DM Framework\n", "\n", "- Data Understanding\n", "- Data Preparation\n", "- Modelling\n", "- Evaluation\n", "- Deployment\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "W8XiDpLifCot" }, "outputs": [], "source": [ "##before starting, I will like to rename safe_text column to tweets\n", "\n", "\n", "df_train.rename(columns= {\"safe_text\": \"tweets\"}, inplace= True)\n", "df_test.rename(columns= {\"safe_text\": \"tweets\"}, inplace= True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 238 }, "id": "WlWyBoOPfGoW", "outputId": "b6ec1c05-1c8d-4f14-d5bd-5547d616df42" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idtweets
000BHHHP1<user> <user> ... &amp; 4 a vaccine given 2 he...
100UNMD0EStudents starting school without whooping coug...
201AXPTJFI'm kinda over every ep of <user> being \"rippe...
301HOEQJWHow many innocent children die for lack of vac...
401JUKMAOCDC eyeing bird flu vaccine for humans, though...
501V1X8XWI think that active duty soldiers should get v...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweet_id tweets\n", "0 00BHHHP1 ... & 4 a vaccine given 2 he...\n", "1 00UNMD0E Students starting school without whooping coug...\n", "2 01AXPTJF I'm kinda over every ep of being \"rippe...\n", "3 01HOEQJW How many innocent children die for lack of vac...\n", "4 01JUKMAO CDC eyeing bird flu vaccine for humans, though...\n", "5 01V1X8XW I think that active duty soldiers should get v..." ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.loc[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 238 }, "id": "82386zRCc3xx", "outputId": "e44c91be-47ad-4af6-d9c3-99f2cf19733c" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idtweetslabelagreement
0CL1KWCMYMe &amp; The Big Homie meanboy3000 #MEANBOY #M...0.01.000000
1E3303EMEI'm 100% thinking of devoting my career to pro...1.01.000000
2M4IVFSMS#whatcausesautism VACCINES, DO NOT VACCINATE Y...-1.01.000000
31DR6ROZ4I mean if they immunize my kid with something ...-1.01.000000
4J77ENIIEThanks to <user> Catch me performing at La Nui...0.01.000000
5OVNPOAUX<user> a nearly 67 year old study when mental ...1.00.666667
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweet_id tweets label \\\n", "0 CL1KWCMY Me & The Big Homie meanboy3000 #MEANBOY #M... 0.0 \n", "1 E3303EME I'm 100% thinking of devoting my career to pro... 1.0 \n", "2 M4IVFSMS #whatcausesautism VACCINES, DO NOT VACCINATE Y... -1.0 \n", "3 1DR6ROZ4 I mean if they immunize my kid with something ... -1.0 \n", "4 J77ENIIE Thanks to Catch me performing at La Nui... 0.0 \n", "5 OVNPOAUX a nearly 67 year old study when mental ... 1.0 \n", "\n", " agreement \n", "0 1.000000 \n", "1 1.000000 \n", "2 1.000000 \n", "3 1.000000 \n", "4 1.000000 \n", "5 0.666667 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.loc[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wtIglVGdjPkm" }, "outputs": [], "source": [ "##creating a copy\n", "\n", "train_data= df_train.copy()\n", "test_data= df_test.copy()" ] }, { "cell_type": "markdown", "metadata": { "id": "xZ7lILoEpfs9" }, "source": [ "# 1. Data Understanding\n", "\n", "Workflow:\n", "\n", "- info\n", "- Check for missing values\n", "- Check for duplicates\n", "- Plot some charts\n" ] }, { "cell_type": "markdown", "metadata": { "id": "0JpzEdOV4Xsr" }, "source": [ "## 1.1 Checking Info" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "u1tG3XSI4OTq", "outputId": "dfb84faa-14dd-4ccd-8fbe-a4b8c79f01cb" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 10001 entries, 0 to 10000\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 tweet_id 10001 non-null object \n", " 1 tweets 10001 non-null object \n", " 2 label 10000 non-null float64\n", " 3 agreement 9999 non-null float64\n", "dtypes: float64(2), object(2)\n", "memory usage: 312.7+ KB\n", "the info df_train dataset are: \n", "\n", " None \n", "\n", " 
------------------------------------------------------------\n", "\n", "RangeIndex: 5177 entries, 0 to 5176\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 tweet_id 5177 non-null object\n", " 1 tweets 5176 non-null object\n", "dtypes: object(2)\n", "memory usage: 81.0+ KB\n", "the info df_test dataset are: \n", "\n", " None \n", "\n", " ------------------------------------------------------------\n" ] } ], "source": [ "data=[df_train, df_test]\n", "names=[\"df_train\", \"df_test\"]\n", "\n", "##DataFrame.info() prints its report itself and returns None, so it is called as its own\n", "##statement; passing it to print() showed the header after the report plus a stray \"None\"\n", "for m, i in zip(data, names):\n", "    print(\"the info\", i, \"dataset are: \", \"\\n\")\n", "    m.info()\n", "    print(\"\\n\", \"---\"*20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "B_tK0QJQOG1c", "outputId": "8ff23f7b-0b52-40fa-a253-5906c25f0f95" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelagreement
count10000.0000009999.000000
mean0.3015670.854252
std0.6467180.180707
min-1.0000000.333333
25%0.0000000.666667
50%0.0000001.000000
75%1.0000001.000000
max1.0000001.000000
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " label agreement\n", "count 10000.000000 9999.000000\n", "mean 0.301567 0.854252\n", "std 0.646718 0.180707\n", "min -1.000000 0.333333\n", "25% 0.000000 0.666667\n", "50% 0.000000 1.000000\n", "75% 1.000000 1.000000\n", "max 1.000000 1.000000" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "vgwJp85uOMv4", "outputId": "c7b5ac45-1a45-4477-c0cc-e08a67b33108" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelagreement
count10000.0000009999.000000
mean0.3015670.854252
std0.6467180.180707
min-1.0000000.333333
25%0.0000000.666667
50%0.0000001.000000
75%1.0000001.000000
max1.0000001.000000
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " label agreement\n", "count 10000.000000 9999.000000\n", "mean 0.301567 0.854252\n", "std 0.646718 0.180707\n", "min -1.000000 0.333333\n", "25% 0.000000 0.666667\n", "50% 0.000000 1.000000\n", "75% 1.000000 1.000000\n", "max 1.000000 1.000000" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.describe()" ] }, { "cell_type": "markdown", "metadata": { "id": "srYRJy-Le7M6" }, "source": [ "## 1.2 Checking for Missing Values" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ElXduvXYil2W", "outputId": "30aa8dc0-b988-476d-dd73-649bbb76b1b1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "the missing values in the df_train dataset are: \n", "\n", " tweet_id 0\n", "tweets 0\n", "label 1\n", "agreement 2\n", "dtype: int64 \n", "\n", " ------------------------------------------------------------\n", "the missing values in the df_test dataset are: \n", "\n", " tweet_id 0\n", "tweets 1\n", "dtype: int64 \n", "\n", " ------------------------------------------------------------\n" ] } ], "source": [ "data=[df_train, df_test]\n", "names=[\"df_train\", \"df_test\"]\n", "\n", "\n", "\n", "for m, i in zip(data, names):\n", " print(f\"the missing values in the\", i,\"dataset are: \", \"\\n\\n\", m.isna().sum(), \"\\n\\n\", \"---\"*20 )" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 112 }, "id": "ED9iOg9oi1be", "outputId": "83429135-b086-424d-b80b-085a57da62c0" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idtweetslabelagreement
4798RQMQ0L2A#lawandorderSVUNaNNaN
4799I cannot believe in this day and age some pare...10.666667NaN
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweet_id tweets \\\n", "4798 RQMQ0L2A #lawandorderSVU \n", "4799 I cannot believe in this day and age some pare... 1 \n", "\n", " label agreement \n", "4798 NaN NaN \n", "4799 0.666667 NaN " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##we will see the missing values\n", "\n", "df_train[df_train[\"label\"].isna()]\n", "df_train[df_train[\"agreement\"].isna()]" ] }, { "cell_type": "markdown", "metadata": { "id": "uAfUoa_Ij_W-" }, "source": [ "#### Notes after checking for missing values:\n", "\n", "- We will drop them in the Data Preparation stage" ] }, { "cell_type": "markdown", "metadata": { "id": "OQydlhOfsxgI" }, "source": [ "## 1.3 Checking for Duplicates\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 49 }, "id": "4jfC-EFwo38c", "outputId": "7e9ed683-6fa4-46c1-f176-95c26ea62ba3" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idtweetslabelagreement
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ "Empty DataFrame\n", "Columns: [tweet_id, tweets, label, agreement]\n", "Index: []" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "df_train[df_train.duplicated()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 84 }, "id": "H0BMXrkPo6ZV", "outputId": "e2be7227-7202-4dd6-dac4-e60d10b23c35" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", " df_test[df_train.duplicated()]\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idtweets
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ "Empty DataFrame\n", "Columns: [tweet_id, tweets]\n", "Index: []" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##the duplicate mask has to be built from df_test itself: a df_train mask has a different\n", "##index, so pandas reindexes it and the test set is never actually checked\n", "df_test[df_test.duplicated()]" ] }, { "cell_type": "markdown", "metadata": { "id": "P_hvJqmHpDCI" }, "source": [ "### Notes after checking for duplicates\n", "\n", "\n", "\n", "- There were no duplicates" ] }, { "cell_type": "markdown", "metadata": { "id": "M0jDZv6qpIB5" }, "source": [ "## 1.4 Visualizations" ] }, { "cell_type": "markdown", "metadata": { "id": "Kwc8iY_Z1_1L" }, "source": [ "##### i. Checking Distribution of Sentiments" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 542 }, "id": "HLiDYsLTs72q", "outputId": "3f535bb4-8769-4610-8fb6-277cf5359a67" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", "\n", "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig_1=px.histogram(data_frame= df_train, x= \"label\", title= \"Distrubution of Sentiments\")\n", "fig_1.show()" ] }, { "cell_type": "markdown", "metadata": { "id": "jFmPib0BtUYE" }, "source": [ "Notes:\n", "\n", "- Most tweets were neutral\n", "- There were more positive tweets than negative ones\n", "- There is class imbalance\n", "- There is one unexpected value that does not belong to any of our predefined classes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5N8zYUZhu2B2", "outputId": "58bb319f-0e08-488f-f0ea-f4d362b24332" }, "outputs": [ { "data": { "text/plain": [ "array([ 0. , 1. , -1. , nan, 0.66666667])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##checking to find out the values of the random value:\n", "df_train.label.unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jcApoRoKCr1X" }, "outputs": [], "source": [ "postive= df_train[df_train[\"label\"]== 1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "psfbHTj1Cq51" }, "outputs": [], "source": [ "negative= df_train[df_train[\"label\"]== -1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "iQpj4kfyC4uL" }, "outputs": [], "source": [ "neutral= df_train[df_train[\"label\"]== 0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 677 }, "id": "GUcAtECEDCBX", "outputId": "9426e00b-ecab-475c-a0da-0ba151fe6675" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idtweetslabelagreement
2M4IVFSMS#whatcausesautism VACCINES, DO NOT VACCINATE Y...-1.01.000000
31DR6ROZ4I mean if they immunize my kid with something ...-1.01.000000
2489AB846O<user> #CDC lied and hid data that black boys ...-1.01.000000
25XSXFN1C8<user> vaccines causing autism-1.01.000000
356PMH7C56<user> <user> Other than that, his defense is ...-1.00.333333
47G1CJ54KD<user> I'm not vaccinating my kids lol-1.01.000000
58Y3OMTB1Q<user> yeah. I'll just stick to my regular vac...-1.00.666667
62ZCOLETM5CIA: No more vaccination campaigns in spy ops ...-1.00.666667
65E88B1XQJVaccine Brain Damage Cover Up Implodes: <url> ...-1.01.000000
8943MWGI00Centers for Disease Control: This Year’s Flu V...-1.00.666667
90TP0MIEXK<user> TY. Fought hard 2 NOT vaccinate my kids...-1.00.666667
10085B8L54L\"<user> Conservative Neurosurgeon Ben Carson S...-1.00.666667
1051TI13L1W<user> ok what's good u have to say about poli...-1.01.000000
108DZWTVPSHWho wants a shot of autism juice...I mean meas...-1.01.000000
118NSZDXB2J2/3 ...yet the only way to immunize him is to ...-1.00.666667
120L2TIWPQDNEW: Bexar Co. District Attorney Nico LaHood: ...-1.01.000000
128DOZBHCZ5<user> <user> and the vaccine will injure tens...-1.01.000000
1334KEP2GOMI honestly don't believe in immunization. \\r\\n...-1.01.000000
135C9QSY5LEReally? MMR Shots?-1.01.000000
138OA5RF3H5Pro safety doesn't make me an anti-vaxer. It ...-1.00.333333
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweet_id tweets label \\\n", "2 M4IVFSMS #whatcausesautism VACCINES, DO NOT VACCINATE Y... -1.0 \n", "3 1DR6ROZ4 I mean if they immunize my kid with something ... -1.0 \n", "24 89AB846O #CDC lied and hid data that black boys ... -1.0 \n", "25 XSXFN1C8 vaccines causing autism -1.0 \n", "35 6PMH7C56 Other than that, his defense is ... -1.0 \n", "47 G1CJ54KD I'm not vaccinating my kids lol -1.0 \n", "58 Y3OMTB1Q yeah. I'll just stick to my regular vac... -1.0 \n", "62 ZCOLETM5 CIA: No more vaccination campaigns in spy ops ... -1.0 \n", "65 E88B1XQJ Vaccine Brain Damage Cover Up Implodes: ... -1.0 \n", "89 43MWGI00 Centers for Disease Control: This Year’s Flu V... -1.0 \n", "90 TP0MIEXK TY. Fought hard 2 NOT vaccinate my kids... -1.0 \n", "100 85B8L54L \" Conservative Neurosurgeon Ben Carson S... -1.0 \n", "105 1TI13L1W ok what's good u have to say about poli... -1.0 \n", "108 DZWTVPSH Who wants a shot of autism juice...I mean meas... -1.0 \n", "118 NSZDXB2J 2/3 ...yet the only way to immunize him is to ... -1.0 \n", "120 L2TIWPQD NEW: Bexar Co. District Attorney Nico LaHood: ... -1.0 \n", "128 DOZBHCZ5 and the vaccine will injure tens... -1.0 \n", "133 4KEP2GOM I honestly don't believe in immunization. \\r\\n... -1.0 \n", "135 C9QSY5LE Really? MMR Shots? -1.0 \n", "138 OA5RF3H5 Pro safety doesn't make me an anti-vaxer. It ... 
-1.0 \n", "\n", " agreement \n", "2 1.000000 \n", "3 1.000000 \n", "24 1.000000 \n", "25 1.000000 \n", "35 0.333333 \n", "47 1.000000 \n", "58 0.666667 \n", "62 0.666667 \n", "65 1.000000 \n", "89 0.666667 \n", "90 0.666667 \n", "100 0.666667 \n", "105 1.000000 \n", "108 1.000000 \n", "118 0.666667 \n", "120 1.000000 \n", "128 1.000000 \n", "133 1.000000 \n", "135 1.000000 \n", "138 0.333333 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "negative.head(20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 53 }, "id": "jjIFvh4UDDIt", "outputId": "70e83377-862f-4e19-c4e5-fd2679b80c38" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" }, "text/plain": [ "\"Pro safety doesn't make me an anti-vaxer. It makes me an educated consumer #CDCwhistleblower #CDCfraud #MyKidMatters #VaccineInjuryIsReal\"" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "negative.loc[138][\"tweets\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jMPuOjjhDD1f" }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "id": "5Oowf-Fh1qgq" }, "source": [ "##### ii.Using a WordCloud to Visualize Frequent Words" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1LP2HLws1q1_" }, "outputs": [], "source": [ "##instantiating my Stopwords variable\n", "\n", "stopwords= STOPWORDS" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FWDu1ibi2L-B", "outputId": "8d79a23e-f9e1-4433-8a31-2941fc5b7232" }, "outputs": [ { "data": { "text/plain": [ "192" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(stopwords)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4HYdAeiI2MZT" }, 
"outputs": [], "source": [ "##initializing my Wordcloud library\n", "\n", "wc= WordCloud(background_color= \"black\",\n", "\n", " stopwords=stopwords,\n", "\n", " height= 600,\n", "\n", " width= 400\n", "\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "f-GxG8yF3iJe" }, "outputs": [], "source": [ "train_tweets= df_train[\"tweets\"].dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HawrS2_J3i0R" }, "outputs": [], "source": [ "##converting all textin various rows into a single string\n", "\n", "train_text= \" \".join(train_tweets)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 870 }, "id": "SPKrkanF6vK4", "outputId": "7345439c-c138-4c58-97d8-01f9f62ab97c" }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "wc_train= wc.generate(train_text)\n", "plt.figure(figsize= (15,10))\n", "plt.title(\"Most common Words in Covid-19 Tweets\")\n", "plt.imshow(wc_train)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "x23CyGBe6vdS" }, "source": [ "the most tweeted words are:\n", "\n", "- URL\n", "- Measles\n", "- Outbreak\n", "- User\n", "- Kid\n", "- AMP\n", "- Disneyland\n", "- Autism\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "HxmNycbMjVTC" }, "source": [ "### iii.Length of Tweets" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fTLp0tfKjVqV" }, "outputs": [], "source": [ "df_train[\"tweet_length\"]= [len(i.split(\" \")) for i in df_train[\"tweets\"]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 238 }, "id": "9Vkrj6OwjV_A", "outputId": "3583c159-b823-4d18-9bd3-f6937c05a645" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_idtweetslabelagreementtweet_length
0CL1KWCMYMe &amp; The Big Homie meanboy3000 #MEANBOY #M...0.01.00000015
1E3303EMEI'm 100% thinking of devoting my career to pro...1.01.00000025
2M4IVFSMS#whatcausesautism VACCINES, DO NOT VACCINATE Y...-1.01.0000007
31DR6ROZ4I mean if they immunize my kid with something ...-1.01.00000028
4J77ENIIEThanks to <user> Catch me performing at La Nui...0.01.00000020
5OVNPOAUX<user> a nearly 67 year old study when mental ...1.00.66666722
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweet_id tweets label \\\n", "0 CL1KWCMY Me & The Big Homie meanboy3000 #MEANBOY #M... 0.0 \n", "1 E3303EME I'm 100% thinking of devoting my career to pro... 1.0 \n", "2 M4IVFSMS #whatcausesautism VACCINES, DO NOT VACCINATE Y... -1.0 \n", "3 1DR6ROZ4 I mean if they immunize my kid with something ... -1.0 \n", "4 J77ENIIE Thanks to Catch me performing at La Nui... 0.0 \n", "5 OVNPOAUX a nearly 67 year old study when mental ... 1.0 \n", "\n", " agreement tweet_length \n", "0 1.000000 15 \n", "1 1.000000 25 \n", "2 1.000000 7 \n", "3 1.000000 28 \n", "4 1.000000 20 \n", "5 0.666667 22 " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.loc[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "467XroyznrrG", "outputId": "98e34cce-1a6d-45bf-de39-035885e9ad70" }, "outputs": [ { "data": { "text/plain": [ "33" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train[\"tweet_length\"].max()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nXCuWCnmjWWD" }, "outputs": [], "source": [ "##getting my tweet length\n", "tweet_len= df_train[\"tweet_length\"].value_counts().reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bIDdD9n2nTwd" }, "outputs": [], "source": [ "tweet_len.columns= [\"tweet_length\", \"count\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 542 }, "id": "9qXxxPbcnTW2", "outputId": "d43514e4-d0b1-4aa8-a426-e19817bbd675" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", "\n", "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "##visualizaing the length of my tweets\n", "\n", "fig_2= px.scatter(data_frame= tweet_len, x= \"tweet_length\", y= \"count\", size= \"count\", color= \"tweet_length\", title= \"Length of Tweets\")\n", "\n", "fig_2.show()" ] }, { "cell_type": "markdown", "metadata": { "id": "ozRBOW54ov7s" }, "source": [ "#### Notes:\n", "\n", "- The most common tweet length is 18 words" ] }, { "cell_type": "markdown", "metadata": { "id": "-idgRNHAvuha" }, "source": [ "# 2. Data Preparation\n", "\n", "Workflow:\n", "\n", "- Handle the issues we discovered during our exploration\n", "- Drop unnecessary columns\n", "- Perform NLP preprocessing steps" ] }, { "cell_type": "markdown", "metadata": { "id": "uC5dcx6dv9_v" }, "source": [ "## 2.1 Handling Issues Discovered During Exploration" ] }, { "cell_type": "markdown", "metadata": { "id": "7-AEFYPnzZgA" }, "source": [ "#### i. Missing values\n", "\n", "- Since there are only a few missing values, we will simply drop the affected rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "uACkCy2Czd-b" }, "outputs": [], "source": [ "df_train= df_train.dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DE2-IPoozuKk" }, "outputs": [], "source": [ "df_test= df_test.dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "D2AYHVyEzyA_", "outputId": "d6ea4a41-a9ff-4e28-e421-130afab175c5" }, "outputs": [ { "data": { "text/plain": [ "tweet_id 0\n", "tweets 0\n", "label 0\n", "agreement 0\n", "tweet_length 0\n", "dtype: int64" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WtfnmF8Zz0yu", "outputId": "c4b7aad6-8ddc-42bb-d0db-1d6b6dedfb2f" }, "outputs": [ { "data": {
"text/plain": [ "tweet_id 0\n", "tweets 0\n", "dtype: int64" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.isna().sum()" ] }, { "cell_type": "markdown", "metadata": { "id": "XYjQzbREz5g-" }, "source": [ "##### ii.Handling the abnormal value" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_viDcA3b0hhM", "outputId": "cc9fd6ef-6174-4f4d-ad64-13013de2b883" }, "outputs": [ { "data": { "text/plain": [ "array([ 0., 1., -1.])" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train[\"label\"].unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "y1iw1_LX0lFM" }, "outputs": [], "source": [ "##since it has been handled, we will proceed to the next step" ] }, { "cell_type": "markdown", "metadata": { "id": "bCaE520a0ult" }, "source": [ "## 2.2 Dropping Unecesary Columns\n", "\n", "- I will be dropping the tweet ID column since it holds no inherent value" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "krUGPFyl0vZ3" }, "outputs": [], "source": [ "df_train= df_train.drop(\"tweet_id\", axis=1)\n", "df_test= df_test.drop(\"tweet_id\", axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "w7fC8uOB0vxl", "outputId": "3be34bbe-31fa-46cc-f246-c0216b78a1fe" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweetslabelagreementtweet_length
0Me &amp; The Big Homie meanboy3000 #MEANBOY #M...0.01.015
1I'm 100% thinking of devoting my career to pro...1.01.025
2#whatcausesautism VACCINES, DO NOT VACCINATE Y...-1.01.07
3I mean if they immunize my kid with something ...-1.01.028
4Thanks to <user> Catch me performing at La Nui...0.01.020
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweets label agreement \\\n", "0 Me & The Big Homie meanboy3000 #MEANBOY #M... 0.0 1.0 \n", "1 I'm 100% thinking of devoting my career to pro... 1.0 1.0 \n", "2 #whatcausesautism VACCINES, DO NOT VACCINATE Y... -1.0 1.0 \n", "3 I mean if they immunize my kid with something ... -1.0 1.0 \n", "4 Thanks to Catch me performing at La Nui... 0.0 1.0 \n", "\n", " tweet_length \n", "0 15 \n", "1 25 \n", "2 7 \n", "3 28 \n", "4 20 " ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "iRymZC8R0wGo", "outputId": "ac2ce5ca-3f0d-46b7-9ccd-1f3068e04823" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweets
0<user> <user> ... &amp; 4 a vaccine given 2 he...
1Students starting school without whooping coug...
2I'm kinda over every ep of <user> being \"rippe...
3How many innocent children die for lack of vac...
4CDC eyeing bird flu vaccine for humans, though...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweets\n", "0 ... & 4 a vaccine given 2 he...\n", "1 Students starting school without whooping coug...\n", "2 I'm kinda over every ep of being \"rippe...\n", "3 How many innocent children die for lack of vac...\n", "4 CDC eyeing bird flu vaccine for humans, though..." ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "I6jDwwDUub_9" }, "source": [ "## 2.3 NLP Preprocessing Steps" ] }, { "cell_type": "markdown", "metadata": { "id": "spnaHCG1wGuO" }, "source": [ "### i. Converting Everything to Lowercase" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5w0btf_iv3Vk" }, "outputs": [], "source": [ "df_train[\"tweets\"]= df_train[\"tweets\"].str.lower()\n", "df_test[\"tweets\"]= df_test[\"tweets\"].str.lower()" ] }, { "cell_type": "markdown", "metadata": { "id": "EJEZkf2KwAG7" }, "source": [ "### ii. Removing Punctuation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "p_JNDC4HwaXi" }, "outputs": [], "source": [ "##using regex to strip hashtags and punctuation (matches are replaced with an empty string, so joined words merge)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MD3uS9uElHfh" }, "outputs": [], "source": [ "##removing hashtags:\n", "\n", "def punctuation_remover(text):\n", " ##removing hashtags\n", " text= re.sub(r'#\\w+', '', text)\n", " ##removing punctuation\n", " text= re.sub(\"[^\\w\\s]\", repl= \"\", string=text)\n", " return text\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2GgFzHO6zsHx" }, "outputs": [], "source": [ "##creating a new clean_tweet column\n", "df_train[\"clean_tweet\"]= df_train[\"tweets\"].apply(punctuation_remover)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ToevcCyKz3Sg" }, "outputs": [], "source": [ "df_test[\"clean_tweet\"]= df_test[\"tweets\"].apply(punctuation_remover)" ] }, { "cell_type": "code", "execution_count": null,
"metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 554 }, "id": "CBVqPeBYz1wK", "outputId": "44f4e7cc-9070-40cb-c17c-482d21771014" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweetslabelagreementtweet_lengthclean_tweet
0me &amp; the big homie meanboy3000 #meanboy #m...0.01.00000015me amp the big homie meanboy3000 stegman...
1i'm 100% thinking of devoting my career to pro...1.01.00000025im 100 thinking of devoting my career to provi...
2#whatcausesautism vaccines, do not vaccinate y...-1.01.0000007vaccines do not vaccinate your child
3i mean if they immunize my kid with something ...-1.01.00000028i mean if they immunize my kid with something ...
4thanks to <user> catch me performing at la nui...0.01.00000020thanks to user catch me performing at la nuit ...
..................
96“<user> people who complain live longer. relea...0.01.00000019user people who complain live longer releasing...
97austerity is not a vaccine to crisis... it is ...0.01.00000020austerity is not a vaccine to crisis it is a p...
98“<user> compensation for autism brain damage a...0.00.66666715user compensation for autism brain damage and ...
99is it that i seek to bear w/ them? is it that ...0.00.66666724is it that i seek to bear w them is it that i ...
100\"<user> conservative neurosurgeon ben carson s...-1.00.66666720user conservative neurosurgeon ben carson says...
\n", "

101 rows × 5 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweets label agreement \\\n", "0 me & the big homie meanboy3000 #meanboy #m... 0.0 1.000000 \n", "1 i'm 100% thinking of devoting my career to pro... 1.0 1.000000 \n", "2 #whatcausesautism vaccines, do not vaccinate y... -1.0 1.000000 \n", "3 i mean if they immunize my kid with something ... -1.0 1.000000 \n", "4 thanks to catch me performing at la nui... 0.0 1.000000 \n", ".. ... ... ... \n", "96 “ people who complain live longer. relea... 0.0 1.000000 \n", "97 austerity is not a vaccine to crisis... it is ... 0.0 1.000000 \n", "98 “ compensation for autism brain damage a... 0.0 0.666667 \n", "99 is it that i seek to bear w/ them? is it that ... 0.0 0.666667 \n", "100 \" conservative neurosurgeon ben carson s... -1.0 0.666667 \n", "\n", " tweet_length clean_tweet \n", "0 15 me amp the big homie meanboy3000 stegman... \n", "1 25 im 100 thinking of devoting my career to provi... \n", "2 7 vaccines do not vaccinate your child \n", "3 28 i mean if they immunize my kid with something ... \n", "4 20 thanks to user catch me performing at la nuit ... \n", ".. ... ... \n", "96 19 user people who complain live longer releasing... \n", "97 20 austerity is not a vaccine to crisis it is a p... \n", "98 15 user compensation for autism brain damage and ... \n", "99 24 is it that i seek to bear w them is it that i ... \n", "100 20 user conservative neurosurgeon ben carson says... \n", "\n", "[101 rows x 5 columns]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.loc[:100]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 551 }, "id": "9TiVpEzLz8Zv", "outputId": "20f701f1-83c3-4656-8f98-571a6660f4e9" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweetsclean_tweet
10<user> : i have built up immunity to those di...user i have built up immunity to those disea...
11<user> <user> <user> study of 1.3 million kids...user user user study of 13 million kids reveal...
12vaccines :-0 (@ cherokee county health departm...vaccines 0 cherokee county health department url
13<user> are you sure you want to come back to a...user are you sure you want to come back to a m...
14oh well <user> an 18-month-old who had not bee...oh well user an 18monthold who had not been va...
15kcmo health depart:if you want a nasal flu vac...kcmo health departif you want a nasal flu vacc...
16a stipulation on jay's contract should be that...a stipulation on jays contract should be that ...
17if you do not vaccinate your children, let me ...if you do not vaccinate your children let me k...
18currently at the health department waiting for...currently at the health department waiting for...
19<user> <user> <user> <user> <user> and again, ...user user user user user and again they do hav...
20disney parks-linked measles outbreak grows to ...disney parkslinked measles outbreak grows to 7...
21#hatemondays#fml#immunization#health#mmr#vacci...nyc department of education url
22this made me think of you. <user> “<user> seat...this made me think of you user user seattle ki...
23this is how infectious diseases start, thx for...this is how infectious diseases start thx for ...
248 out of 12 patents admitted children develope...8 out of 12 patents admitted children develope...
25almighty jesus, nobody gave me nothing but the...almighty jesus nobody gave me nothing but the ...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweets \\\n", "10 : i have built up immunity to those di... \n", "11 study of 1.3 million kids... \n", "12 vaccines :-0 (@ cherokee county health departm... \n", "13 are you sure you want to come back to a... \n", "14 oh well an 18-month-old who had not bee... \n", "15 kcmo health depart:if you want a nasal flu vac... \n", "16 a stipulation on jay's contract should be that... \n", "17 if you do not vaccinate your children, let me ... \n", "18 currently at the health department waiting for... \n", "19 and again, ... \n", "20 disney parks-linked measles outbreak grows to ... \n", "21 #hatemondays#fml#immunization#health#mmr#vacci... \n", "22 this made me think of you. seat... \n", "23 this is how infectious diseases start, thx for... \n", "24 8 out of 12 patents admitted children develope... \n", "25 almighty jesus, nobody gave me nothing but the... \n", "\n", " clean_tweet \n", "10 user i have built up immunity to those disea... \n", "11 user user user study of 13 million kids reveal... \n", "12 vaccines 0 cherokee county health department url \n", "13 user are you sure you want to come back to a m... \n", "14 oh well user an 18monthold who had not been va... \n", "15 kcmo health departif you want a nasal flu vacc... \n", "16 a stipulation on jays contract should be that ... \n", "17 if you do not vaccinate your children let me k... \n", "18 currently at the health department waiting for... \n", "19 user user user user user and again they do hav... \n", "20 disney parkslinked measles outbreak grows to 7... \n", "21 nyc department of education url \n", "22 this made me think of you user user seattle ki... \n", "23 this is how infectious diseases start thx for ... \n", "24 8 out of 12 patents admitted children develope... \n", "25 almighty jesus nobody gave me nothing but the ... 
" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.loc[10:25]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Y0QnGEt4k8OZ" }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "id": "nz5Nr7nT1v_E" }, "source": [ "## iii.Removing Stop words" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 174 }, "id": "YkcZrhGB17Gs", "outputId": "ac8d7392-8834-4b0b-a397-78b373cd7bb0" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" }, "text/plain": [ "\"on i why's such been during me we've no have himself your just not she'll since when against both how out do for it her he same com did very over own www couldn't she's then further there here's does therefore again these http yourselves can't through of i'd where's a as were theirs only with could when's some you'll more you're hadn't itself k yourself before are few can those had doing here she that's we're how's his who he's wouldn't aren't below each once i'm like their won't between whom who's they'd about they've most themselves our you should off the weren't what's down they'll hers she'd be don't until why also by he'll an shouldn't you've we'll under so because yours than into is isn't too from he'd him nor we you'd hence i'll doesn't get that what myself in there's else having they if let's my being wasn't however was after but them didn't has ought shan't this cannot all hasn't above am we'd any otherwise while herself mustn't they're at and haven't ours up ourselves where would other or it's its shall i've r ever which to\"" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##viewing my stopwrods\n", "\" \".join(STOPWORDS)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9DKes9k719LM" }, "outputs": [], "source": [ "def 
remove_stopwords(text):\n", " return \" \".join([word for word in text.split() if word not in stopwords])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vJAubWhc3Lrl" }, "outputs": [], "source": [ "df_train[\"clean_tweet\"]= df_train[\"clean_tweet\"].apply(lambda x: remove_stopwords(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 473 }, "id": "LHeP0c9y4cht", "outputId": "2b780307-5ca7-467b-c148-8d8ab8b5ed25" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweetslabelagreementtweet_lengthclean_tweet
10<user> @ this point i have 2 text, butw/bon jo...0.01.00000025user point 2 text butwbon jovi cover playin al...
11my prediction, vaccine exemption in arizona wi...0.00.66666718prediction vaccine exemption arizona will end ...
12getting my vaccines ! #china #nervous #moving ...1.01.00000016getting vaccines cheryl southern nevada health...
131$mug noche <user> #mmr #mixmasterrod #dcdj #m...0.01.000000131mug noche user mad hatter url
14got my influenza vaccine! (@ purdue university...1.00.66666713got influenza vaccine purdue university studen...
15sb121 [enroll] meningococcal disease-pupils to...0.00.66666712sb121 enroll meningococcal diseasepupils immun...
16increasing number of parents skip vaccinations...0.01.00000012increasing number parents skip vaccinations ch...
17<user> thank you for standing with ca parents ...1.01.00000016user thank standing ca parents children suppor...
18dude idc if disney land has the measles, that ...0.01.00000020dude idc disney land measles means shorter lin...
19beeftalk: start your calf vaccinations now <ur...1.01.00000014beeftalk start calf vaccinations now url via u...
20i don't care what <user> says, you should prob...1.01.00000013dont care user says probably kids vaccinated
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweets label agreement \\\n", "10 @ this point i have 2 text, butw/bon jo... 0.0 1.000000 \n", "11 my prediction, vaccine exemption in arizona wi... 0.0 0.666667 \n", "12 getting my vaccines ! #china #nervous #moving ... 1.0 1.000000 \n", "13 1$mug noche #mmr #mixmasterrod #dcdj #m... 0.0 1.000000 \n", "14 got my influenza vaccine! (@ purdue university... 1.0 0.666667 \n", "15 sb121 [enroll] meningococcal disease-pupils to... 0.0 0.666667 \n", "16 increasing number of parents skip vaccinations... 0.0 1.000000 \n", "17 thank you for standing with ca parents ... 1.0 1.000000 \n", "18 dude idc if disney land has the measles, that ... 0.0 1.000000 \n", "19 beeftalk: start your calf vaccinations now says, you should prob... 1.0 1.000000 \n", "\n", " tweet_length clean_tweet \n", "10 25 user point 2 text butwbon jovi cover playin al... \n", "11 18 prediction vaccine exemption arizona will end ... \n", "12 16 getting vaccines cheryl southern nevada health... \n", "13 13 1mug noche user mad hatter url \n", "14 13 got influenza vaccine purdue university studen... \n", "15 12 sb121 enroll meningococcal diseasepupils immun... \n", "16 12 increasing number parents skip vaccinations ch... \n", "17 16 user thank standing ca parents children suppor... \n", "18 20 dude idc disney land measles means shorter lin... \n", "19 14 beeftalk start calf vaccinations now url via u... 
\n", "20 13 dont care user says probably kids vaccinated " ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.loc[10:20]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Us6KNIke4eFr" }, "outputs": [], "source": [ "df_test[\"clean_tweet\"]= df_test[\"clean_tweet\"].apply(lambda x: remove_stopwords(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "als7vgFf42aY", "outputId": "1c38cd02-d8ca-454e-cff1-813ed193fb0e" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweetsclean_tweet
900i wanna dip myself in a vat of purell after th...wanna dip vat purell doctors officethey safe r...
901division of public health launches community i...division public health launches community immu...
902she's a puss. haha scared of her vaccines. poo...shes puss haha scared vaccines poor baby spruc...
903researcher says cdc 'chose to cover up' data l...researcher says cdc chose cover data linking a...
904disney measles outbreak could get worse, exper...disney measles outbreak worse experts warn url
.........
996free back-to-school immunizations <url> <url>free backtoschool immunizations url url
997lesson of the day: vaccinate your children or ...lesson day vaccinate children disney will kill...
998<user> i have read some articles about the vac...user read articles vaccine say high incident p...
999<user> #gop stop blaming #immigrants first #eb...user stop blaming first now diseasesbrought an...
1000mmr shots hurt like a little bmmr shots hurt little b
\n", "

101 rows × 2 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweets \\\n", "900 i wanna dip myself in a vat of purell after th... \n", "901 division of public health launches community i... \n", "902 she's a puss. haha scared of her vaccines. poo... \n", "903 researcher says cdc 'chose to cover up' data l... \n", "904 disney measles outbreak could get worse, exper... \n", "... ... \n", "996 free back-to-school immunizations \n", "997 lesson of the day: vaccinate your children or ... \n", "998 i have read some articles about the vac... \n", "999 #gop stop blaming #immigrants first #eb... \n", "1000 mmr shots hurt like a little b \n", "\n", " clean_tweet \n", "900 wanna dip vat purell doctors officethey safe r... \n", "901 division public health launches community immu... \n", "902 shes puss haha scared vaccines poor baby spruc... \n", "903 researcher says cdc chose cover data linking a... \n", "904 disney measles outbreak worse experts warn url \n", "... ... \n", "996 free backtoschool immunizations url url \n", "997 lesson day vaccinate children disney will kill... \n", "998 user read articles vaccine say high incident p... \n", "999 user stop blaming first now diseasesbrought an... 
\n", "1000 mmr shots hurt little b \n", "\n", "[101 rows x 2 columns]" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.loc[900:1000]" ] }, { "cell_type": "markdown", "metadata": { "id": "CaJn6Fmt46j1" }, "source": [ "### iv. Removing Unnecessary Words and Cleaning" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Qr7HKxnsG2oU" }, "outputs": [], "source": [ "clean_tweet= \" \".join(df_train[\"clean_tweet\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "uHDdmIC8_9qM" }, "outputs": [], "source": [ "freq= FreqDist([word for word in clean_tweet.split()])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TYHXIk2oAwZW", "outputId": "a9004056-15cf-44ac-ebf9-3b4f571aff9b" }, "outputs": [ { "data": { "text/plain": [ "[('user', 5499),\n", " ('url', 4630),\n", " ('measles', 3176),\n", " ('vaccine', 1469),\n", " ('kids', 1258),\n", " ('vaccines', 1189),\n", " ('health', 1066),\n", " ('vaccinate', 905),\n", " ('children', 831),\n", " ('people', 702)]" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "freq.most_common(10)" ] }, { "cell_type": "markdown", "metadata": { "id": "-CAR7zlahPmN" }, "source": [ "I will get rid of the words:\n", "\n", "- user\n", "- url\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PKUBSifpbV7m", "outputId": "4b5ffa59-7f57-494f-81f7-2ad3193b2b7c" }, "outputs": [ { "data": { "text/plain": [ "[('amp', 'big'),\n", " ('big', 'homie'),\n", " ('homie', 'meanboy3000'),\n", " ('meanboy3000', 'stegman'),\n", " ('stegman', 'st'),\n", " ('st', 'url'),\n", " ('url', 'im'),\n", " ('im', '100'),\n", " ('100', 'thinking'),\n", " ('thinking', 'devoting'),\n", " ('devoting', 'career'),\n", " ('career', 'proving'),\n", " ('proving', 'autism'),\n", " ('autism', 
'isnt'),\n", " ('isnt', 'caused'),\n", " ('caused', 'vaccines'),\n", " ('vaccines', 'due'),\n", " ('due', 'idiotic'),\n", " ('idiotic', 'posts'),\n", " ('posts', 'ive'),\n", " ('ive', 'seen'),\n", " ('seen', 'world'),\n", " ('world', 'autism'),\n", " ('autism', 'day'),\n", " ('day', 'vaccines'),\n", " ('vaccines', 'vaccinate'),\n", " ('vaccinate', 'child'),\n", " ('child', 'mean'),\n", " ('mean', 'immunize'),\n", " ('immunize', 'kid'),\n", " ('kid', 'something'),\n", " ('something', 'wont'),\n", " ('wont', 'secretly'),\n", " ('secretly', 'kill'),\n", " ('kill', 'years'),\n", " ('years', 'line'),\n", " ('line', 'im'),\n", " ('im', 'dont'),\n", " ('dont', 'trust'),\n", " ('trust', 'thanks'),\n", " ('thanks', 'user'),\n", " ('user', 'catch'),\n", " ('catch', 'performing'),\n", " ('performing', 'la'),\n", " ('la', 'nuit'),\n", " ('nuit', 'nyc'),\n", " ('nyc', '1134'),\n", " ('1134', '1st'),\n", " ('1st', 'ave'),\n", " ('ave', 'show'),\n", " ('show', 'starts'),\n", " ('starts', '6'),\n", " ('6', 'url'),\n", " ('url', 'user'),\n", " ('user', 'nearly'),\n", " ('nearly', '67'),\n", " ('67', 'year'),\n", " ('year', 'old'),\n", " ('old', 'study'),\n", " ('study', 'mental'),\n", " ('mental', 'health'),\n", " ('health', 'studies'),\n", " ('studies', 'vaccines'),\n", " ('vaccines', 'relatively'),\n", " ('relatively', 'infancies'),\n", " ('infancies', 'refuted'),\n", " ('refuted', 'study'),\n", " ('study', '95000'),\n", " ('95000', 'kids'),\n", " ('kids', 'finds'),\n", " ('finds', 'link'),\n", " ('link', 'mmr'),\n", " ('mmr', 'vaccine'),\n", " ('vaccine', 'autism'),\n", " ('autism', 'url'),\n", " ('url', 'psa'),\n", " ('psa', 'vaccinate'),\n", " ('vaccinate', 'fucking'),\n", " ('fucking', 'kids'),\n", " ('kids', 'coughing'),\n", " ('coughing', 'extra'),\n", " ('extra', 'shuttle'),\n", " ('shuttle', 'everyone'),\n", " ('everyone', 'thinks'),\n", " ('thinks', 'measles'),\n", " ('measles', 'aids'),\n", " ('aids', 'vaccine'),\n", " ('vaccine', 'created'),\n", " ('created', 
'oregon'),\n", " ('oregon', 'health'),\n", " ('health', 'amp'),\n", " ('amp', 'science'),\n", " ('science', 'university'),\n", " ('university', 'may'),\n", " ('may', 'clear'),\n", " ('clear', 'virus'),\n", " ('virus', 'body'),\n", " ('body', 'url'),\n", " ('url', 'url'),\n", " ('url', 'user'),\n", " ('user', 'point'),\n", " ('point', '2'),\n", " ('2', 'text'),\n", " ('text', 'butwbon'),\n", " ('butwbon', 'jovi'),\n", " ('jovi', 'cover'),\n", " ('cover', 'playin'),\n", " ('playin', 'alibis'),\n", " ('alibis', 'hope'),\n", " ('hope', 'u'),\n", " ('u', 'come'),\n", " ('come', '2'),\n", " ('2', 'mmr'),\n", " ('mmr', 'bbquser'),\n", " ('bbquser', 'will'),\n", " ('will', 'b'),\n", " ('b', 'prediction'),\n", " ('prediction', 'vaccine'),\n", " ('vaccine', 'exemption'),\n", " ('exemption', 'arizona'),\n", " ('arizona', 'will'),\n", " ('will', 'end'),\n", " ('end', 'soon'),\n", " ('soon', 'much'),\n", " ('much', 'money'),\n", " ('money', 'lost'),\n", " ('lost', 'big'),\n", " ('big', 'pharma'),\n", " ('pharma', 'getting'),\n", " ('getting', 'vaccines'),\n", " ('vaccines', 'cheryl'),\n", " ('cheryl', 'southern'),\n", " ('southern', 'nevada'),\n", " ('nevada', 'health'),\n", " ('health', 'district'),\n", " ('district', 'url'),\n", " ('url', '1mug'),\n", " ('1mug', 'noche'),\n", " ('noche', 'user'),\n", " ('user', 'mad'),\n", " ('mad', 'hatter'),\n", " ('hatter', 'url'),\n", " ('url', 'got'),\n", " ('got', 'influenza'),\n", " ('influenza', 'vaccine'),\n", " ('vaccine', 'purdue'),\n", " ('purdue', 'university'),\n", " ('university', 'student'),\n", " ('student', 'health'),\n", " ('health', 'push'),\n", " ('push', 'user'),\n", " ('user', 'url'),\n", " ('url', 'sb121'),\n", " ('sb121', 'enroll'),\n", " ('enroll', 'meningococcal'),\n", " ('meningococcal', 'diseasepupils'),\n", " ('diseasepupils', 'immunized'),\n", " ('immunized', 'recommended'),\n", " ('recommended', 'age'),\n", " ('age', 'url'),\n", " ('url', 'increasing'),\n", " ('increasing', 'number'),\n", " ('number', 
'parents'),\n", " ('parents', 'skip'),\n", " ('skip', 'vaccinations'),\n", " ('vaccinations', 'childrencolumbia'),\n", " ('childrencolumbia', 'missourian'),\n", " ('missourian', 'url'),\n", " ('url', 'user'),\n", " ('user', 'thank'),\n", " ('thank', 'standing'),\n", " ('standing', 'ca'),\n", " ('ca', 'parents'),\n", " ('parents', 'children'),\n", " ('children', 'support'),\n", " ('support', 'user'),\n", " ('user', 'dude'),\n", " ('dude', 'idc'),\n", " ('idc', 'disney'),\n", " ('disney', 'land'),\n", " ('land', 'measles'),\n", " ('measles', 'means'),\n", " ('means', 'shorter'),\n", " ('shorter', 'lines'),\n", " ('lines', 'rides'),\n", " ('rides', 'im'),\n", " ('im', 'driving'),\n", " ('driving', 'rn'),\n", " ('rn', 'beeftalk'),\n", " ('beeftalk', 'start'),\n", " ('start', 'calf'),\n", " ('calf', 'vaccinations'),\n", " ('vaccinations', 'now'),\n", " ('now', 'url'),\n", " ('url', 'via'),\n", " ('via', 'user'),\n", " ('user', 'good'),\n", " ('good', 'article'),\n", " ('article', 'herd'),\n", " ('herd', 'health'),\n", " ('health', 'dont'),\n", " ('dont', 'care'),\n", " ('care', 'user'),\n", " ('user', 'says'),\n", " ('says', 'probably'),\n", " ('probably', 'kids'),\n", " ('kids', 'vaccinated'),\n", " ('vaccinated', 'small'),\n", " ('small', 'study'),\n", " ('study', 'shows'),\n", " ('shows', '10'),\n", " ('10', 'pts'),\n", " ('pts', 'lupus'),\n", " ('lupus', 'shingles'),\n", " ('shingles', 'vaccine'),\n", " ('vaccine', 'safe'),\n", " ('safe', 'needs'),\n", " ('needs', 'confirmed'),\n", " ('confirmed', 'larger'),\n", " ('larger', 'study'),\n", " ('study', 'url'),\n", " ('url', 'cdc'),\n", " ('cdc', 'measles'),\n", " ('measles', 'epidemic'),\n", " ('epidemic', 'poses'),\n", " ('poses', 'travel'),\n", " ('travel', 'risks'),\n", " ('risks', 'usatlyixdtv1'),\n", " ('usatlyixdtv1', 'yes'),\n", " ('yes', 'true'),\n", " ('true', 'zombie'),\n", " ('zombie', 'epidemic'),\n", " ('epidemic', 'closing'),\n", " ('closing', 'us'),\n", " ('us', 'every'),\n", " ('every', 'time'),\n", " 
('time', 'see'),\n", " ('see', 'vaccinate'),\n", " ('vaccinate', 'debate'),\n", " ('debate', 'wonder'),\n", " ('wonder', 'one'),\n", " ('one', 'side'),\n", " ('side', 'argument'),\n", " ('argument', 'even'),\n", " ('even', 'children'),\n", " ('children', 'user'),\n", " ('user', 'lied'),\n", " ('lied', 'hid'),\n", " ('hid', 'data'),\n", " ('data', 'black'),\n", " ('black', 'boys'),\n", " ('boys', '340'),\n", " ('340', 'uncreased'),\n", " ('uncreased', 'risk'),\n", " ('risk', 'developing'),\n", " ('developing', 'autism'),\n", " ('autism', 'mmr'),\n", " ('mmr', 'help'),\n", " ('help', 'user'),\n", " ('user', 'vaccines'),\n", " ('vaccines', 'causing'),\n", " ('causing', 'autism'),\n", " ('autism', 'user'),\n", " ('user', 'rarely'),\n", " ('rarely', 'see'),\n", " ('see', 'arguments'),\n", " ('arguments', 'vaccination'),\n", " ('vaccination', 'actually'),\n", " ('actually', 'big'),\n", " ('big', 'deal'),\n", " ('deal', 'im'),\n", " ('im', 'obsessed'),\n", " ('obsessed', 'w'),\n", " ('w', 'ebola'),\n", " ('ebola', 'following'),\n", " ('following', 'outbreak'),\n", " ('outbreak', 'worlds'),\n", " ('worlds', 'deadliest'),\n", " ('deadliest', 'disease'),\n", " ('disease', 'vaccine'),\n", " ('vaccine', 'cure'),\n", " ('cure', 'amp'),\n", " ('amp', '90'),\n", " ('90', 'death'),\n", " ('death', 'rate'),\n", " ('rate', 'user'),\n", " ('user', 'joshthenewt'),\n", " ('joshthenewt', 'suck'),\n", " ('suck', 'game'),\n", " ('game', 'haha'),\n", " ('haha', 'well'),\n", " ('well', 'people'),\n", " ('people', 'say'),\n", " ('say', 'dont'),\n", " ('dont', 'mmr'),\n", " ('mmr', 'eh'),\n", " ('eh', 'dont'),\n", " ('dont', 'shake'),\n", " ('shake', 'hand'),\n", " ('hand', 'pocahontas'),\n", " ('pocahontas', 'thats'),\n", " ('thats', 'probably'),\n", " ('probably', 'measles'),\n", " ('measles', 'now'),\n", " ('now', 'user'),\n", " ('user', 'yes'),\n", " ('yes', 'im'),\n", " ('im', 'part'),\n", " ('part', 'public'),\n", " ('public', 'health'),\n", " ('health', 'im'),\n", " ('im', 'part'),\n", 
" ('part', 'nature'),\n", " ('nature', 'feed'),\n", " ('feed', 'abandoned'),\n", " ('abandoned', 'baby'),\n", " ('baby', 'bird'),\n", " ('bird', 'vaccinate'),\n", " ('vaccinate', 'privileged'),\n", " ('privileged', 'kids'),\n", " ('kids', 'new'),\n", " ('new', 'studies'),\n", " ('studies', 'show'),\n", " ('show', 'vaccines'),\n", " ('vaccines', 'associated'),\n", " ('associated', 'autism'),\n", " ('autism', 'news'),\n", " ('news', 'sky'),\n", " ('sky', 'blue'),\n", " ('blue', 'repetitive'),\n", " ('repetitive', 'less'),\n", " ('less', 'shocked'),\n", " ('shocked', 'increase'),\n", " ('increase', 'asd'),\n", " ('asd', 'even'),\n", " ('even', 'genetically'),\n", " ('genetically', 'high'),\n", " ('high', 'risk'),\n", " ('risk', 'url'),\n", " ('url', 'glad'),\n", " ('glad', 'got'),\n", " ('got', 'vaccinated'),\n", " ('vaccinated', 'user'),\n", " ('user', 'health'),\n", " ('health', 'alert'),\n", " ('alert', 'case'),\n", " ('case', 'meningitis'),\n", " ('meningitis', 'reported'),\n", " ('reported', 'monmouth'),\n", " ('monmouth', 'university'),\n", " ('university', 'url'),\n", " ('url', 'look'),\n", " ('look', 'got'),\n", " ('got', 'measles'),\n", " ('measles', 'user'),\n", " ('user', 'user'),\n", " ('user', 'defense'),\n", " ('defense', 'vaccines'),\n", " ('vaccines', 'harmful'),\n", " ('harmful', 'american'),\n", " ('american', 'life'),\n", " ('life', 'unhealthy'),\n", " ('unhealthy', 'hello'),\n", " ('hello', 'ranked'),\n", " ('ranked', 'reset'),\n", " ('reset', 'probably'),\n", " ('probably', 'bad'),\n", " ('bad', 'placements'),\n", " ('placements', 'mmr'),\n", " ('mmr', 'went'),\n", " ('went', 'went'),\n", " ('went', 'lol'),\n", " ('lol', 'amid'),\n", " ('amid', 'measles'),\n", " ('measles', 'outbreak'),\n", " ('outbreak', 'vaccines'),\n", " ('vaccines', 'teachers'),\n", " ('teachers', 'arent'),\n", " ('arent', 'required'),\n", " ('required', 'much'),\n", " ('much', 'attention'),\n", " ('attention', 'ongoing'),\n", " ('ongoing', 'measles'),\n", " ('measles', 
'url'),\n", " ('url', 'user'),\n", " ('user', 'user'),\n", " ('user', 'user'),\n", " ('user', 'user'),\n", " ('user', 'user'),\n", " ('user', 'url'),\n", " ('url', 'measles'),\n", " ('measles', 'threat'),\n", " ('threat', 'improve'),\n", " ('improve', 'mood'),\n", " ('mood', 'energy'),\n", " ('energy', 'immunity'),\n", " ('immunity', 'cardio'),\n", " ('cardio', 'health'),\n", " ('health', '15'),\n", " ('15', 'kit'),\n", " ('kit', 'profit'),\n", " ('profit', 'user'),\n", " ('user', 'thru'),\n", " ('thru', '1111'),\n", " ('1111', 'url'),\n", " ('url', 'mt'),\n", " ('mt', 'user'),\n", " ('user', 'new'),\n", " ('new', 'bill'),\n", " ('bill', 'end'),\n", " ('end', 'exemptions'),\n", " ('exemptions', '4personal'),\n", " ('4personal', 'religious'),\n", " ('religious', 'url'),\n", " ('url', 'via'),\n", " ('via', 'user'),\n", " ('user', 'url'),\n", " ('url', 'autism'),\n", " ('autism', 'immunizations'),\n", " ('immunizations', 'vaccinate'),\n", " ('vaccinate', 'url'),\n", " ('url', 'bart'),\n", " ('bart', 'riders'),\n", " ('riders', 'warned'),\n", " ('warned', 'measles'),\n", " ('measles', 'infection'),\n", " ('infection', 'contagious'),\n", " ('contagious', 'passenger'),\n", " ('passenger', 'oakland'),\n", " ('oakland', 'cbs'),\n", " ('cbs', 'sf'),\n", " ('sf', 'bay'),\n", " ('bay', 'area'),\n", " ('area', 'rapid'),\n", " ('rapid', 'transit'),\n", " ('transit', 'url'),\n", " ('url', 'user'),\n", " ('user', 'us'),\n", " ('us', 'cases'),\n", " ('cases', 'hit'),\n", " ('hit', '15year'),\n", " ('15year', 'high'),\n", " ('high', 'far'),\n", " ('far', 'year'),\n", " ('year', '118'),\n", " ('118', 'cases'),\n", " ('cases', 'measles'),\n", " ('measles', 'reported'),\n", " ('reported', 'unit'),\n", " ('unit', 'url'),\n", " ('url', 'pull'),\n", " ('pull', 'myxx'),\n", " ('myxx', 'nightlife'),\n", " ('nightlife', 'stint'),\n", " ('stint', 't'),\n", " ('t', 'performing'),\n", " ('performing', 'live'),\n", " ('live', 'hosted'),\n", " ('hosted', 'user'),\n", " ('user', 'url'),\n", " 
('url', 'cdc'),\n", " ('cdc', 'eyeing'),\n", " ('eyeing', 'bird'),\n", " ('bird', 'flu'),\n", " ('flu', 'vaccine'),\n", " ('vaccine', 'humans'),\n", " ('humans', 'though'),\n", " ('though', 'risk'),\n", " ('risk', 'low'),\n", " ('low', 'federal'),\n", " ('federal', 'officials'),\n", " ('officials', 'said'),\n", " ('said', 'wednesday'),\n", " ('wednesday', 'theyre'),\n", " ('theyre', 'taking'),\n", " ('taking', 'steps'),\n", " ('steps', 'url'),\n", " ('url', 'involved'),\n", " ('involved', 'fight'),\n", " ('fight', 'preventable'),\n", " ('preventable', 'diseases'),\n", " ('diseases', 'tenure'),\n", " ('tenure', 'track'),\n", " ('track', 'opening'),\n", " ('opening', 'user'),\n", " ('user', 'url'),\n", " ('url', 'user'),\n", " ('user', 'im'),\n", " ('im', 'vaccinating'),\n", " ('vaccinating', 'kids'),\n", " ('kids', 'lol'),\n", " ('lol', 'alleged'),\n", " ('alleged', 'victim'),\n", " ('victim', 'reviewing'),\n", " ('reviewing', 'immunity'),\n", " ('immunity', 'paperwork'),\n", " ('paperwork', 'resume'),\n", " ('resume', 'testimony'),\n", " ('testimony', 'wo'),\n", " ('wo', 'fear'),\n", " ('fear', 'blackmail'),\n", " ('blackmail', 'prosecution'),\n", " ('prosecution', 'thanksuser'),\n", " ('thanksuser', 'crystal'),\n", " ('crystal', 'clear'),\n", " ('clear', 'lack'),\n", " ('lack', 'evidence'),\n", " ('evidence', 'linking'),\n", " ('linking', 'mmr'),\n", " ('mmr', 'autism'),\n", " ('autism', 'noticed'),\n", " ('noticed', 'user'),\n", " ('user', 'giving'),\n", " ('giving', 'dvds'),\n", " ('dvds', 'vaccinating'),\n", " ('vaccinating', 'children'),\n", " ('children', 'bad'),\n", " ('bad', 'hey'),\n", " ('hey', 'nowuser'),\n", " ('nowuser', 'mixmasterrods'),\n", " ('mixmasterrods', 'upstairs'),\n", " ('upstairs', 'lounge'),\n", " ('lounge', 'url'),\n", " ('url', 'seriously'),\n", " ('seriously', 'dont'),\n", " ('dont', 'want'),\n", " ('want', 'child'),\n", " ('child', 'antivaccination'),\n", " ('antivaccination', 'movement'),\n", " ('movement', 'ended'),\n", " ('ended', 
'terrifying'),\n", " ('terrifying', 'user'),\n", " ('user', 'user'),\n", " ('user', 'user'),\n", " ('user', 'epidemic'),\n", " ('epidemic', 'enterovirus'),\n", " ('enterovirus', 'mumps'),\n", " ('mumps', 'now'),\n", " ('now', 'measles'),\n", " ('measles', 'drug'),\n", " ('drug', 'resistant'),\n", " ('resistant', 'tb'),\n", " ('tb', 'next'),\n", " ('next', 'said'),\n", " ('said', 'wasnt'),\n", " ('wasnt', 'gone'),\n", " ('gone', 'shit'),\n", " ('shit', 'lol'),\n", " ('lol', 'looked'),\n", " ('looked', 'proved'),\n", " ('proved', 'wrong'),\n", " ('wrong', 'baby'),\n", " ('baby', 'url'),\n", " ('url', 'flu'),\n", " ('flu', 'shots'),\n", " ('shots', 'school'),\n", " ('school', 'boost'),\n", " ('boost', 'vaccination'),\n", " ('vaccination', 'rates'),\n", " ('rates', 'offering'),\n", " ('offering', 'flu'),\n", " ('flu', 'shots'),\n", " ('shots', 'elementary'),\n", " ('elementary', 'schools'),\n", " ('schools', 'reduce'),\n", " ('reduce', 'number'),\n", " ('number', 'url'),\n", " ('url', 'user'),\n", " ('user', '1'),\n", " ('1', 'marin'),\n", " ('marin', 'county'),\n", " ('county', 'school'),\n", " ('school', 'board'),\n", " ('board', 'sides'),\n", " ('sides', 'young'),\n", " ('young', 'leukemia'),\n", " ('leukemia', 'patient'),\n", " ('patient', 'vaccinations'),\n", " ('vaccinations', 'url'),\n", " ('url', 'still'),\n", " ('still', 'running'),\n", " ('running', 'niggas'),\n", " ('niggas', 'til'),\n", " ('til', 'death'),\n", " ('death', 'dibiasimb'),\n", " ('dibiasimb', 'user'),\n", " ('user', 'url'),\n", " ('url', 'user'),\n", " ('user', 'yeah'),\n", " ('yeah', 'ill'),\n", " ('ill', 'stick'),\n", " ('stick', 'regular'),\n", " ('regular', 'vaccines'),\n", " ('vaccines', 'oh'),\n", " ('oh', 'wait'),\n", " ('wait', 'gives'),\n", " ('gives', 'autism'),\n", " ('autism', 'user'),\n", " ('user', 'antivaccine'),\n", " ('antivaccine', 'people'),\n", " ('people', 'want'),\n", " ('want', 'control'),\n", " ('control', 'people'),\n", " ('people', 'fear'),\n", " ('fear', 
'weaken'),\n", " ('weaken', 'herd'),\n", " ('herd', 'immunityso'),\n", " ('immunityso', 'anyone'),\n", " ('anyone', 'knows'),\n", " ('knows', 'mind'),\n", " ('mind', 'control'),\n", " ('control', 'important'),\n", " ('important', 'user'),\n", " ('user', 'measles'),\n", " ('measles', 'update'),\n", " ('update', 'user'),\n", " ('user', 'says'),\n", " ('says', 'working'),\n", " ('working', 'w'),\n", " ('w', 'user'),\n", " ('user', 'url'),\n", " ('url', 'dont'),\n", " ('dont', 'think'),\n", " ('think', 'will'),\n", " ('will', 'understand'),\n", " ('understand', 'peoples'),\n", " ('peoples', 'reasons'),\n", " ('reasons', 'getting'),\n", " ('getting', 'vaccinated'),\n", " ('vaccinated', 'vaccinating'),\n", " ('vaccinating', 'kids'),\n", " ('kids', 'explain'),\n", " ('explain', 'cia'),\n", " ('cia', 'vaccination'),\n", " ('vaccination', 'campaigns'),\n", " ('campaigns', 'spy'),\n", " ('spy', 'ops'),\n", " ('ops', 'url'),\n", " ('url', 'via'),\n", " ('via', 'user'),\n", " ('user', 'harm'),\n", " ('harm', 'protected'),\n", " ('protected', 'medic'),\n", " ('medic', 'status'),\n", " ('status', 'ruse'),\n", " ('ruse', 'will'),\n", " ('will', 'linger'),\n", " ('linger', 'measles'),\n", " ('measles', 'outbreak'),\n", " ('outbreak', 'prompts'),\n", " ('prompts', 'vaccination'),\n", " ('vaccination', 'debate'),\n", " ('debate', 'political'),\n", " ('political', 'debate'),\n", " ('debate', 'rages'),\n", " ('rages', 'measles'),\n", " ('measles', 'outbreak'),\n", " ('outbreak', 'spreads'),\n", " ('spreads', 'url'),\n", " ('url', 'people'),\n", " ('people', 'need'),\n", " ('need', 'children'),\n", " ('children', 'vaccinated'),\n", " ('vaccinated', 'bad'),\n", " ('bad', 'diseases'),\n", " ('diseases', 'coming'),\n", " ('coming', 'back'),\n", " ('back', 'vaccine'),\n", " ('vaccine', 'brain'),\n", " ('brain', 'damage'),\n", " ('damage', 'cover'),\n", " ('cover', 'implodes'),\n", " ('implodes', 'url'),\n", " ('url', 'user'),\n", " ('user', 'user'),\n", " ('user', 'discussing'),\n", " 
('discussing', 'user'),\n", " ('user', 'vaccines'),\n", " ('vaccines', 'childrens'),\n", " ('childrens', 'clinic'),\n", " ('clinic', 'url'),\n", " ('url', 'know'),\n", " ('know', 'infected'),\n", " ('infected', 'measles'),\n", " ('measles', 'airborne'),\n", " ('airborne', 'transmission'),\n", " ('transmission', 'wtf'),\n", " ('wtf', 'access'),\n", " ('access', 'fitchburg'),\n", " ('fitchburg', 'school'),\n", " ('school', 'limited'),\n", " ('limited', 'amid'),\n", " ('amid', 'measles'),\n", " ('measles', 'scare'),\n", " ('scare', 'url'),\n", " ('url', 'read'),\n", " ('read', 'neighbors'),\n", " ('neighbors', 'user'),\n", " ('user', 'user'),\n", " ('user', 'deadly'),\n", " ('deadly', 'kids'),\n", " ('kids', 'allergies'),\n", " ('allergies', 'joke'),\n", " ('joke', 'deadly'),\n", " ('deadly', 'child'),\n", " ('child', 'measles'),\n", " ('measles', 'vaccinate'),\n", " ('vaccinate', 'every1'),\n", " ('every1', 'measles'),\n", " ('measles', '717'),\n", " ('717', 'user'),\n", " ('user', 'topic'),\n", " ('topic', 'makes'),\n", " ('makes', 'scared'),\n", " ('scared', 'bring'),\n", " ('bring', 'kids'),\n", " ('kids', 'modern'),\n", " ('modern', 'society'),\n", " ('society', 'time'),\n", " ('time', 'start'),\n", " ('start', 'looking'),\n", " ('looking', 'private'),\n", " ('private', 'schools'),\n", " ('schools', 'mandate'),\n", " ('mandate', 'vaccines'),\n", " ('vaccines', 'user'),\n", " ('user', 'stop'),\n", " ('stop', 'blaming'),\n", " ('blaming', 'first'),\n", " ('first', 'now'),\n", " ('now', 'diseasesbrought'),\n", " ('diseasesbrought', 'ancestors'),\n", " ('ancestors', 'url'),\n", " ('url', 'already'),\n", " ('already', 'cure'),\n", " ('cure', 'measles'),\n", " ('measles', 'user'),\n", " ('user', 'user'),\n", " ('user', 'cant'),\n", " ('cant', 'attend'),\n", " ('attend', 'school'),\n", " ('school', 'unless'),\n", " ('unless', 'vaccinated'),\n", " ('vaccinated', 'dont'),\n", " ('dont', 'vaccinate'),\n", " ('vaccinate', 'children'),\n", " ('children', 'shitty'),\n", " 
('shitty', 'parent'),\n", " ('parent', 'end'),\n", " ('end', 'story'),\n", " ('story', 'citizen'),\n", " ('citizen', 'kingdom'),\n", " ('kingdom', 'god'),\n", " ('god', 'assignment'),\n", " ('assignment', 'diplomatic'),\n", " ('diplomatic', 'immunity'),\n", " ('immunity', 'fear'),\n", " ('fear', 'man'),\n", " ('man', 'top'),\n", " ('top', 'colorado'),\n", " ('colorado', 'doctor'),\n", " ('doctor', 'says'),\n", " ('says', 'adults'),\n", " ('adults', 'worried'),\n", " ('worried', 'measles'),\n", " ('measles', 'vaccination'),\n", " ('vaccination', 'measles'),\n", " ('measles', 'outbreak'),\n", " ('outbreak', 'url'),\n", " ('url', 'user'),\n", " ('user', 'nah'),\n", " ('nah', 'idiot'),\n", " ('idiot', 'someone'),\n", " ('someone', 'doesnt'),\n", " ('doesnt', 'allow'),\n", " ('allow', 'kids'),\n", " ('kids', 'vaccinated'),\n", " ('vaccinated', 'kids'),\n", " ('kids', 'need'),\n", " ('need', 'vaccinations'),\n", " ('vaccinations', 'late'),\n", " ('late', '19thcentury'),\n", " ('19thcentury', 'maps'),\n", " ('maps', 'show'),\n", " ('show', 'measles'),\n", " ('measles', 'mortality'),\n", " ('mortality', 'vaccines'),\n", " ('vaccines', 'url'),\n", " ('url', 'url'),\n", " ('url', 'measles'),\n", " ('measles', 'fears'),\n", " ('fears', 'spread'),\n", " ('spread', 'nj'),\n", " ('nj', 'amid'),\n", " ('amid', 'suspected'),\n", " ('suspected', 'case'),\n", " ('case', 'url'),\n", " ('url', 'baby'),\n", " ('baby', 'vaccination'),\n", " ('vaccination', 'day'),\n", " ('day', 'shes'),\n", " ('shes', 'getting'),\n", " ('getting', 'measles'),\n", " ('measles', 'one'),\n", " ('one', 'early'),\n", " ('early', 'fuckwits'),\n", " ('fuckwits', 'chicago'),\n", " ('chicago', 'dont'),\n", " ('dont', 'vaccinate'),\n", " ('vaccinate', 'ensuring'),\n", " ('ensuring', 'problem'),\n", " ('problem', 'loving'),\n", " ('loving', 'measles'),\n", " ('measles', 'poster'),\n", " ('poster', 'making'),\n", " ('making', 'spirit'),\n", " ('spirit', 'today'),\n", " ('today', 'url'),\n", " ('url', 
'baseball'),\n", " ('baseball', 'coach'),\n", " ('coach', 'santa'),\n", " ('santa', 'monica'),\n", " ('monica', 'high'),\n", " ('high', 'contracts'),\n", " ('contracts', 'measles'),\n", " ('measles', 'url'),\n", " ('url', 'im'),\n", " ('im', 'sick'),\n", " ('sick', 'hearing'),\n", " ('hearing', 'people'),\n", " ('people', 'vaccinating'),\n", " ('vaccinating', 'children'),\n", " ('children', 'sick'),\n", " ('sick', 'hearing'),\n", " ('hearing', 'peoples'),\n", " ('peoples', 'freedom'),\n", " ('freedom', 'insane'),\n", " ('insane', 'dangerous'),\n", " ('dangerous', 'things'),\n", " ('things', 'one'),\n", " ('one', 'risky'),\n", " ('risky', 'childrena'),\n", " ('childrena', 'vaccination'),\n", " ('vaccination', 'going'),\n", " ('going', 'school'),\n", " ('school', 'car'),\n", " ('car', 'odds'),\n", " ('odds', 'kid'),\n", " ('kid', 'measles'),\n", " ('measles', 'uses'),\n", " ('uses', 'lab'),\n", " ('lab', 'room'),\n", " ('room', 'user'),\n", " ('user', 'user'),\n", " ('user', 'parents'),\n", " ('parents', 'refuse'),\n", " ('refuse', 'kids'),\n", " ('kids', 'immunized'),\n", " ('immunized', 'charged'),\n", " ('charged', 'reckless'),\n", " ('reckless', 'endangerment'),\n", " ('endangerment', 'kid'),\n", " ('kid', 'gets'),\n", " ('gets', 'another'),\n", " ('another', 'sick'),\n", " ('sick', 'rt'),\n", " ('rt', 'user'),\n", " ('user', 'vaccinations'),\n", " ('vaccinations', 'measles'),\n", " ('measles', 'medicaid'),\n", " ('medicaid', 'expansion'),\n", " ('expansion', 'url'),\n", " ('url', 'user'),\n", " ('user', 'centers'),\n", " ('centers', 'disease'),\n", " ('disease', 'control'),\n", " ('control', 'years'),\n", " ('years', 'flu'),\n", " ('flu', 'vaccine'),\n", " ('vaccine', 'doesnt'),\n", " ('doesnt', 'work'),\n", " ('work', 'url'),\n", " ('url', 'user'),\n", " ('user', 'ty'),\n", " ('ty', 'fought'),\n", " ('fought', 'hard'),\n", " ('hard', '2'),\n", " ('2', 'vaccinate'),\n", " ('vaccinate', 'kids'),\n", " ('kids', 'yrs'),\n", " ('yrs', 'ppls'),\n", " ('ppls', 
'judgements'),\n", " ('judgements', 'dont'),\n", " ('dont', 'hurt'),\n", " ('hurt', 'smart'),\n", " ('smart', 'vaccinate'),\n", " ('vaccinate', 'people'),\n", " ('people', 'user'),\n", " ('user', '15'),\n", " ('15', 'tarrant'),\n", " ('tarrant', 'county'),\n", " ('county', 'cases'),\n", " ('cases', 'five'),\n", " ('five', 'denton'),\n", " ('denton', 'co'),\n", " ('co', 'traced'),\n", " ('traced', 'megachurch'),\n", " ('megachurch', 'url'),\n", " ('url', 'hb1251'),\n", " ('hb1251', 'new'),\n", " ('new', 'providing'),\n", " ('providing', 'health'),\n", " ('health', 'insurance'),\n", " ('insurance', 'coverage'),\n", " ('coverage', 'immunizations'),\n", " ('immunizations', 'administered'),\n", " ('administered', 'pharmacy'),\n", " ('pharmacy', 'pharmacist'),\n", " ('pharmacist', 'url'),\n", " ('url', 'a4077'),\n", " ('a4077', 'new'),\n", " ('new', 'revises'),\n", " ('revises', 'codifies'),\n", " ('codifies', 'schedule'),\n", " ('schedule', 'childhood'),\n", " ('childhood', 'lead'),\n", " ('lead', 'screening'),\n", " ('screening', 'along'),\n", " ('along', 'immunization'),\n", " ('immunization', 'wellness'),\n", " ('wellness', 'url'),\n", " ('url', 'fridays'),\n", " ('fridays', 'immunity'),\n", " ('immunity', 'challenge'),\n", " ('challenge', 'walking'),\n", " ('walking', 'backwards'),\n", " ('backwards', 'safe'),\n", " ('safe', 'time'),\n", " ('time', 'sure'),\n", " ('sure', 'watch'),\n", " ('watch', 'back'),\n", " ('back', '70s'),\n", " ('70s', 'healthnut'),\n", " ('healthnut', 'parents'),\n", " ('parents', 'didnt'),\n", " ('didnt', 'vaccinate'),\n", " ('vaccinate', 'childhood'),\n", " ('childhood', 'url'),\n", " ('url', 'user'),\n", " ('user', 'people'),\n", " ('people', 'complain'),\n", " ('complain', 'live'),\n", " ('live', 'longer'),\n", " ('longer', 'releasing'),\n", " ('releasing', 'tension'),\n", " ('tension', 'increases'),\n", " ('increases', 'immunity'),\n", " ('immunity', 'improves'),\n", " ('improves', 'overall'),\n", " ('overall', 'health'),\n", " 
('health', 'well'),\n", " ('well', 'good'),\n", " ('good', 'news'),\n", " ('news', 'austerity'),\n", " ('austerity', 'vaccine'),\n", " ('vaccine', 'crisis'),\n", " ('crisis', 'parasite'),\n", " ('parasite', 'pandemic'),\n", " ('pandemic', 'potential'),\n", " ('potential', 'drains'),\n", " ('drains', 'life'),\n", " ('life', 'killing'),\n", " ('killing', 'slowly'),\n", " ('slowly', 'user'),\n", " ('user', 'compensation'),\n", " ('compensation', 'autism'),\n", " ('autism', 'brain'),\n", " ('brain', 'damage'),\n", " ('damage', 'illnesses'),\n", " ('illnesses', 'vaccine'),\n", " ('vaccine', 'court'),\n", " ('court', 'via'),\n", " ('via', 'user'),\n", " ('user', 'url'),\n", " ('url', 'seek'),\n", " ('seek', 'bear'),\n", " ('bear', 'w'),\n", " ('w', 'see'),\n", " ('see', 'state'),\n", " ('state', 'ppls'),\n", " ('ppls', 'hearts'),\n", " ('hearts', 'url'),\n", " ('url', 'user'),\n", " ('user', 'conservative'),\n", " ('conservative', 'neurosurgeon'),\n", " ('neurosurgeon', 'ben'),\n", " ('ben', 'carson'),\n", " ('carson', 'says'),\n", " ('says', 'vaccines'),\n", " ('vaccines', 'public'),\n", " ...]" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Bigrams and trigrams give each word more context than single tokens do; this cell builds the bigrams.\n", "## NOTE(review): all tweets are joined into one string before tokenizing, so some pairs appear to span\n", "## two adjacent tweets -- confirm that is acceptable for the downstream analysis.\n", "words= \" \".join(df_train[\"clean_tweet\"])\n", "\n", "tokens= nltk.word_tokenize(words)\n", "\n", "bigrams= list(nltk.bigrams(tokens))\n", "\n", "## Show a short preview only: echoing the whole list writes every pair into the saved notebook output.\n", "bigrams[:20]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wxKxsv8hdnt1", "outputId": "b0a91c00-3d09-44f9-9881-4ae46c51bfde" }, "outputs": [ { "data": { "text/plain": [ "[('amp', 'big', 'homie'),\n", " ('big', 'homie', 'meanboy3000'),\n", " ('homie', 'meanboy3000', 'stegman'),\n", " ('meanboy3000', 'stegman', 'st'),\n", " ('stegman', 'st', 'url'),\n", " ('st', 'url', 'im'),\n", " ('url', 'im', '100'),\n", " ('im', '100', 'thinking'),\n", " ('100', 'thinking', 'devoting'),\n", " 
('thinking', 'devoting', 'career'),\n", " ('devoting', 'career', 'proving'),\n", " ('career', 'proving', 'autism'),\n", " ('proving', 'autism', 'isnt'),\n", " ('autism', 'isnt', 'caused'),\n", " ('isnt', 'caused', 'vaccines'),\n", " ('caused', 'vaccines', 'due'),\n", " ('vaccines', 'due', 'idiotic'),\n", " ('due', 'idiotic', 'posts'),\n", " ('idiotic', 'posts', 'ive'),\n", " ('posts', 'ive', 'seen'),\n", " ('ive', 'seen', 'world'),\n", " ('seen', 'world', 'autism'),\n", " ('world', 'autism', 'day'),\n", " ('autism', 'day', 'vaccines'),\n", " ('day', 'vaccines', 'vaccinate'),\n", " ('vaccines', 'vaccinate', 'child'),\n", " ('vaccinate', 'child', 'mean'),\n", " ('child', 'mean', 'immunize'),\n", " ('mean', 'immunize', 'kid'),\n", " ('immunize', 'kid', 'something'),\n", " ('kid', 'something', 'wont'),\n", " ('something', 'wont', 'secretly'),\n", " ('wont', 'secretly', 'kill'),\n", " ('secretly', 'kill', 'years'),\n", " ('kill', 'years', 'line'),\n", " ('years', 'line', 'im'),\n", " ('line', 'im', 'dont'),\n", " ('im', 'dont', 'trust'),\n", " ('dont', 'trust', 'thanks'),\n", " ('trust', 'thanks', 'user'),\n", " ('thanks', 'user', 'catch'),\n", " ('user', 'catch', 'performing'),\n", " ('catch', 'performing', 'la'),\n", " ('performing', 'la', 'nuit'),\n", " ('la', 'nuit', 'nyc'),\n", " ('nuit', 'nyc', '1134'),\n", " ('nyc', '1134', '1st'),\n", " ('1134', '1st', 'ave'),\n", " ('1st', 'ave', 'show'),\n", " ('ave', 'show', 'starts'),\n", " ('show', 'starts', '6'),\n", " ('starts', '6', 'url'),\n", " ('6', 'url', 'user'),\n", " ('url', 'user', 'nearly'),\n", " ('user', 'nearly', '67'),\n", " ('nearly', '67', 'year'),\n", " ('67', 'year', 'old'),\n", " ('year', 'old', 'study'),\n", " ('old', 'study', 'mental'),\n", " ('study', 'mental', 'health'),\n", " ('mental', 'health', 'studies'),\n", " ('health', 'studies', 'vaccines'),\n", " ('studies', 'vaccines', 'relatively'),\n", " ('vaccines', 'relatively', 'infancies'),\n", " ('relatively', 'infancies', 'refuted'),\n", " 
('infancies', 'refuted', 'study'),\n", " ('refuted', 'study', '95000'),\n", " ('study', '95000', 'kids'),\n", " ('95000', 'kids', 'finds'),\n", " ('kids', 'finds', 'link'),\n", " ('finds', 'link', 'mmr'),\n", " ('link', 'mmr', 'vaccine'),\n", " ('mmr', 'vaccine', 'autism'),\n", " ('vaccine', 'autism', 'url'),\n", " ('autism', 'url', 'psa'),\n", " ('url', 'psa', 'vaccinate'),\n", " ('psa', 'vaccinate', 'fucking'),\n", " ('vaccinate', 'fucking', 'kids'),\n", " ('fucking', 'kids', 'coughing'),\n", " ('kids', 'coughing', 'extra'),\n", " ('coughing', 'extra', 'shuttle'),\n", " ('extra', 'shuttle', 'everyone'),\n", " ('shuttle', 'everyone', 'thinks'),\n", " ('everyone', 'thinks', 'measles'),\n", " ('thinks', 'measles', 'aids'),\n", " ('measles', 'aids', 'vaccine'),\n", " ('aids', 'vaccine', 'created'),\n", " ('vaccine', 'created', 'oregon'),\n", " ('created', 'oregon', 'health'),\n", " ('oregon', 'health', 'amp'),\n", " ('health', 'amp', 'science'),\n", " ('amp', 'science', 'university'),\n", " ('science', 'university', 'may'),\n", " ('university', 'may', 'clear'),\n", " ('may', 'clear', 'virus'),\n", " ('clear', 'virus', 'body'),\n", " ('virus', 'body', 'url'),\n", " ('body', 'url', 'url'),\n", " ('url', 'url', 'user'),\n", " ('url', 'user', 'point'),\n", " ('user', 'point', '2'),\n", " ('point', '2', 'text'),\n", " ('2', 'text', 'butwbon'),\n", " ('text', 'butwbon', 'jovi'),\n", " ('butwbon', 'jovi', 'cover'),\n", " ('jovi', 'cover', 'playin'),\n", " ('cover', 'playin', 'alibis'),\n", " ('playin', 'alibis', 'hope'),\n", " ('alibis', 'hope', 'u'),\n", " ('hope', 'u', 'come'),\n", " ('u', 'come', '2'),\n", " ('come', '2', 'mmr'),\n", " ('2', 'mmr', 'bbquser'),\n", " ('mmr', 'bbquser', 'will'),\n", " ('bbquser', 'will', 'b'),\n", " ('will', 'b', 'prediction'),\n", " ('b', 'prediction', 'vaccine'),\n", " ('prediction', 'vaccine', 'exemption'),\n", " ('vaccine', 'exemption', 'arizona'),\n", " ('exemption', 'arizona', 'will'),\n", " ('arizona', 'will', 'end'),\n", " ('will', 
'end', 'soon'),\n", " ('end', 'soon', 'much'),\n", " ('soon', 'much', 'money'),\n", " ('much', 'money', 'lost'),\n", " ('money', 'lost', 'big'),\n", " ('lost', 'big', 'pharma'),\n", " ('big', 'pharma', 'getting'),\n", " ('pharma', 'getting', 'vaccines'),\n", " ('getting', 'vaccines', 'cheryl'),\n", " ('vaccines', 'cheryl', 'southern'),\n", " ('cheryl', 'southern', 'nevada'),\n", " ('southern', 'nevada', 'health'),\n", " ('nevada', 'health', 'district'),\n", " ('health', 'district', 'url'),\n", " ('district', 'url', '1mug'),\n", " ('url', '1mug', 'noche'),\n", " ('1mug', 'noche', 'user'),\n", " ('noche', 'user', 'mad'),\n", " ('user', 'mad', 'hatter'),\n", " ('mad', 'hatter', 'url'),\n", " ('hatter', 'url', 'got'),\n", " ('url', 'got', 'influenza'),\n", " ('got', 'influenza', 'vaccine'),\n", " ('influenza', 'vaccine', 'purdue'),\n", " ('vaccine', 'purdue', 'university'),\n", " ('purdue', 'university', 'student'),\n", " ('university', 'student', 'health'),\n", " ('student', 'health', 'push'),\n", " ('health', 'push', 'user'),\n", " ('push', 'user', 'url'),\n", " ('user', 'url', 'sb121'),\n", " ('url', 'sb121', 'enroll'),\n", " ('sb121', 'enroll', 'meningococcal'),\n", " ('enroll', 'meningococcal', 'diseasepupils'),\n", " ('meningococcal', 'diseasepupils', 'immunized'),\n", " ('diseasepupils', 'immunized', 'recommended'),\n", " ('immunized', 'recommended', 'age'),\n", " ('recommended', 'age', 'url'),\n", " ('age', 'url', 'increasing'),\n", " ('url', 'increasing', 'number'),\n", " ('increasing', 'number', 'parents'),\n", " ('number', 'parents', 'skip'),\n", " ('parents', 'skip', 'vaccinations'),\n", " ('skip', 'vaccinations', 'childrencolumbia'),\n", " ('vaccinations', 'childrencolumbia', 'missourian'),\n", " ('childrencolumbia', 'missourian', 'url'),\n", " ('missourian', 'url', 'user'),\n", " ('url', 'user', 'thank'),\n", " ('user', 'thank', 'standing'),\n", " ('thank', 'standing', 'ca'),\n", " ('standing', 'ca', 'parents'),\n", " ('ca', 'parents', 'children'),\n", " 
('parents', 'children', 'support'),\n", " ('children', 'support', 'user'),\n", " ('support', 'user', 'dude'),\n", " ('user', 'dude', 'idc'),\n", " ('dude', 'idc', 'disney'),\n", " ('idc', 'disney', 'land'),\n", " ('disney', 'land', 'measles'),\n", " ('land', 'measles', 'means'),\n", " ('measles', 'means', 'shorter'),\n", " ('means', 'shorter', 'lines'),\n", " ('shorter', 'lines', 'rides'),\n", " ('lines', 'rides', 'im'),\n", " ('rides', 'im', 'driving'),\n", " ('im', 'driving', 'rn'),\n", " ('driving', 'rn', 'beeftalk'),\n", " ('rn', 'beeftalk', 'start'),\n", " ('beeftalk', 'start', 'calf'),\n", " ('start', 'calf', 'vaccinations'),\n", " ('calf', 'vaccinations', 'now'),\n", " ('vaccinations', 'now', 'url'),\n", " ('now', 'url', 'via'),\n", " ('url', 'via', 'user'),\n", " ('via', 'user', 'good'),\n", " ('user', 'good', 'article'),\n", " ('good', 'article', 'herd'),\n", " ('article', 'herd', 'health'),\n", " ('herd', 'health', 'dont'),\n", " ('health', 'dont', 'care'),\n", " ('dont', 'care', 'user'),\n", " ('care', 'user', 'says'),\n", " ('user', 'says', 'probably'),\n", " ('says', 'probably', 'kids'),\n", " ('probably', 'kids', 'vaccinated'),\n", " ('kids', 'vaccinated', 'small'),\n", " ('vaccinated', 'small', 'study'),\n", " ('small', 'study', 'shows'),\n", " ('study', 'shows', '10'),\n", " ('shows', '10', 'pts'),\n", " ('10', 'pts', 'lupus'),\n", " ('pts', 'lupus', 'shingles'),\n", " ('lupus', 'shingles', 'vaccine'),\n", " ('shingles', 'vaccine', 'safe'),\n", " ('vaccine', 'safe', 'needs'),\n", " ('safe', 'needs', 'confirmed'),\n", " ('needs', 'confirmed', 'larger'),\n", " ('confirmed', 'larger', 'study'),\n", " ('larger', 'study', 'url'),\n", " ('study', 'url', 'cdc'),\n", " ('url', 'cdc', 'measles'),\n", " ('cdc', 'measles', 'epidemic'),\n", " ('measles', 'epidemic', 'poses'),\n", " ('epidemic', 'poses', 'travel'),\n", " ('poses', 'travel', 'risks'),\n", " ('travel', 'risks', 'usatlyixdtv1'),\n", " ('risks', 'usatlyixdtv1', 'yes'),\n", " ('usatlyixdtv1', 'yes', 
'true'),\n", " ('yes', 'true', 'zombie'),\n", " ('true', 'zombie', 'epidemic'),\n", " ('zombie', 'epidemic', 'closing'),\n", " ('epidemic', 'closing', 'us'),\n", " ('closing', 'us', 'every'),\n", " ('us', 'every', 'time'),\n", " ('every', 'time', 'see'),\n", " ('time', 'see', 'vaccinate'),\n", " ('see', 'vaccinate', 'debate'),\n", " ('vaccinate', 'debate', 'wonder'),\n", " ('debate', 'wonder', 'one'),\n", " ('wonder', 'one', 'side'),\n", " ('one', 'side', 'argument'),\n", " ('side', 'argument', 'even'),\n", " ('argument', 'even', 'children'),\n", " ('even', 'children', 'user'),\n", " ('children', 'user', 'lied'),\n", " ('user', 'lied', 'hid'),\n", " ('lied', 'hid', 'data'),\n", " ('hid', 'data', 'black'),\n", " ('data', 'black', 'boys'),\n", " ('black', 'boys', '340'),\n", " ('boys', '340', 'uncreased'),\n", " ('340', 'uncreased', 'risk'),\n", " ('uncreased', 'risk', 'developing'),\n", " ('risk', 'developing', 'autism'),\n", " ('developing', 'autism', 'mmr'),\n", " ('autism', 'mmr', 'help'),\n", " ('mmr', 'help', 'user'),\n", " ('help', 'user', 'vaccines'),\n", " ('user', 'vaccines', 'causing'),\n", " ('vaccines', 'causing', 'autism'),\n", " ('causing', 'autism', 'user'),\n", " ('autism', 'user', 'rarely'),\n", " ('user', 'rarely', 'see'),\n", " ('rarely', 'see', 'arguments'),\n", " ('see', 'arguments', 'vaccination'),\n", " ('arguments', 'vaccination', 'actually'),\n", " ('vaccination', 'actually', 'big'),\n", " ('actually', 'big', 'deal'),\n", " ('big', 'deal', 'im'),\n", " ('deal', 'im', 'obsessed'),\n", " ('im', 'obsessed', 'w'),\n", " ('obsessed', 'w', 'ebola'),\n", " ('w', 'ebola', 'following'),\n", " ('ebola', 'following', 'outbreak'),\n", " ('following', 'outbreak', 'worlds'),\n", " ('outbreak', 'worlds', 'deadliest'),\n", " ('worlds', 'deadliest', 'disease'),\n", " ('deadliest', 'disease', 'vaccine'),\n", " ('disease', 'vaccine', 'cure'),\n", " ('vaccine', 'cure', 'amp'),\n", " ('cure', 'amp', '90'),\n", " ('amp', '90', 'death'),\n", " ('90', 'death', 
'rate'),\n", " ('death', 'rate', 'user'),\n", " ('rate', 'user', 'joshthenewt'),\n", " ('user', 'joshthenewt', 'suck'),\n", " ('joshthenewt', 'suck', 'game'),\n", " ('suck', 'game', 'haha'),\n", " ('game', 'haha', 'well'),\n", " ('haha', 'well', 'people'),\n", " ('well', 'people', 'say'),\n", " ('people', 'say', 'dont'),\n", " ('say', 'dont', 'mmr'),\n", " ('dont', 'mmr', 'eh'),\n", " ('mmr', 'eh', 'dont'),\n", " ('eh', 'dont', 'shake'),\n", " ('dont', 'shake', 'hand'),\n", " ('shake', 'hand', 'pocahontas'),\n", " ('hand', 'pocahontas', 'thats'),\n", " ('pocahontas', 'thats', 'probably'),\n", " ('thats', 'probably', 'measles'),\n", " ('probably', 'measles', 'now'),\n", " ('measles', 'now', 'user'),\n", " ('now', 'user', 'yes'),\n", " ('user', 'yes', 'im'),\n", " ('yes', 'im', 'part'),\n", " ('im', 'part', 'public'),\n", " ('part', 'public', 'health'),\n", " ('public', 'health', 'im'),\n", " ('health', 'im', 'part'),\n", " ('im', 'part', 'nature'),\n", " ('part', 'nature', 'feed'),\n", " ('nature', 'feed', 'abandoned'),\n", " ('feed', 'abandoned', 'baby'),\n", " ('abandoned', 'baby', 'bird'),\n", " ('baby', 'bird', 'vaccinate'),\n", " ('bird', 'vaccinate', 'privileged'),\n", " ('vaccinate', 'privileged', 'kids'),\n", " ('privileged', 'kids', 'new'),\n", " ('kids', 'new', 'studies'),\n", " ('new', 'studies', 'show'),\n", " ('studies', 'show', 'vaccines'),\n", " ('show', 'vaccines', 'associated'),\n", " ('vaccines', 'associated', 'autism'),\n", " ('associated', 'autism', 'news'),\n", " ('autism', 'news', 'sky'),\n", " ('news', 'sky', 'blue'),\n", " ('sky', 'blue', 'repetitive'),\n", " ('blue', 'repetitive', 'less'),\n", " ('repetitive', 'less', 'shocked'),\n", " ('less', 'shocked', 'increase'),\n", " ('shocked', 'increase', 'asd'),\n", " ('increase', 'asd', 'even'),\n", " ('asd', 'even', 'genetically'),\n", " ('even', 'genetically', 'high'),\n", " ('genetically', 'high', 'risk'),\n", " ('high', 'risk', 'url'),\n", " ('risk', 'url', 'glad'),\n", " ('url', 'glad', 
'got'),\n", " ('glad', 'got', 'vaccinated'),\n", " ('got', 'vaccinated', 'user'),\n", " ('vaccinated', 'user', 'health'),\n", " ('user', 'health', 'alert'),\n", " ('health', 'alert', 'case'),\n", " ('alert', 'case', 'meningitis'),\n", " ('case', 'meningitis', 'reported'),\n", " ('meningitis', 'reported', 'monmouth'),\n", " ('reported', 'monmouth', 'university'),\n", " ('monmouth', 'university', 'url'),\n", " ('university', 'url', 'look'),\n", " ('url', 'look', 'got'),\n", " ('look', 'got', 'measles'),\n", " ('got', 'measles', 'user'),\n", " ('measles', 'user', 'user'),\n", " ('user', 'user', 'defense'),\n", " ('user', 'defense', 'vaccines'),\n", " ('defense', 'vaccines', 'harmful'),\n", " ('vaccines', 'harmful', 'american'),\n", " ('harmful', 'american', 'life'),\n", " ('american', 'life', 'unhealthy'),\n", " ('life', 'unhealthy', 'hello'),\n", " ('unhealthy', 'hello', 'ranked'),\n", " ('hello', 'ranked', 'reset'),\n", " ('ranked', 'reset', 'probably'),\n", " ('reset', 'probably', 'bad'),\n", " ('probably', 'bad', 'placements'),\n", " ('bad', 'placements', 'mmr'),\n", " ('placements', 'mmr', 'went'),\n", " ('mmr', 'went', 'went'),\n", " ('went', 'went', 'lol'),\n", " ('went', 'lol', 'amid'),\n", " ('lol', 'amid', 'measles'),\n", " ('amid', 'measles', 'outbreak'),\n", " ('measles', 'outbreak', 'vaccines'),\n", " ('outbreak', 'vaccines', 'teachers'),\n", " ('vaccines', 'teachers', 'arent'),\n", " ('teachers', 'arent', 'required'),\n", " ('arent', 'required', 'much'),\n", " ('required', 'much', 'attention'),\n", " ('much', 'attention', 'ongoing'),\n", " ('attention', 'ongoing', 'measles'),\n", " ('ongoing', 'measles', 'url'),\n", " ('measles', 'url', 'user'),\n", " ('url', 'user', 'user'),\n", " ('user', 'user', 'user'),\n", " ('user', 'user', 'user'),\n", " ('user', 'user', 'user'),\n", " ('user', 'user', 'url'),\n", " ('user', 'url', 'measles'),\n", " ('url', 'measles', 'threat'),\n", " ('measles', 'threat', 'improve'),\n", " ('threat', 'improve', 'mood'),\n", " 
('improve', 'mood', 'energy'),\n", " ('mood', 'energy', 'immunity'),\n", " ('energy', 'immunity', 'cardio'),\n", " ('immunity', 'cardio', 'health'),\n", " ('cardio', 'health', '15'),\n", " ('health', '15', 'kit'),\n", " ('15', 'kit', 'profit'),\n", " ('kit', 'profit', 'user'),\n", " ('profit', 'user', 'thru'),\n", " ('user', 'thru', '1111'),\n", " ('thru', '1111', 'url'),\n", " ('1111', 'url', 'mt'),\n", " ('url', 'mt', 'user'),\n", " ('mt', 'user', 'new'),\n", " ('user', 'new', 'bill'),\n", " ('new', 'bill', 'end'),\n", " ('bill', 'end', 'exemptions'),\n", " ('end', 'exemptions', '4personal'),\n", " ('exemptions', '4personal', 'religious'),\n", " ('4personal', 'religious', 'url'),\n", " ('religious', 'url', 'via'),\n", " ('url', 'via', 'user'),\n", " ('via', 'user', 'url'),\n", " ('user', 'url', 'autism'),\n", " ('url', 'autism', 'immunizations'),\n", " ('autism', 'immunizations', 'vaccinate'),\n", " ('immunizations', 'vaccinate', 'url'),\n", " ('vaccinate', 'url', 'bart'),\n", " ('url', 'bart', 'riders'),\n", " ('bart', 'riders', 'warned'),\n", " ('riders', 'warned', 'measles'),\n", " ('warned', 'measles', 'infection'),\n", " ('measles', 'infection', 'contagious'),\n", " ('infection', 'contagious', 'passenger'),\n", " ('contagious', 'passenger', 'oakland'),\n", " ('passenger', 'oakland', 'cbs'),\n", " ('oakland', 'cbs', 'sf'),\n", " ('cbs', 'sf', 'bay'),\n", " ('sf', 'bay', 'area'),\n", " ('bay', 'area', 'rapid'),\n", " ('area', 'rapid', 'transit'),\n", " ('rapid', 'transit', 'url'),\n", " ('transit', 'url', 'user'),\n", " ('url', 'user', 'us'),\n", " ('user', 'us', 'cases'),\n", " ('us', 'cases', 'hit'),\n", " ('cases', 'hit', '15year'),\n", " ('hit', '15year', 'high'),\n", " ('15year', 'high', 'far'),\n", " ('high', 'far', 'year'),\n", " ('far', 'year', '118'),\n", " ('year', '118', 'cases'),\n", " ('118', 'cases', 'measles'),\n", " ('cases', 'measles', 'reported'),\n", " ('measles', 'reported', 'unit'),\n", " ('reported', 'unit', 'url'),\n", " ('unit', 'url', 
'pull'),\n", " ('url', 'pull', 'myxx'),\n", " ('pull', 'myxx', 'nightlife'),\n", " ('myxx', 'nightlife', 'stint'),\n", " ('nightlife', 'stint', 't'),\n", " ('stint', 't', 'performing'),\n", " ('t', 'performing', 'live'),\n", " ('performing', 'live', 'hosted'),\n", " ('live', 'hosted', 'user'),\n", " ('hosted', 'user', 'url'),\n", " ('user', 'url', 'cdc'),\n", " ('url', 'cdc', 'eyeing'),\n", " ('cdc', 'eyeing', 'bird'),\n", " ('eyeing', 'bird', 'flu'),\n", " ('bird', 'flu', 'vaccine'),\n", " ('flu', 'vaccine', 'humans'),\n", " ('vaccine', 'humans', 'though'),\n", " ('humans', 'though', 'risk'),\n", " ('though', 'risk', 'low'),\n", " ('risk', 'low', 'federal'),\n", " ('low', 'federal', 'officials'),\n", " ('federal', 'officials', 'said'),\n", " ('officials', 'said', 'wednesday'),\n", " ('said', 'wednesday', 'theyre'),\n", " ('wednesday', 'theyre', 'taking'),\n", " ('theyre', 'taking', 'steps'),\n", " ('taking', 'steps', 'url'),\n", " ('steps', 'url', 'involved'),\n", " ('url', 'involved', 'fight'),\n", " ('involved', 'fight', 'preventable'),\n", " ('fight', 'preventable', 'diseases'),\n", " ('preventable', 'diseases', 'tenure'),\n", " ('diseases', 'tenure', 'track'),\n", " ('tenure', 'track', 'opening'),\n", " ('track', 'opening', 'user'),\n", " ('opening', 'user', 'url'),\n", " ('user', 'url', 'user'),\n", " ('url', 'user', 'im'),\n", " ('user', 'im', 'vaccinating'),\n", " ('im', 'vaccinating', 'kids'),\n", " ('vaccinating', 'kids', 'lol'),\n", " ('kids', 'lol', 'alleged'),\n", " ('lol', 'alleged', 'victim'),\n", " ('alleged', 'victim', 'reviewing'),\n", " ('victim', 'reviewing', 'immunity'),\n", " ('reviewing', 'immunity', 'paperwork'),\n", " ('immunity', 'paperwork', 'resume'),\n", " ('paperwork', 'resume', 'testimony'),\n", " ('resume', 'testimony', 'wo'),\n", " ('testimony', 'wo', 'fear'),\n", " ('wo', 'fear', 'blackmail'),\n", " ('fear', 'blackmail', 'prosecution'),\n", " ('blackmail', 'prosecution', 'thanksuser'),\n", " ('prosecution', 'thanksuser', 
'crystal'),\n", " ('thanksuser', 'crystal', 'clear'),\n", " ('crystal', 'clear', 'lack'),\n", " ('clear', 'lack', 'evidence'),\n", " ('lack', 'evidence', 'linking'),\n", " ('evidence', 'linking', 'mmr'),\n", " ('linking', 'mmr', 'autism'),\n", " ('mmr', 'autism', 'noticed'),\n", " ('autism', 'noticed', 'user'),\n", " ('noticed', 'user', 'giving'),\n", " ('user', 'giving', 'dvds'),\n", " ('giving', 'dvds', 'vaccinating'),\n", " ('dvds', 'vaccinating', 'children'),\n", " ('vaccinating', 'children', 'bad'),\n", " ('children', 'bad', 'hey'),\n", " ('bad', 'hey', 'nowuser'),\n", " ('hey', 'nowuser', 'mixmasterrods'),\n", " ('nowuser', 'mixmasterrods', 'upstairs'),\n", " ('mixmasterrods', 'upstairs', 'lounge'),\n", " ('upstairs', 'lounge', 'url'),\n", " ('lounge', 'url', 'seriously'),\n", " ('url', 'seriously', 'dont'),\n", " ('seriously', 'dont', 'want'),\n", " ('dont', 'want', 'child'),\n", " ('want', 'child', 'antivaccination'),\n", " ('child', 'antivaccination', 'movement'),\n", " ('antivaccination', 'movement', 'ended'),\n", " ('movement', 'ended', 'terrifying'),\n", " ('ended', 'terrifying', 'user'),\n", " ('terrifying', 'user', 'user'),\n", " ('user', 'user', 'user'),\n", " ('user', 'user', 'epidemic'),\n", " ('user', 'epidemic', 'enterovirus'),\n", " ('epidemic', 'enterovirus', 'mumps'),\n", " ('enterovirus', 'mumps', 'now'),\n", " ('mumps', 'now', 'measles'),\n", " ('now', 'measles', 'drug'),\n", " ('measles', 'drug', 'resistant'),\n", " ('drug', 'resistant', 'tb'),\n", " ('resistant', 'tb', 'next'),\n", " ('tb', 'next', 'said'),\n", " ('next', 'said', 'wasnt'),\n", " ('said', 'wasnt', 'gone'),\n", " ('wasnt', 'gone', 'shit'),\n", " ('gone', 'shit', 'lol'),\n", " ('shit', 'lol', 'looked'),\n", " ('lol', 'looked', 'proved'),\n", " ('looked', 'proved', 'wrong'),\n", " ('proved', 'wrong', 'baby'),\n", " ('wrong', 'baby', 'url'),\n", " ('baby', 'url', 'flu'),\n", " ('url', 'flu', 'shots'),\n", " ('flu', 'shots', 'school'),\n", " ('shots', 'school', 'boost'),\n", " 
('school', 'boost', 'vaccination'),\n", " ('boost', 'vaccination', 'rates'),\n", " ('vaccination', 'rates', 'offering'),\n", " ('rates', 'offering', 'flu'),\n", " ('offering', 'flu', 'shots'),\n", " ('flu', 'shots', 'elementary'),\n", " ('shots', 'elementary', 'schools'),\n", " ('elementary', 'schools', 'reduce'),\n", " ('schools', 'reduce', 'number'),\n", " ('reduce', 'number', 'url'),\n", " ('number', 'url', 'user'),\n", " ('url', 'user', '1'),\n", " ('user', '1', 'marin'),\n", " ('1', 'marin', 'county'),\n", " ('marin', 'county', 'school'),\n", " ('county', 'school', 'board'),\n", " ('school', 'board', 'sides'),\n", " ('board', 'sides', 'young'),\n", " ('sides', 'young', 'leukemia'),\n", " ('young', 'leukemia', 'patient'),\n", " ('leukemia', 'patient', 'vaccinations'),\n", " ('patient', 'vaccinations', 'url'),\n", " ('vaccinations', 'url', 'still'),\n", " ('url', 'still', 'running'),\n", " ('still', 'running', 'niggas'),\n", " ('running', 'niggas', 'til'),\n", " ('niggas', 'til', 'death'),\n", " ('til', 'death', 'dibiasimb'),\n", " ('death', 'dibiasimb', 'user'),\n", " ('dibiasimb', 'user', 'url'),\n", " ('user', 'url', 'user'),\n", " ('url', 'user', 'yeah'),\n", " ('user', 'yeah', 'ill'),\n", " ('yeah', 'ill', 'stick'),\n", " ('ill', 'stick', 'regular'),\n", " ('stick', 'regular', 'vaccines'),\n", " ('regular', 'vaccines', 'oh'),\n", " ('vaccines', 'oh', 'wait'),\n", " ('oh', 'wait', 'gives'),\n", " ('wait', 'gives', 'autism'),\n", " ('gives', 'autism', 'user'),\n", " ('autism', 'user', 'antivaccine'),\n", " ('user', 'antivaccine', 'people'),\n", " ('antivaccine', 'people', 'want'),\n", " ('people', 'want', 'control'),\n", " ('want', 'control', 'people'),\n", " ('control', 'people', 'fear'),\n", " ('people', 'fear', 'weaken'),\n", " ('fear', 'weaken', 'herd'),\n", " ('weaken', 'herd', 'immunityso'),\n", " ('herd', 'immunityso', 'anyone'),\n", " ('immunityso', 'anyone', 'knows'),\n", " ('anyone', 'knows', 'mind'),\n", " ('knows', 'mind', 'control'),\n", " 
('mind', 'control', 'important'),\n", " ('control', 'important', 'user'),\n", " ('important', 'user', 'measles'),\n", " ('user', 'measles', 'update'),\n", " ('measles', 'update', 'user'),\n", " ('update', 'user', 'says'),\n", " ('user', 'says', 'working'),\n", " ('says', 'working', 'w'),\n", " ('working', 'w', 'user'),\n", " ('w', 'user', 'url'),\n", " ('user', 'url', 'dont'),\n", " ('url', 'dont', 'think'),\n", " ('dont', 'think', 'will'),\n", " ('think', 'will', 'understand'),\n", " ('will', 'understand', 'peoples'),\n", " ('understand', 'peoples', 'reasons'),\n", " ('peoples', 'reasons', 'getting'),\n", " ('reasons', 'getting', 'vaccinated'),\n", " ('getting', 'vaccinated', 'vaccinating'),\n", " ('vaccinated', 'vaccinating', 'kids'),\n", " ('vaccinating', 'kids', 'explain'),\n", " ('kids', 'explain', 'cia'),\n", " ('explain', 'cia', 'vaccination'),\n", " ('cia', 'vaccination', 'campaigns'),\n", " ('vaccination', 'campaigns', 'spy'),\n", " ('campaigns', 'spy', 'ops'),\n", " ('spy', 'ops', 'url'),\n", " ('ops', 'url', 'via'),\n", " ('url', 'via', 'user'),\n", " ('via', 'user', 'harm'),\n", " ('user', 'harm', 'protected'),\n", " ('harm', 'protected', 'medic'),\n", " ('protected', 'medic', 'status'),\n", " ('medic', 'status', 'ruse'),\n", " ('status', 'ruse', 'will'),\n", " ('ruse', 'will', 'linger'),\n", " ('will', 'linger', 'measles'),\n", " ('linger', 'measles', 'outbreak'),\n", " ('measles', 'outbreak', 'prompts'),\n", " ('outbreak', 'prompts', 'vaccination'),\n", " ('prompts', 'vaccination', 'debate'),\n", " ('vaccination', 'debate', 'political'),\n", " ('debate', 'political', 'debate'),\n", " ('political', 'debate', 'rages'),\n", " ('debate', 'rages', 'measles'),\n", " ('rages', 'measles', 'outbreak'),\n", " ('measles', 'outbreak', 'spreads'),\n", " ('outbreak', 'spreads', 'url'),\n", " ('spreads', 'url', 'people'),\n", " ('url', 'people', 'need'),\n", " ('people', 'need', 'children'),\n", " ('need', 'children', 'vaccinated'),\n", " ('children', 'vaccinated', 
'bad'),\n", " ('vaccinated', 'bad', 'diseases'),\n", " ('bad', 'diseases', 'coming'),\n", " ('diseases', 'coming', 'back'),\n", " ('coming', 'back', 'vaccine'),\n", " ('back', 'vaccine', 'brain'),\n", " ('vaccine', 'brain', 'damage'),\n", " ('brain', 'damage', 'cover'),\n", " ('damage', 'cover', 'implodes'),\n", " ('cover', 'implodes', 'url'),\n", " ('implodes', 'url', 'user'),\n", " ('url', 'user', 'user'),\n", " ('user', 'user', 'discussing'),\n", " ('user', 'discussing', 'user'),\n", " ('discussing', 'user', 'vaccines'),\n", " ('user', 'vaccines', 'childrens'),\n", " ('vaccines', 'childrens', 'clinic'),\n", " ('childrens', 'clinic', 'url'),\n", " ('clinic', 'url', 'know'),\n", " ('url', 'know', 'infected'),\n", " ('know', 'infected', 'measles'),\n", " ('infected', 'measles', 'airborne'),\n", " ('measles', 'airborne', 'transmission'),\n", " ('airborne', 'transmission', 'wtf'),\n", " ('transmission', 'wtf', 'access'),\n", " ('wtf', 'access', 'fitchburg'),\n", " ('access', 'fitchburg', 'school'),\n", " ('fitchburg', 'school', 'limited'),\n", " ('school', 'limited', 'amid'),\n", " ('limited', 'amid', 'measles'),\n", " ('amid', 'measles', 'scare'),\n", " ('measles', 'scare', 'url'),\n", " ('scare', 'url', 'read'),\n", " ('url', 'read', 'neighbors'),\n", " ('read', 'neighbors', 'user'),\n", " ('neighbors', 'user', 'user'),\n", " ('user', 'user', 'deadly'),\n", " ('user', 'deadly', 'kids'),\n", " ('deadly', 'kids', 'allergies'),\n", " ('kids', 'allergies', 'joke'),\n", " ('allergies', 'joke', 'deadly'),\n", " ('joke', 'deadly', 'child'),\n", " ('deadly', 'child', 'measles'),\n", " ('child', 'measles', 'vaccinate'),\n", " ('measles', 'vaccinate', 'every1'),\n", " ('vaccinate', 'every1', 'measles'),\n", " ('every1', 'measles', '717'),\n", " ('measles', '717', 'user'),\n", " ('717', 'user', 'topic'),\n", " ('user', 'topic', 'makes'),\n", " ('topic', 'makes', 'scared'),\n", " ('makes', 'scared', 'bring'),\n", " ('scared', 'bring', 'kids'),\n", " ('bring', 'kids', 
'modern'),\n", " ('kids', 'modern', 'society'),\n", " ('modern', 'society', 'time'),\n", " ('society', 'time', 'start'),\n", " ('time', 'start', 'looking'),\n", " ('start', 'looking', 'private'),\n", " ('looking', 'private', 'schools'),\n", " ('private', 'schools', 'mandate'),\n", " ('schools', 'mandate', 'vaccines'),\n", " ('mandate', 'vaccines', 'user'),\n", " ('vaccines', 'user', 'stop'),\n", " ('user', 'stop', 'blaming'),\n", " ('stop', 'blaming', 'first'),\n", " ('blaming', 'first', 'now'),\n", " ('first', 'now', 'diseasesbrought'),\n", " ('now', 'diseasesbrought', 'ancestors'),\n", " ('diseasesbrought', 'ancestors', 'url'),\n", " ('ancestors', 'url', 'already'),\n", " ('url', 'already', 'cure'),\n", " ('already', 'cure', 'measles'),\n", " ('cure', 'measles', 'user'),\n", " ('measles', 'user', 'user'),\n", " ('user', 'user', 'cant'),\n", " ('user', 'cant', 'attend'),\n", " ('cant', 'attend', 'school'),\n", " ('attend', 'school', 'unless'),\n", " ('school', 'unless', 'vaccinated'),\n", " ('unless', 'vaccinated', 'dont'),\n", " ('vaccinated', 'dont', 'vaccinate'),\n", " ('dont', 'vaccinate', 'children'),\n", " ('vaccinate', 'children', 'shitty'),\n", " ('children', 'shitty', 'parent'),\n", " ('shitty', 'parent', 'end'),\n", " ('parent', 'end', 'story'),\n", " ('end', 'story', 'citizen'),\n", " ('story', 'citizen', 'kingdom'),\n", " ('citizen', 'kingdom', 'god'),\n", " ('kingdom', 'god', 'assignment'),\n", " ('god', 'assignment', 'diplomatic'),\n", " ('assignment', 'diplomatic', 'immunity'),\n", " ('diplomatic', 'immunity', 'fear'),\n", " ('immunity', 'fear', 'man'),\n", " ('fear', 'man', 'top'),\n", " ('man', 'top', 'colorado'),\n", " ('top', 'colorado', 'doctor'),\n", " ('colorado', 'doctor', 'says'),\n", " ('doctor', 'says', 'adults'),\n", " ('says', 'adults', 'worried'),\n", " ('adults', 'worried', 'measles'),\n", " ('worried', 'measles', 'vaccination'),\n", " ('measles', 'vaccination', 'measles'),\n", " ('vaccination', 'measles', 'outbreak'),\n", " 
('measles', 'outbreak', 'url'),\n", " ('outbreak', 'url', 'user'),\n", " ('url', 'user', 'nah'),\n", " ('user', 'nah', 'idiot'),\n", " ('nah', 'idiot', 'someone'),\n", " ('idiot', 'someone', 'doesnt'),\n", " ('someone', 'doesnt', 'allow'),\n", " ('doesnt', 'allow', 'kids'),\n", " ('allow', 'kids', 'vaccinated'),\n", " ('kids', 'vaccinated', 'kids'),\n", " ('vaccinated', 'kids', 'need'),\n", " ('kids', 'need', 'vaccinations'),\n", " ('need', 'vaccinations', 'late'),\n", " ('vaccinations', 'late', '19thcentury'),\n", " ('late', '19thcentury', 'maps'),\n", " ('19thcentury', 'maps', 'show'),\n", " ('maps', 'show', 'measles'),\n", " ('show', 'measles', 'mortality'),\n", " ('measles', 'mortality', 'vaccines'),\n", " ('mortality', 'vaccines', 'url'),\n", " ('vaccines', 'url', 'url'),\n", " ('url', 'url', 'measles'),\n", " ('url', 'measles', 'fears'),\n", " ('measles', 'fears', 'spread'),\n", " ('fears', 'spread', 'nj'),\n", " ('spread', 'nj', 'amid'),\n", " ('nj', 'amid', 'suspected'),\n", " ('amid', 'suspected', 'case'),\n", " ('suspected', 'case', 'url'),\n", " ('case', 'url', 'baby'),\n", " ('url', 'baby', 'vaccination'),\n", " ('baby', 'vaccination', 'day'),\n", " ('vaccination', 'day', 'shes'),\n", " ('day', 'shes', 'getting'),\n", " ('shes', 'getting', 'measles'),\n", " ('getting', 'measles', 'one'),\n", " ('measles', 'one', 'early'),\n", " ('one', 'early', 'fuckwits'),\n", " ('early', 'fuckwits', 'chicago'),\n", " ('fuckwits', 'chicago', 'dont'),\n", " ('chicago', 'dont', 'vaccinate'),\n", " ('dont', 'vaccinate', 'ensuring'),\n", " ('vaccinate', 'ensuring', 'problem'),\n", " ('ensuring', 'problem', 'loving'),\n", " ('problem', 'loving', 'measles'),\n", " ('loving', 'measles', 'poster'),\n", " ('measles', 'poster', 'making'),\n", " ('poster', 'making', 'spirit'),\n", " ('making', 'spirit', 'today'),\n", " ('spirit', 'today', 'url'),\n", " ('today', 'url', 'baseball'),\n", " ('url', 'baseball', 'coach'),\n", " ('baseball', 'coach', 'santa'),\n", " ('coach', 'santa', 
'monica'),\n", " ('santa', 'monica', 'high'),\n", " ('monica', 'high', 'contracts'),\n", " ('high', 'contracts', 'measles'),\n", " ('contracts', 'measles', 'url'),\n", " ('measles', 'url', 'im'),\n", " ('url', 'im', 'sick'),\n", " ('im', 'sick', 'hearing'),\n", " ('sick', 'hearing', 'people'),\n", " ('hearing', 'people', 'vaccinating'),\n", " ('people', 'vaccinating', 'children'),\n", " ('vaccinating', 'children', 'sick'),\n", " ('children', 'sick', 'hearing'),\n", " ('sick', 'hearing', 'peoples'),\n", " ('hearing', 'peoples', 'freedom'),\n", " ('peoples', 'freedom', 'insane'),\n", " ('freedom', 'insane', 'dangerous'),\n", " ('insane', 'dangerous', 'things'),\n", " ('dangerous', 'things', 'one'),\n", " ('things', 'one', 'risky'),\n", " ('one', 'risky', 'childrena'),\n", " ('risky', 'childrena', 'vaccination'),\n", " ('childrena', 'vaccination', 'going'),\n", " ('vaccination', 'going', 'school'),\n", " ('going', 'school', 'car'),\n", " ('school', 'car', 'odds'),\n", " ('car', 'odds', 'kid'),\n", " ('odds', 'kid', 'measles'),\n", " ('kid', 'measles', 'uses'),\n", " ('measles', 'uses', 'lab'),\n", " ('uses', 'lab', 'room'),\n", " ('lab', 'room', 'user'),\n", " ('room', 'user', 'user'),\n", " ('user', 'user', 'parents'),\n", " ('user', 'parents', 'refuse'),\n", " ('parents', 'refuse', 'kids'),\n", " ('refuse', 'kids', 'immunized'),\n", " ('kids', 'immunized', 'charged'),\n", " ('immunized', 'charged', 'reckless'),\n", " ('charged', 'reckless', 'endangerment'),\n", " ('reckless', 'endangerment', 'kid'),\n", " ('endangerment', 'kid', 'gets'),\n", " ('kid', 'gets', 'another'),\n", " ('gets', 'another', 'sick'),\n", " ('another', 'sick', 'rt'),\n", " ('sick', 'rt', 'user'),\n", " ('rt', 'user', 'vaccinations'),\n", " ('user', 'vaccinations', 'measles'),\n", " ('vaccinations', 'measles', 'medicaid'),\n", " ('measles', 'medicaid', 'expansion'),\n", " ('medicaid', 'expansion', 'url'),\n", " ('expansion', 'url', 'user'),\n", " ('url', 'user', 'centers'),\n", " ('user', 
'centers', 'disease'),\n", " ('centers', 'disease', 'control'),\n", " ('disease', 'control', 'years'),\n", " ('control', 'years', 'flu'),\n", " ('years', 'flu', 'vaccine'),\n", " ('flu', 'vaccine', 'doesnt'),\n", " ('vaccine', 'doesnt', 'work'),\n", " ('doesnt', 'work', 'url'),\n", " ('work', 'url', 'user'),\n", " ('url', 'user', 'ty'),\n", " ('user', 'ty', 'fought'),\n", " ('ty', 'fought', 'hard'),\n", " ('fought', 'hard', '2'),\n", " ('hard', '2', 'vaccinate'),\n", " ('2', 'vaccinate', 'kids'),\n", " ('vaccinate', 'kids', 'yrs'),\n", " ('kids', 'yrs', 'ppls'),\n", " ('yrs', 'ppls', 'judgements'),\n", " ('ppls', 'judgements', 'dont'),\n", " ('judgements', 'dont', 'hurt'),\n", " ('dont', 'hurt', 'smart'),\n", " ('hurt', 'smart', 'vaccinate'),\n", " ('smart', 'vaccinate', 'people'),\n", " ('vaccinate', 'people', 'user'),\n", " ('people', 'user', '15'),\n", " ('user', '15', 'tarrant'),\n", " ('15', 'tarrant', 'county'),\n", " ('tarrant', 'county', 'cases'),\n", " ('county', 'cases', 'five'),\n", " ('cases', 'five', 'denton'),\n", " ('five', 'denton', 'co'),\n", " ('denton', 'co', 'traced'),\n", " ('co', 'traced', 'megachurch'),\n", " ('traced', 'megachurch', 'url'),\n", " ('megachurch', 'url', 'hb1251'),\n", " ('url', 'hb1251', 'new'),\n", " ('hb1251', 'new', 'providing'),\n", " ('new', 'providing', 'health'),\n", " ('providing', 'health', 'insurance'),\n", " ('health', 'insurance', 'coverage'),\n", " ('insurance', 'coverage', 'immunizations'),\n", " ('coverage', 'immunizations', 'administered'),\n", " ('immunizations', 'administered', 'pharmacy'),\n", " ('administered', 'pharmacy', 'pharmacist'),\n", " ('pharmacy', 'pharmacist', 'url'),\n", " ('pharmacist', 'url', 'a4077'),\n", " ('url', 'a4077', 'new'),\n", " ('a4077', 'new', 'revises'),\n", " ('new', 'revises', 'codifies'),\n", " ('revises', 'codifies', 'schedule'),\n", " ('codifies', 'schedule', 'childhood'),\n", " ('schedule', 'childhood', 'lead'),\n", " ('childhood', 'lead', 'screening'),\n", " ('lead', 
'screening', 'along'),\n", " ('screening', 'along', 'immunization'),\n", " ('along', 'immunization', 'wellness'),\n", " ('immunization', 'wellness', 'url'),\n", " ('wellness', 'url', 'fridays'),\n", " ('url', 'fridays', 'immunity'),\n", " ('fridays', 'immunity', 'challenge'),\n", " ('immunity', 'challenge', 'walking'),\n", " ('challenge', 'walking', 'backwards'),\n", " ('walking', 'backwards', 'safe'),\n", " ('backwards', 'safe', 'time'),\n", " ('safe', 'time', 'sure'),\n", " ('time', 'sure', 'watch'),\n", " ('sure', 'watch', 'back'),\n", " ('watch', 'back', '70s'),\n", " ('back', '70s', 'healthnut'),\n", " ('70s', 'healthnut', 'parents'),\n", " ('healthnut', 'parents', 'didnt'),\n", " ('parents', 'didnt', 'vaccinate'),\n", " ('didnt', 'vaccinate', 'childhood'),\n", " ('vaccinate', 'childhood', 'url'),\n", " ('childhood', 'url', 'user'),\n", " ('url', 'user', 'people'),\n", " ('user', 'people', 'complain'),\n", " ('people', 'complain', 'live'),\n", " ('complain', 'live', 'longer'),\n", " ('live', 'longer', 'releasing'),\n", " ('longer', 'releasing', 'tension'),\n", " ('releasing', 'tension', 'increases'),\n", " ('tension', 'increases', 'immunity'),\n", " ('increases', 'immunity', 'improves'),\n", " ('immunity', 'improves', 'overall'),\n", " ('improves', 'overall', 'health'),\n", " ('overall', 'health', 'well'),\n", " ('health', 'well', 'good'),\n", " ('well', 'good', 'news'),\n", " ('good', 'news', 'austerity'),\n", " ('news', 'austerity', 'vaccine'),\n", " ('austerity', 'vaccine', 'crisis'),\n", " ('vaccine', 'crisis', 'parasite'),\n", " ('crisis', 'parasite', 'pandemic'),\n", " ('parasite', 'pandemic', 'potential'),\n", " ('pandemic', 'potential', 'drains'),\n", " ('potential', 'drains', 'life'),\n", " ('drains', 'life', 'killing'),\n", " ('life', 'killing', 'slowly'),\n", " ('killing', 'slowly', 'user'),\n", " ('slowly', 'user', 'compensation'),\n", " ('user', 'compensation', 'autism'),\n", " ('compensation', 'autism', 'brain'),\n", " ('autism', 'brain', 
'damage'),\n", " ('brain', 'damage', 'illnesses'),\n", " ('damage', 'illnesses', 'vaccine'),\n", " ('illnesses', 'vaccine', 'court'),\n", " ('vaccine', 'court', 'via'),\n", " ('court', 'via', 'user'),\n", " ('via', 'user', 'url'),\n", " ('user', 'url', 'seek'),\n", " ('url', 'seek', 'bear'),\n", " ('seek', 'bear', 'w'),\n", " ('bear', 'w', 'see'),\n", " ('w', 'see', 'state'),\n", " ('see', 'state', 'ppls'),\n", " ('state', 'ppls', 'hearts'),\n", " ('ppls', 'hearts', 'url'),\n", " ('hearts', 'url', 'user'),\n", " ('url', 'user', 'conservative'),\n", " ('user', 'conservative', 'neurosurgeon'),\n", " ('conservative', 'neurosurgeon', 'ben'),\n", " ('neurosurgeon', 'ben', 'carson'),\n", " ('ben', 'carson', 'says'),\n", " ('carson', 'says', 'vaccines'),\n", " ('says', 'vaccines', 'public'),\n", " ('vaccines', 'public', 'health'),\n", " ...]" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trigrams= list(nltk.trigrams(tokens))\n", "\n", "trigrams" ] }, { "cell_type": "markdown", "metadata": { "id": "AI_-EgBG2yp6" }, "source": [ "#### Approach\n", "- We will get rid of numbers\n", "- We will get rid of the two filler words `user` and `url`, which appear in many of the trigrams above" ] }, { "cell_type": "markdown", "metadata": { "id": "VXCO41CaOBBA" }, "source": [ "##### a. 
Number Remover" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IBRojV_AN3Bc" }, "outputs": [], "source": [ "def number_remover(text):\n", "    \"\"\"Return `text` with all digits removed and whitespace collapsed.\n", "\n", "    Runs of decimal digits are deleted wherever they occur, including inside a\n", "    word ('15year' becomes 'year'); the remaining words are re-joined with\n", "    single spaces.\n", "    \"\"\"\n", "    text = re.sub(r'\\d+', '', text)\n", "    # str.isdigit() is also true for digit characters that \\d does not match\n", "    # (e.g. superscript digits), so drop tokens made up only of those too.\n", "    return \" \".join(word for word in text.split() if not word.isdigit())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SprG4IZaN4K4" }, "outputs": [], "source": [ "df_train[\"clean_tweet\"]= df_train[\"clean_tweet\"].apply(number_remover)" ] }, { "cell_type": "markdown", "metadata": { "id": "nyKmJUj_OH6p" }, "source": [ "##### b. Text remover" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3Uh2Y9IOAJao" }, "outputs": [], "source": [ "def text_remover(text, unwanted=(\"user\", \"url\")):\n", "    \"\"\"Return `text` without the whitespace-separated tokens listed in `unwanted`.\n", "\n", "    Only exact, case-sensitive token matches are dropped, so a longer word that\n", "    merely contains one of them (e.g. 'nowuser') is kept. `unwanted` is an\n", "    iterable of tokens and defaults to 'user' and 'url'.\n", "    \"\"\"\n", "    unwanted = frozenset(unwanted)  # set lookup instead of rebuilding a list per word\n", "    return \" \".join(word for word in text.split() if word not in unwanted)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Dyy5J--oAKFM" }, "outputs": [], "source": [ "df_train[\"clean_tweet\"] = df_train[\"clean_tweet\"].apply(text_remover)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qZqXvbGBkt6x" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "P1Ba6Mt3kvs-" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 285 }, "id": "B-VPb0P6DRtN", "outputId": "2ddca607-e8f9-4377-c91f-25a783ca160c" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweetslabelagreementtweet_lengthclean_tweet
0me &amp; the big homie meanboy3000 #meanboy #m...0.01.015amp big homie meanboy stegman st
1i'm 100% thinking of devoting my career to pro...1.01.025im thinking devoting career proving autism isn...
2#whatcausesautism vaccines, do not vaccinate y...-1.01.07vaccines vaccinate child
3i mean if they immunize my kid with something ...-1.01.028mean immunize kid something wont secretly kill...
4thanks to <user> catch me performing at la nui...0.01.020thanks catch performing la nuit nyc st ave sho...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweets label agreement \\\n", "0 me & the big homie meanboy3000 #meanboy #m... 0.0 1.0 \n", "1 i'm 100% thinking of devoting my career to pro... 1.0 1.0 \n", "2 #whatcausesautism vaccines, do not vaccinate y... -1.0 1.0 \n", "3 i mean if they immunize my kid with something ... -1.0 1.0 \n", "4 thanks to catch me performing at la nui... 0.0 1.0 \n", "\n", " tweet_length clean_tweet \n", "0 15 amp big homie meanboy stegman st \n", "1 25 im thinking devoting career proving autism isn... \n", "2 7 vaccines vaccinate child \n", "3 28 mean immunize kid something wont secretly kill... \n", "4 20 thanks catch performing la nuit nyc st ave sho... " ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "uHPN87kYksCg" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 677 }, "id": "Y06dwqbCOzGE", "outputId": "9e8e3b65-41b5-470e-ea9d-9ef3f49015c7" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
0measles3177
1vaccine1469
2kids1260
3vaccines1190
4health1066
5vaccinate905
6children831
7people702
8dont677
9mmr619
10vaccinated601
11outbreak590
12autism589
13immunity548
14amp535
15parents517
16vaccinations505
17child465
18school434
19vaccination430
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " word count\n", "0 measles 3177\n", "1 vaccine 1469\n", "2 kids 1260\n", "3 vaccines 1190\n", "4 health 1066\n", "5 vaccinate 905\n", "6 children 831\n", "7 people 702\n", "8 dont 677\n", "9 mmr 619\n", "10 vaccinated 601\n", "11 outbreak 590\n", "12 autism 589\n", "13 immunity 548\n", "14 amp 535\n", "15 parents 517\n", "16 vaccinations 505\n", "17 child 465\n", "18 school 434\n", "19 vaccination 430" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## now let's visualize our most frequent words\n", "word= \" \".join(df_train[\"clean_tweet\"])\n", "freq= FreqDist(word.split())\n", "top_20= pd.DataFrame(freq.most_common(20), columns= [\"word\", \"count\"])\n", "\n", "top_20" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "C28wV7W_43Tx" }, "outputs": [], "source": [ "## Visualizing my dataset again" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 542 }, "id": "zGPCskUKQXoA", "outputId": "4c7d8858-7522-4a2c-d56e-8a603e42ff36" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", "\n", "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "px.treemap(data_frame=top_20, path=[\"word\"], values= \"count\", title= \"Top 20 Most Frequent Words\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SRksLclIS27t", "outputId": "648fba20-755d-41ce-9559-2e78dc919aad" }, "outputs": [ { "data": { "text/plain": [ "[('measles', 3177),\n", " ('vaccine', 1469),\n", " ('kids', 1260),\n", " ('vaccines', 1190),\n", " ('health', 1066),\n", " ('vaccinate', 905),\n", " ('children', 831),\n", " ('people', 702),\n", " ('dont', 677),\n", " ('mmr', 619),\n", " ('vaccinated', 601),\n", " ('outbreak', 590),\n", " ('autism', 589),\n", " ('immunity', 548),\n", " ('amp', 535),\n", " ('parents', 517),\n", " ('vaccinations', 505),\n", " ('child', 465),\n", " ('school', 434),\n", " ('vaccination', 430),\n", " ('cases', 419),\n", " ('flu', 387),\n", " ('im', 361),\n", " ('disease', 359),\n", " ('new', 337),\n", " ('now', 331),\n", " ('us', 321),\n", " ('got', 302),\n", " ('officials', 284),\n", " ('will', 267),\n", " ('cause', 267),\n", " ('county', 254),\n", " ('live', 245),\n", " ('disneyland', 241),\n", " ('one', 240),\n", " ('getting', 239),\n", " ('cdc', 223),\n", " ('diseases', 220),\n", " ('immunization', 217),\n", " ('via', 216),\n", " ('know', 205),\n", " ('ebola', 204),\n", " ('rt', 203),\n", " ('vaccinating', 202),\n", " ('say', 200),\n", " ('case', 199),\n", " ('think', 199),\n", " ('california', 197),\n", " ('need', 193),\n", " ('baby', 190),\n", " ('kid', 188),\n", " ('immunizations', 188),\n", " ('go', 184),\n", " ('longer', 179),\n", " ('may', 178),\n", " ('want', 177),\n", " ('complain', 177),\n", " ('increases', 177),\n", " ('risk', 176),\n", " ('public', 176),\n", " ('safe', 172),\n", " ('tension', 172),\n", " ('study', 171),\n", " ('time', 166),\n", " ('first', 166),\n", " ('releasing', 164),\n", " ('u', 162),\n", " ('day', 157),\n", " ('confirmed', 157),\n", 
" ('good', 155),\n", " ('still', 152),\n", " ('state', 151),\n", " ('shot', 150),\n", " ('make', 148),\n", " ('back', 146),\n", " ('cant', 146),\n", " ('year', 145),\n", " ('today', 144),\n", " ('boosts', 143),\n", " ('free', 142),\n", " ('says', 139),\n", " ('shots', 138),\n", " ('average', 137),\n", " ('stop', 131),\n", " ('care', 125),\n", " ('going', 125),\n", " ('many', 124),\n", " ('science', 121),\n", " ('even', 121),\n", " ('polio', 121),\n", " ('link', 120),\n", " ('mumps', 118),\n", " ('years', 117),\n", " ('death', 117),\n", " ('babies', 117),\n", " ('right', 116),\n", " ('believe', 114),\n", " ('high', 111),\n", " ('come', 110),\n", " ('world', 109),\n", " ('thats', 109),\n", " ('doesnt', 107),\n", " ('didnt', 107),\n", " ('news', 105),\n", " ('sick', 104),\n", " ('hiv', 104),\n", " ('never', 103),\n", " ('please', 102),\n", " ('w', 101),\n", " ('youre', 101),\n", " ('unvaccinated', 101),\n", " ('see', 100),\n", " ('schools', 100),\n", " ('really', 100),\n", " ('exposed', 99),\n", " ('work', 97),\n", " ('take', 97),\n", " ('last', 96),\n", " ('bad', 95),\n", " ('department', 95),\n", " ('aids', 94),\n", " ('bill', 94),\n", " ('give', 94),\n", " ('another', 93),\n", " ('oh', 92),\n", " ('two', 92),\n", " ('cough', 92),\n", " ('well', 91),\n", " ('great', 91),\n", " ('thanks', 90),\n", " ('childhood', 90),\n", " ('linked', 90),\n", " ('hope', 89),\n", " ('disney', 89),\n", " ('exemptions', 88),\n", " ('cancer', 88),\n", " ('everyone', 87),\n", " ('virus', 87),\n", " ('said', 87),\n", " ('protect', 87),\n", " ('states', 87),\n", " ('fucking', 86),\n", " ('way', 86),\n", " ('n', 86),\n", " ('better', 86),\n", " ('life', 83),\n", " ('lol', 83),\n", " ('students', 83),\n", " ('spread', 82),\n", " ('love', 82),\n", " ('put', 82),\n", " ('wont', 81),\n", " ('thing', 81),\n", " ('whooping', 81),\n", " ('sure', 80),\n", " ('rates', 79),\n", " ('shit', 78),\n", " ('ill', 78),\n", " ('dr', 78),\n", " ('every', 77),\n", " ('dead', 77),\n", " ('hpv', 77),\n", " 
('much', 76),\n", " ('reported', 76),\n", " ('arent', 76),\n", " ('die', 76),\n", " ('number', 75),\n", " ('around', 74),\n", " ('effective', 74),\n", " ('isnt', 73),\n", " ('fear', 73),\n", " ('pox', 73),\n", " ('home', 73),\n", " ('medical', 71),\n", " ('s', 70),\n", " ('safety', 70),\n", " ('doctors', 70),\n", " ('jenny', 69),\n", " ('keep', 69),\n", " ('lives', 69),\n", " ('immunized', 68),\n", " ('yes', 68),\n", " ('help', 68),\n", " ('texas', 68),\n", " ('let', 68),\n", " ('week', 67),\n", " ('next', 66),\n", " ('center', 66),\n", " ('infants', 65),\n", " ('antivaccine', 64),\n", " ('killing', 64),\n", " ('truth', 64),\n", " ('prevent', 64),\n", " ('stupid', 64),\n", " ('due', 63),\n", " ('debate', 63),\n", " ('clinic', 63),\n", " ('gets', 63),\n", " ('old', 62),\n", " ('look', 62),\n", " ('lets', 62),\n", " ('others', 62),\n", " ('ive', 61),\n", " ('childrens', 61),\n", " ('read', 61),\n", " ('little', 61),\n", " ('law', 61),\n", " ('doctor', 60),\n", " ('whats', 60),\n", " ('chicken', 60),\n", " ('ur', 60),\n", " ('best', 60),\n", " ('refuse', 59),\n", " ('real', 59),\n", " ('research', 59),\n", " ('b', 58),\n", " ('brain', 58),\n", " ('makes', 58),\n", " ('mccarthy', 58),\n", " ('deaths', 58),\n", " ('ppl', 58),\n", " ('damn', 58),\n", " ('exposure', 58),\n", " ('mandatory', 58),\n", " ('person', 58),\n", " ('causes', 57),\n", " ('theres', 57),\n", " ('family', 57),\n", " ('thank', 56),\n", " ('spreading', 56),\n", " ('young', 55),\n", " ('feel', 55),\n", " ('away', 55),\n", " ('diagnosed', 55),\n", " ('possible', 55),\n", " ('update', 54),\n", " ('might', 54),\n", " ('mom', 54),\n", " ('big', 53),\n", " ('immunize', 53),\n", " ('start', 53),\n", " ('son', 53),\n", " ('mercury', 53),\n", " ('save', 53),\n", " ('died', 53),\n", " ('kill', 52),\n", " ('catch', 52),\n", " ('university', 52),\n", " ('exemption', 52),\n", " ('meningitis', 52),\n", " ('bring', 52),\n", " ('someone', 52),\n", " ('reason', 52),\n", " ('outbreaks', 52),\n", " ('bc', 52),\n", " 
('without', 52),\n", " ('rate', 51),\n", " ('preventable', 51),\n", " ('yet', 51),\n", " ('choice', 51),\n", " ('stay', 51),\n", " ('caused', 50),\n", " ('finds', 50),\n", " ('sb', 50),\n", " ('anyone', 50),\n", " ('story', 50),\n", " ('maybe', 50),\n", " ('gonna', 50),\n", " ('fuck', 50),\n", " ('wait', 49),\n", " ('use', 49),\n", " ('th', 48),\n", " ('tell', 48),\n", " ('age', 47),\n", " ('required', 47),\n", " ('providing', 47),\n", " ('injury', 47),\n", " ('dept', 47),\n", " ('find', 47),\n", " ('show', 46),\n", " ('mt', 46),\n", " ('poor', 46),\n", " ('immune', 46),\n", " ('services', 46),\n", " ('something', 45),\n", " ('epidemic', 45),\n", " ('blaming', 45),\n", " ('adults', 45),\n", " ('change', 45),\n", " ('country', 45),\n", " ('support', 44),\n", " ('shows', 44),\n", " ('bart', 44),\n", " ('coming', 44),\n", " ('making', 44),\n", " ('amnews', 44),\n", " ('benefits', 44),\n", " ('hospital', 44),\n", " ('girl', 44),\n", " ('check', 44)]" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words= \" \".join(df_train[\"clean_tweet\"]).split()\n", "\n", "dist=FreqDist(words)\n", "\n", "dist.most_common(300)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ig3-QhlYYmkW" }, "outputs": [], "source": [ "##we will do some further cleaning\n", "\n", "# Expand the slang token \"ppl\" only where it stands alone as a word. A plain substring\n", "# replace would also rewrite words that merely contain it (\"supply\" -> \"supeopley\").\n", "df_train[\"clean_tweet\"]= df_train[\"clean_tweet\"].str.replace(r\"\\bppl\\b\", \"people\", regex=True)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "XoT-KAphAKoP" }, "source": [ "### iv. 
Feature Engineering\n", "\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "gTnXB_h1los5" }, "source": [ "**Approach:**\n", "\n", "- Filter out the most important columns from our train dataframe\n", "- Perform feature engineering\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "YZRRH26El9Nj", "outputId": "f65e2994-a2c1-4913-d011-3fe6cd1d0e1b" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweetslabelagreementtweet_lengthclean_tweet
0me &amp; the big homie meanboy3000 #meanboy #m...0.01.00000015amp big homie meanboy stegman st
1i'm 100% thinking of devoting my career to pro...1.01.00000025im thinking devoting career proving autism isn...
2#whatcausesautism vaccines, do not vaccinate y...-1.01.0000007vaccines vaccinate child
3i mean if they immunize my kid with something ...-1.01.00000028mean immunize kid something wont secretly kill...
4thanks to <user> catch me performing at la nui...0.01.00000020thanks catch performing la nuit nyc st ave sho...
5<user> a nearly 67 year old study when mental ...1.00.66666722nearly year old study mental health studies va...
6study of more than 95,000 kids finds no link b...1.00.66666715study kids finds link mmr vaccine autism
7psa: vaccinate your fucking kids1.01.0000005psa vaccinate fucking kids
8coughing extra on the shuttle and everyone thi...1.00.66666714coughing extra shuttle everyone thinks measles
9aids vaccine created at oregon health &amp; sc...1.00.66666717aids vaccine created oregon health amp science...
10<user> @ this point i have 2 text, butw/bon jo...0.01.00000025point text butwbon jovi cover playin alibis ho...
11my prediction, vaccine exemption in arizona wi...0.00.66666718prediction vaccine exemption arizona will end ...
12getting my vaccines ! #china #nervous #moving ...1.01.00000016getting vaccines cheryl southern nevada health...
131$mug noche <user> #mmr #mixmasterrod #dcdj #m...0.01.00000013mug noche mad hatter
14got my influenza vaccine! (@ purdue university...1.00.66666713got influenza vaccine purdue university studen...
15sb121 [enroll] meningococcal disease-pupils to...0.00.66666712sb enroll meningococcal diseasepupils immunize...
16increasing number of parents skip vaccinations...0.01.00000012increasing number parents skip vaccinations ch...
17<user> thank you for standing with ca parents ...1.01.00000016thank standing ca parents children support
18dude idc if disney land has the measles, that ...0.01.00000020dude idc disney land measles means shorter lin...
19beeftalk: start your calf vaccinations now <ur...1.01.00000014beeftalk start calf vaccinations now via good ...
20i don't care what <user> says, you should prob...1.01.00000013dont care says probably kids vaccinated
21#acr13 small study shows in 10 pts with lupus ...1.00.66666720small study shows pts lupus shingles vaccine s...
22cdc: measles epidemic poses travel risks usat....0.01.00000019cdc measles epidemic poses travel risks usatly...
23every time i see the \"to vaccinate or not\" deb...1.01.00000024every time see vaccinate debate wonder one sid...
24<user> #cdc lied and hid data that black boys ...-1.01.00000021lied hid data black boys uncreased risk develo...
25<user> vaccines causing autism-1.01.0000004vaccines causing autism
26“<user> i rarely see arguments about over vacc...0.00.66666714rarely see arguments vaccination actually big ...
27i'm not obsessed w ebola, just following an ou...1.00.66666725im obsessed w ebola following outbreak worlds ...
28<user> joshthenewt i suck at the game, haha we...0.01.00000019joshthenewt suck game haha well people say don...
29don't shake his hand, pocahontas! that's proba...0.00.66666713dont shake hand pocahontas thats probably meas...
30<user> yes. i'm a part of public health just l...1.01.00000026yes im part public health im part nature feed ...
31new studies show that vaccines are not associa...1.01.00000017new studies show vaccines associated autism ne...
32not to be repetitive, but i could not be less ...1.01.00000022repetitive less shocked increase asd even gene...
33glad i got vaccinated! “<user> health alert: a...1.01.00000018glad got vaccinated health alert case meningit...
34you look like you got the measles0.00.6666677look got measles
35<user> <user> other than that, his defense is ...-1.00.33333320defense vaccines harmful american life unhealthy
36like hello ranked reset they probably did bad ...0.01.00000021hello ranked reset probably bad placements mmr...
37amid measles outbreak, vaccines for teachers a...0.00.66666718amid measles outbreak vaccines teachers arent ...
38<user> <user> <user> <user> “<user> <url> meas...0.00.6666678measles threat
39improve mood, energy, immunity, cardio health....0.01.00000015improve mood energy immunity cardio health kit...
40mt <user> new #vaccination bill would end exem...0.01.00000015mt new bill end exemptions personal religious via
41autism and immunizations: should you vaccinate...0.00.6666677autism immunizations vaccinate
42bart riders warned about measles infection fro...0.01.00000018bart riders warned measles infection contagiou...
43.<user> u.s. #measles cases hit 15-year high. ...0.00.66666722us cases hit year high far year cases measles ...
44pull up myxx nightlife.... stint t performing ...0.01.00000015pull myxx nightlife stint t performing live ho...
45cdc eyeing bird flu vaccine for humans, though...0.00.66666719cdc eyeing bird flu vaccine humans though risk...
46involved in fight against #measles and other #...1.00.66666716involved fight preventable diseases tenure tra...
47<user> i'm not vaccinating my kids lol-1.01.0000007im vaccinating kids lol
48alleged victim reviewing immunity paperwork so...0.01.00000015alleged victim reviewing immunity paperwork re...
49thanks<user> for being more crystal clear abou...1.00.66666720thanksuser crystal clear lack evidence linking...
50<user> giving me dvds on how vaccinating child...0.00.66666710giving dvds vaccinating children bad
51hey now...<user> #mixmasterrod #follow #madhat...0.01.00000012hey nowuser mixmasterrods upstairs lounge
52i seriously don't want to have a child until t...1.01.00000016seriously dont want child antivaccination move...
53<user> <user> <user> epidemic of enterovirus;...0.01.00000017epidemic enterovirus mumps now measles drug re...
54they said i wasn't gone be shit, lol looked li...0.01.00000022said wasnt gone shit lol looked proved wrong baby
55flu shots at school boost vaccination rates: o...0.00.66666719flu shots school boost vaccination rates offer...
56“<user> 1) marin county school board sides wit...0.00.66666715marin county school board sides young leukemia...
57\"still running with the same niggas til the de...0.01.00000016still running niggas til death dibiasimb
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " tweets label agreement \\\n", "0 me & the big homie meanboy3000 #meanboy #m... 0.0 1.000000 \n", "1 i'm 100% thinking of devoting my career to pro... 1.0 1.000000 \n", "2 #whatcausesautism vaccines, do not vaccinate y... -1.0 1.000000 \n", "3 i mean if they immunize my kid with something ... -1.0 1.000000 \n", "4 thanks to catch me performing at la nui... 0.0 1.000000 \n", "5 a nearly 67 year old study when mental ... 1.0 0.666667 \n", "6 study of more than 95,000 kids finds no link b... 1.0 0.666667 \n", "7 psa: vaccinate your fucking kids 1.0 1.000000 \n", "8 coughing extra on the shuttle and everyone thi... 1.0 0.666667 \n", "9 aids vaccine created at oregon health & sc... 1.0 0.666667 \n", "10 @ this point i have 2 text, butw/bon jo... 0.0 1.000000 \n", "11 my prediction, vaccine exemption in arizona wi... 0.0 0.666667 \n", "12 getting my vaccines ! #china #nervous #moving ... 1.0 1.000000 \n", "13 1$mug noche #mmr #mixmasterrod #dcdj #m... 0.0 1.000000 \n", "14 got my influenza vaccine! (@ purdue university... 1.0 0.666667 \n", "15 sb121 [enroll] meningococcal disease-pupils to... 0.0 0.666667 \n", "16 increasing number of parents skip vaccinations... 0.0 1.000000 \n", "17 thank you for standing with ca parents ... 1.0 1.000000 \n", "18 dude idc if disney land has the measles, that ... 0.0 1.000000 \n", "19 beeftalk: start your calf vaccinations now says, you should prob... 1.0 1.000000 \n", "21 #acr13 small study shows in 10 pts with lupus ... 1.0 0.666667 \n", "22 cdc: measles epidemic poses travel risks usat.... 0.0 1.000000 \n", "23 every time i see the \"to vaccinate or not\" deb... 1.0 1.000000 \n", "24 #cdc lied and hid data that black boys ... -1.0 1.000000 \n", "25 vaccines causing autism -1.0 1.000000 \n", "26 “ i rarely see arguments about over vacc... 0.0 0.666667 \n", "27 i'm not obsessed w ebola, just following an ou... 1.0 0.666667 \n", "28 joshthenewt i suck at the game, haha we... 
0.0 1.000000 \n", "29 don't shake his hand, pocahontas! that's proba... 0.0 0.666667 \n", "30 yes. i'm a part of public health just l... 1.0 1.000000 \n", "31 new studies show that vaccines are not associa... 1.0 1.000000 \n", "32 not to be repetitive, but i could not be less ... 1.0 1.000000 \n", "33 glad i got vaccinated! “ health alert: a... 1.0 1.000000 \n", "34 you look like you got the measles 0.0 0.666667 \n", "35 other than that, his defense is ... -1.0 0.333333 \n", "36 like hello ranked reset they probably did bad ... 0.0 1.000000 \n", "37 amid measles outbreak, vaccines for teachers a... 0.0 0.666667 \n", "38 meas... 0.0 0.666667 \n", "39 improve mood, energy, immunity, cardio health.... 0.0 1.000000 \n", "40 mt new #vaccination bill would end exem... 0.0 1.000000 \n", "41 autism and immunizations: should you vaccinate... 0.0 0.666667 \n", "42 bart riders warned about measles infection fro... 0.0 1.000000 \n", "43 . u.s. #measles cases hit 15-year high. ... 0.0 0.666667 \n", "44 pull up myxx nightlife.... stint t performing ... 0.0 1.000000 \n", "45 cdc eyeing bird flu vaccine for humans, though... 0.0 0.666667 \n", "46 involved in fight against #measles and other #... 1.0 0.666667 \n", "47 i'm not vaccinating my kids lol -1.0 1.000000 \n", "48 alleged victim reviewing immunity paperwork so... 0.0 1.000000 \n", "49 thanks for being more crystal clear abou... 1.0 0.666667 \n", "50 giving me dvds on how vaccinating child... 0.0 0.666667 \n", "51 hey now... #mixmasterrod #follow #madhat... 0.0 1.000000 \n", "52 i seriously don't want to have a child until t... 1.0 1.000000 \n", "53 epidemic of enterovirus;... 0.0 1.000000 \n", "54 they said i wasn't gone be shit, lol looked li... 0.0 1.000000 \n", "55 flu shots at school boost vaccination rates: o... 0.0 0.666667 \n", "56 “ 1) marin county school board sides wit... 0.0 0.666667 \n", "57 \"still running with the same niggas til the de... 
0.0 1.000000 \n", "\n", " tweet_length clean_tweet \n", "0 15 amp big homie meanboy stegman st \n", "1 25 im thinking devoting career proving autism isn... \n", "2 7 vaccines vaccinate child \n", "3 28 mean immunize kid something wont secretly kill... \n", "4 20 thanks catch performing la nuit nyc st ave sho... \n", "5 22 nearly year old study mental health studies va... \n", "6 15 study kids finds link mmr vaccine autism \n", "7 5 psa vaccinate fucking kids \n", "8 14 coughing extra shuttle everyone thinks measles \n", "9 17 aids vaccine created oregon health amp science... \n", "10 25 point text butwbon jovi cover playin alibis ho... \n", "11 18 prediction vaccine exemption arizona will end ... \n", "12 16 getting vaccines cheryl southern nevada health... \n", "13 13 mug noche mad hatter \n", "14 13 got influenza vaccine purdue university studen... \n", "15 12 sb enroll meningococcal diseasepupils immunize... \n", "16 12 increasing number parents skip vaccinations ch... \n", "17 16 thank standing ca parents children support \n", "18 20 dude idc disney land measles means shorter lin... \n", "19 14 beeftalk start calf vaccinations now via good ... \n", "20 13 dont care says probably kids vaccinated \n", "21 20 small study shows pts lupus shingles vaccine s... \n", "22 19 cdc measles epidemic poses travel risks usatly... \n", "23 24 every time see vaccinate debate wonder one sid... \n", "24 21 lied hid data black boys uncreased risk develo... \n", "25 4 vaccines causing autism \n", "26 14 rarely see arguments vaccination actually big ... \n", "27 25 im obsessed w ebola following outbreak worlds ... \n", "28 19 joshthenewt suck game haha well people say don... \n", "29 13 dont shake hand pocahontas thats probably meas... \n", "30 26 yes im part public health im part nature feed ... \n", "31 17 new studies show vaccines associated autism ne... \n", "32 22 repetitive less shocked increase asd even gene... \n", "33 18 glad got vaccinated health alert case meningit... 
\n", "34 7 look got measles \n", "35 20 defense vaccines harmful american life unhealthy \n", "36 21 hello ranked reset probably bad placements mmr... \n", "37 18 amid measles outbreak vaccines teachers arent ... \n", "38 8 measles threat \n", "39 15 improve mood energy immunity cardio health kit... \n", "40 15 mt new bill end exemptions personal religious via \n", "41 7 autism immunizations vaccinate \n", "42 18 bart riders warned measles infection contagiou... \n", "43 22 us cases hit year high far year cases measles ... \n", "44 15 pull myxx nightlife stint t performing live ho... \n", "45 19 cdc eyeing bird flu vaccine humans though risk... \n", "46 16 involved fight preventable diseases tenure tra... \n", "47 7 im vaccinating kids lol \n", "48 15 alleged victim reviewing immunity paperwork re... \n", "49 20 thanksuser crystal clear lack evidence linking... \n", "50 10 giving dvds vaccinating children bad \n", "51 12 hey nowuser mixmasterrods upstairs lounge \n", "52 16 seriously dont want child antivaccination move... \n", "53 17 epidemic enterovirus mumps now measles drug re... \n", "54 22 said wasnt gone shit lol looked proved wrong baby \n", "55 19 flu shots school boost vaccination rates offer... \n", "56 15 marin county school board sides young leukemia... \n", "57 16 still running niggas til death dibiasimb " ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.loc[:57]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "rIoc4GwDjG3Q" }, "outputs": [], "source": [ "clean_train= df_train[[\"clean_tweet\", \"label\", \"agreement\"]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 394 }, "id": "O0DdpM4qmzMO", "outputId": "9e1737d4-b4ae-4638-edf1-326a95ce1112" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clean_tweetlabelagreement
0amp big homie meanboy stegman st0.01.000000
1im thinking devoting career proving autism isn...1.01.000000
2vaccines vaccinate child-1.01.000000
3mean immunize kid something wont secretly kill...-1.01.000000
4thanks catch performing la nuit nyc st ave sho...0.01.000000
5nearly year old study mental health studies va...1.00.666667
6study kids finds link mmr vaccine autism1.00.666667
7psa vaccinate fucking kids1.01.000000
8coughing extra shuttle everyone thinks measles1.00.666667
9aids vaccine created oregon health amp science...1.00.666667
10point text butwbon jovi cover playin alibis ho...0.01.000000
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " clean_tweet label agreement\n", "0 amp big homie meanboy stegman st 0.0 1.000000\n", "1 im thinking devoting career proving autism isn... 1.0 1.000000\n", "2 vaccines vaccinate child -1.0 1.000000\n", "3 mean immunize kid something wont secretly kill... -1.0 1.000000\n", "4 thanks catch performing la nuit nyc st ave sho... 0.0 1.000000\n", "5 nearly year old study mental health studies va... 1.0 0.666667\n", "6 study kids finds link mmr vaccine autism 1.0 0.666667\n", "7 psa vaccinate fucking kids 1.0 1.000000\n", "8 coughing extra shuttle everyone thinks measles 1.0 0.666667\n", "9 aids vaccine created oregon health amp science... 1.0 0.666667\n", "10 point text butwbon jovi cover playin alibis ho... 0.0 1.000000" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_train.loc[:10]" ] }, { "cell_type": "markdown", "metadata": { "id": "dDDOgGH1m0Dg" }, "source": [ "##### a. Creating word length" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nwp_mTWkonDC" }, "outputs": [], "source": [ "## Keep a copy of the dataframe as it is at this point, before the cells below modify clean_train\n", "\n", "clean_copy= clean_train.copy()" ] }, { "cell_type": "markdown", "metadata": { "id": "oPUr_IdCo4ri" }, "source": [ "### c. Lemmatization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xa7Q2eLQMsDU" }, "outputs": [], "source": [ "# One shared lemmatizer: the previous version built a new WordNetLemmatizer for every tweet.\n", "_LEMMATIZER = WordNetLemmatizer()\n", "\n", "\n", "def lemmatize_text(text):\n", "    \"\"\"Lemmatize every whitespace-separated token of `text` and re-join the tokens with single spaces.\n", "\n", "    Each token is passed to `WordNetLemmatizer.lemmatize` with its default arguments, so no\n", "    part-of-speech information is supplied.\n", "    NOTE(review): the rendered output of the following cell shows some tokens being clipped\n", "    (\"less\" -> \"le\", \"us\" -> \"u\", \"aids\" -> \"aid\"); confirm that is acceptable downstream.\n", "    \"\"\"\n", "    return ' '.join(_LEMMATIZER.lemmatize(word) for word in text.split())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DEu2t5iGNKC6", "outputId": "e8d9a650-9b93-4efa-ee5b-94f4f7f1ae69" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: SettingWithCopyWarning:\n", "\n", "\n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", 
"\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", "\n" ] } ], "source": [ "# .assign builds a new frame instead of setting the column in place; the in-place form is\n", "# what produced the SettingWithCopyWarning recorded in this cell's saved output.\n", "clean_train = clean_train.assign(clean_tweet=clean_train[\"clean_tweet\"].apply(lemmatize_text))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "BGURYgp0NRkw", "outputId": "a13a39aa-f189-429a-eb40-a8303686bbd4" }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clean_tweetlabelagreement
0amp big homie meanboy stegman st0.01.000000
1im thinking devoting career proving autism isn...1.01.000000
2vaccine vaccinate child-1.01.000000
3mean immunize kid something wont secretly kill...-1.01.000000
4thanks catch performing la nuit nyc st ave sho...0.01.000000
5nearly year old study mental health study vacc...1.00.666667
6study kid find link mmr vaccine autism1.00.666667
7psa vaccinate fucking kid1.01.000000
8coughing extra shuttle everyone think measles1.00.666667
9aid vaccine created oregon health amp science ...1.00.666667
10point text butwbon jovi cover playin alibi hop...0.01.000000
11prediction vaccine exemption arizona will end ...0.00.666667
12getting vaccine cheryl southern nevada health ...1.01.000000
13mug noche mad hatter0.01.000000
14got influenza vaccine purdue university studen...1.00.666667
15sb enroll meningococcal diseasepupils immunize...0.00.666667
16increasing number parent skip vaccination chil...0.01.000000
17thank standing ca parent child support1.01.000000
18dude idc disney land measles mean shorter line...0.01.000000
19beeftalk start calf vaccination now via good a...1.01.000000
20dont care say probably kid vaccinated1.01.000000
21small study show pt lupus shingle vaccine safe...1.00.666667
22cdc measles epidemic pose travel risk usatlyix...0.01.000000
23every time see vaccinate debate wonder one sid...1.01.000000
24lied hid data black boy uncreased risk develop...-1.01.000000
25vaccine causing autism-1.01.000000
26rarely see argument vaccination actually big deal0.00.666667
27im obsessed w ebola following outbreak world d...1.00.666667
28joshthenewt suck game haha well people say don...0.01.000000
29dont shake hand pocahontas thats probably meas...0.00.666667
30yes im part public health im part nature feed ...1.01.000000
31new study show vaccine associated autism news ...1.01.000000
32repetitive le shocked increase asd even geneti...1.01.000000
33glad got vaccinated health alert case meningit...1.01.000000
34look got measles0.00.666667
35defense vaccine harmful american life unhealthy-1.00.333333
36hello ranked reset probably bad placement mmr ...0.01.000000
37amid measles outbreak vaccine teacher arent re...0.00.666667
38measles threat0.00.666667
39improve mood energy immunity cardio health kit...0.01.000000
40mt new bill end exemption personal religious via0.01.000000
41autism immunization vaccinate0.00.666667
42bart rider warned measles infection contagious...0.01.000000
43u case hit year high far year case measles rep...0.00.666667
44pull myxx nightlife stint t performing live ho...0.01.000000
45cdc eyeing bird flu vaccine human though risk ...0.00.666667
46involved fight preventable disease tenure trac...1.00.666667
47im vaccinating kid lol-1.01.000000
48alleged victim reviewing immunity paperwork re...0.01.000000
49thanksuser crystal clear lack evidence linking...1.00.666667
50giving dvd vaccinating child bad0.00.666667
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ], "text/plain": [ " clean_tweet label agreement\n", "0 amp big homie meanboy stegman st 0.0 1.000000\n", "1 im thinking devoting career proving autism isn... 1.0 1.000000\n", "2 vaccine vaccinate child -1.0 1.000000\n", "3 mean immunize kid something wont secretly kill... -1.0 1.000000\n", "4 thanks catch performing la nuit nyc st ave sho... 0.0 1.000000\n", "5 nearly year old study mental health study vacc... 1.0 0.666667\n", "6 study kid find link mmr vaccine autism 1.0 0.666667\n", "7 psa vaccinate fucking kid 1.0 1.000000\n", "8 coughing extra shuttle everyone think measles 1.0 0.666667\n", "9 aid vaccine created oregon health amp science ... 1.0 0.666667\n", "10 point text butwbon jovi cover playin alibi hop... 0.0 1.000000\n", "11 prediction vaccine exemption arizona will end ... 0.0 0.666667\n", "12 getting vaccine cheryl southern nevada health ... 1.0 1.000000\n", "13 mug noche mad hatter 0.0 1.000000\n", "14 got influenza vaccine purdue university studen... 1.0 0.666667\n", "15 sb enroll meningococcal diseasepupils immunize... 0.0 0.666667\n", "16 increasing number parent skip vaccination chil... 0.0 1.000000\n", "17 thank standing ca parent child support 1.0 1.000000\n", "18 dude idc disney land measles mean shorter line... 0.0 1.000000\n", "19 beeftalk start calf vaccination now via good a... 1.0 1.000000\n", "20 dont care say probably kid vaccinated 1.0 1.000000\n", "21 small study show pt lupus shingle vaccine safe... 1.0 0.666667\n", "22 cdc measles epidemic pose travel risk usatlyix... 0.0 1.000000\n", "23 every time see vaccinate debate wonder one sid... 1.0 1.000000\n", "24 lied hid data black boy uncreased risk develop... -1.0 1.000000\n", "25 vaccine causing autism -1.0 1.000000\n", "26 rarely see argument vaccination actually big deal 0.0 0.666667\n", "27 im obsessed w ebola following outbreak world d... 1.0 0.666667\n", "28 joshthenewt suck game haha well people say don... 
0.0 1.000000\n", "29 dont shake hand pocahontas thats probably meas... 0.0 0.666667\n", "30 yes im part public health im part nature feed ... 1.0 1.000000\n", "31 new study show vaccine associated autism news ... 1.0 1.000000\n", "32 repetitive le shocked increase asd even geneti... 1.0 1.000000\n", "33 glad got vaccinated health alert case meningit... 1.0 1.000000\n", "34 look got measles 0.0 0.666667\n", "35 defense vaccine harmful american life unhealthy -1.0 0.333333\n", "36 hello ranked reset probably bad placement mmr ... 0.0 1.000000\n", "37 amid measles outbreak vaccine teacher arent re... 0.0 0.666667\n", "38 measles threat 0.0 0.666667\n", "39 improve mood energy immunity cardio health kit... 0.0 1.000000\n", "40 mt new bill end exemption personal religious via 0.0 1.000000\n", "41 autism immunization vaccinate 0.0 0.666667\n", "42 bart rider warned measles infection contagious... 0.0 1.000000\n", "43 u case hit year high far year case measles rep... 0.0 0.666667\n", "44 pull myxx nightlife stint t performing live ho... 0.0 1.000000\n", "45 cdc eyeing bird flu vaccine human though risk ... 0.0 0.666667\n", "46 involved fight preventable disease tenure trac... 1.0 0.666667\n", "47 im vaccinating kid lol -1.0 1.000000\n", "48 alleged victim reviewing immunity paperwork re... 0.0 1.000000\n", "49 thanksuser crystal clear lack evidence linking... 
1.0 0.666667\n", "50 giving dvd vaccinating child bad 0.0 0.666667" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_train.loc[:50]" ] }, { "cell_type": "markdown", "metadata": { "id": "8oUftd5qNTmi" }, "source": [ "##### c. Adding Character Length" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XfyRroOwStP5" }, "outputs": [], "source": [ "def remove_special_characters(input_string):\n", "    \"\"\"Remove every character that is not an ASCII letter, an ASCII digit or whitespace.\n", "\n", "    Non-string values (for example NaN or None) are returned unchanged, so a\n", "    later dropna() can handle them instead of re.sub raising TypeError.\n", "    \"\"\"\n", "    if not isinstance(input_string, str):\n", "        return input_string\n", "    pattern = r'[^a-zA-Z0-9\\s]'\n", "    return re.sub(pattern, '', input_string)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fdazLlKJxiRL", "outputId": "43cd0292-2994-49e1-d5e1-1850de417265" }, "outputs": [], "source": [ "# .assign returns a new DataFrame, so nothing is written into a possible\n", "# slice of another frame (the in-place column assignment used here before\n", "# triggered pandas' SettingWithCopyWarning).\n", "clean_train = clean_train.assign(\n", "    clean_tweet=clean_train[\"clean_tweet\"].apply(remove_special_characters)\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "r-7HCyGU-IZc" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GMwWaNTsyRkV" }, "outputs": [], "source": [ "clean_copy = clean_copy.dropna()\n", "clean_train = clean_train.dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Z9m2iyomIcBQ" }, "outputs": [], "source": [ "clean_copy.to_csv(\"/content/drive/MyDrive/deep-learning/clean_copy.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YIC1W8MNALb1", "outputId": "09d6b005-6c36-43c6-9fbe-bd3a29c9a26e" }, 
"outputs": [ { "data": { "text/plain": [ "(9999, 3)" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show the (rows, columns) shape of clean_copy as a quick size check.\n", "clean_copy.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "v2aPN_AvA02E", "outputId": "73132e4b-f2f6-4592-ba8f-8bec8a30eb2f" }, "outputs": [ { "data": { "text/plain": [ "clean_tweet 0\n", "label 0\n", "agreement 0\n", "dtype: int64" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Count the missing values in each column of clean_train.\n", "clean_train.isna().sum()" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }