import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
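
# NLTK's WordNetLemmatizer needs the WordNet corpus available locally; this
# one-time download guard is an optional convenience, not in the original script.
import nltk
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')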

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

import pickle

# load the dataset; the 'full_post' column holds the lemmatized post text
url = 'https://huggingface.co./spaces/yxmauw/subreddit-clf-app/raw/main/tts.csv'
df = pd.read_csv(url, header=0)

# train-test split
X = df['full_post']  # keep as a Series: text vectorizers expect a 1-D iterable of strings, not a DataFrame
y = df['subreddit']

# stratify so the target classes keep the same proportions in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.2,
                                                    stratify=y, 
                                                    random_state=42)
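
# Optional sanity check (illustrative addition, not in the original script):
# with stratify=y the class proportions should be nearly identical in both splits.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))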

# lemmatizing
tokenizer = RegexpTokenizer('[a-z]+', gaps=False)  # keep only runs of lowercase letters
lemmer = WordNetLemmatizer()

def lemmatize_join(text):
    # lowercase, lemmatize each token, then join back into a single string
    # so the word vectorizer can still treat each cell as one document
    return ' '.join(lemmer.lemmatize(w) for w in tokenizer.tokenize(text.lower()))

Z_train = X_train.apply(lemmatize_join)
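
# Apply the same transform to the held-out set so it can be scored later
# (illustrative addition; the original script only transforms the training set).
Z_test = X_test.apply(lemmatize_join)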

# model pipeline: bag-of-words counts (CountVectorizer) feeding a multinomial naive Bayes classifier
pipe_cvec_nb = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

# word vectorizer parameters
features = [1000]
min_df = [3]
max_df = [.6]
ngrams = [(1,2)]
stop_words = ['english']
accent = ['unicode']

# naive bayes classifier parameters
alphas = [.5]

cvec_nb_params = [{'cvec__max_features': features,
                   'cvec__min_df': min_df,
                   'cvec__max_df': max_df,
                   'cvec__ngram_range': ngrams,
                   'cvec__lowercase': [False],  # text is already lowercased in lemmatize_join
                   'cvec__stop_words': stop_words,
                   'cvec__strip_accents': accent,
                   'nb__alpha': alphas
                   }]

cvec_nb_gs = GridSearchCV(pipe_cvec_nb,
                          cvec_nb_params,
                          scoring='accuracy',
                          cv=5,
                          verbose=1,
                          n_jobs=-2)  # use all CPU cores but one

cvec_nb_gs.fit(Z_train, y_train)
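
# Illustrative evaluation (not in the original script): best settings,
# cross-validated accuracy, and accuracy on the lemmatized held-out set.
print('Best params:', cvec_nb_gs.best_params_)
print('CV accuracy:', cvec_nb_gs.best_score_)
print('Test accuracy:', cvec_nb_gs.score(Z_test, y_test))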

# persist the fitted grid-search object (its best pipeline included)
with open('final_model.sav', 'wb') as f:
    pickle.dump(cvec_nb_gs, f)
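
# Usage sketch (illustrative addition): reload the pickled model and classify a
# new post; raw text must pass through the same lemmatize_join step first.
with open('final_model.sav', 'rb') as f:
    model = pickle.load(f)
sample = lemmatize_join('example post text to classify')
print(model.predict([sample]))  # returns the predicted subreddit label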