import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import pickle

# WordNetLemmatizer needs the wordnet corpus; if it is missing, run once:
# import nltk; nltk.download('wordnet')
# import the dataset; its 'full_post' column has already been lemmatized
url = 'https://huggingface.co./spaces/yxmauw/subreddit-clf-app/raw/main/tts.csv'
df = pd.read_csv(url, header=0)
# train-test split
X = df['full_post']  # keep as a pd.Series; word vectorizers expect an iterable of strings, not a DataFrame
y = df['subreddit']
# stratify so the target's class proportions are preserved in both train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.2,
                                                    stratify=y,
                                                    random_state=42)
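# optional check (illustrative, not in the original script): stratification
# keeps the subreddit class proportions comparable across the two splits
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))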
# lemmatizing: lowercase, tokenize to alphabetic words, lemmatize each token, then
# join back with spaces so word vectorizers can still treat each cell as a string
def lemmatize_join(text):
    tokenizer = RegexpTokenizer('[a-z]+', gaps=False)  # instantiate tokenizer
    lemmer = WordNetLemmatizer()  # instantiate lemmatizer
    return ' '.join([lemmer.lemmatize(w) for w in tokenizer.tokenize(text.lower())])
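# quick illustration (not from the original script):
# lemmatize_join('The cats were running!')  ->  'the cat were running'
# ('running' is unchanged because lemmatize() defaults to the noun part of speech)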
Z_train = X_train.apply(lemmatize_join)
# model instantiation: fitting the vectorizer inside the pipeline keeps it
# confined to the training folds during cross-validation, avoiding leakage
pipe_cvec_nb = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])
# word vectorizer parameters (single-value grids, so the search fits one candidate combination)
features = [1000]         # cap the vocabulary at the 1,000 most frequent terms
min_df = [3]              # drop terms appearing in fewer than 3 documents
max_df = [.6]             # drop terms appearing in more than 60% of documents
ngrams = [(1, 2)]         # use unigrams and bigrams
stop_words = ['english']  # scikit-learn's built-in English stop word list
accent = ['unicode']      # strip accents via unicode normalization
# naive Bayes classifier parameters
alphas = [.5]             # additive (Laplace/Lidstone) smoothing strength
cvec_nb_params = [{'cvec__max_features': features,
                   'cvec__min_df': min_df,
                   'cvec__max_df': max_df,
                   'cvec__ngram_range': ngrams,
                   'cvec__lowercase': [False],  # text was already lowercased in lemmatize_join
                   'cvec__stop_words': stop_words,
                   'cvec__strip_accents': accent,
                   'nb__alpha': alphas}]
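# a broader search is possible (illustrative grid values, not the original tuning):
# cvec_nb_params = [{'cvec__max_features': [1000, 2000, 5000],
#                    'cvec__ngram_range': [(1, 1), (1, 2)],
#                    'nb__alpha': [.1, .5, 1.]}]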
cvec_nb_gs = GridSearchCV(pipe_cvec_nb,
                          cvec_nb_params,
                          scoring='accuracy',
                          cv=5,        # 5-fold cross-validation
                          verbose=1,
                          n_jobs=-2)   # use all CPU cores but one
cvec_nb_gs.fit(Z_train, y_train)
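# a hedged sanity check (not in the original script): score the refit best
# estimator on the held-out test set using the same preprocessing as training
Z_test = X_test.apply(lemmatize_join)
print('best params:', cvec_nb_gs.best_params_)
print('test accuracy:', cvec_nb_gs.score(Z_test, y_test))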
# persist the fitted grid search (including the refit best pipeline) for the app,
# closing the file handle properly
with open('final_model.sav', 'wb') as f:
    pickle.dump(cvec_nb_gs, f)