File size: 953 Bytes
e82eef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#preprocessing
from sklearn.preprocessing import OrdinalEncoder
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from nltk.stem.porter import PorterStemmer
import re

#stem words
def stemm(data):
    ps=PorterStemmer()
    corpus=[]
    review=re.sub('[^a-zA-Z]',' ',data)
    review=review.lower()
    review=review.split()
    #remove html tag by removing <br> also
    review=[ps.stem(word) for word in review if not word in stopwords.words('english') and not word in ['br']]
    review=' '.join(review)
    corpus.append(review)
    return corpus

#one hot encoding and padding
def preprocess(data):
    corpus=stemm(data)
    onehot_corpus=[one_hot(words,10000) for words in corpus]
    sent_length = 2470
    padded_corpus=pad_sequences(onehot_corpus,padding='pre',maxlen=sent_length)
    return padded_corpus