"""# MODEL BUILDING

Train a text classifier for the ``saliency`` column of ``train-cleaned.csv``.

The ``Extracted text`` column is stripped of ASCII punctuation, vectorised
with TF-IDF over unigrams and bigrams, and used to fit a hard-voting ensemble
of a linear SVM and a logistic regression.

Everything runs at import time (this file is a notebook export), and leaves
``data``, ``vectr``, ``vect_X``, ``model`` and ``clf`` at module level.
"""

# Commented out IPython magic to ensure Python compatibility.
import functools
import string

import nltk  # Natural Language Processing Toolkit
import numpy as np  # For linear algebra
import pandas as pd  # Data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.corpus import stopwords
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Notebook-only imports, kept disabled as in the original export:
# import matplotlib.pyplot as plt  # For Visualisation
# %matplotlib inline
# import seaborn as sns  # For Visualisation
# from bs4 import BeautifulSoup  # For Text Parsing
# from ydata_profiling import ProfileReport  # For generating data report

# Resources needed by remove_stopword() (stop-word list and tokenizer data).
# NOTE(review): confirm 'punkt' is the tokenizer resource the installed NLTK
# version expects.
nltk.download('stopwords')
nltk.download('punkt')

# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


@functools.lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words, minus 'not', as a frozenset.

    Loaded lazily and cached so the corpus is read once rather than on every
    remove_stopword() call, and so a missing corpus only matters if
    stop-word removal is actually used.
    """
    return frozenset(stopwords.words('english')) - {'not'}


def remove_stopword(text):
    """Return *text* with English stop words removed.

    The text is split with ``nltk.word_tokenize``; tokens found in the
    stop-word set are dropped and the rest are re-joined with single spaces.
    'not' is deliberately kept. Matching is case-sensitive, exactly as the
    tokens come out of the tokenizer.
    """
    blocked = _english_stopwords()
    return ' '.join(w for w in nltk.word_tokenize(text) if w not in blocked)


def punc_clean(text):
    """Return *text* with every ``string.punctuation`` character removed."""
    return text.translate(_PUNCT_TABLE)


data = pd.read_csv('train-cleaned.csv')

# Stop-word removal is intentionally left disabled, as in the original:
# data['Extracted text'] = data['Extracted text'].apply(remove_stopword)

# Write the punctuation-free text back to the column the vectorizer reads.
# The original stored it under an empty column name ('') that nothing used,
# so the cleaning never reached the model.
data['Extracted text'] = data['Extracted text'].apply(punc_clean)
# data.head(2)

# TF-IDF features over unigrams and bigrams; min_df=1 keeps every term.
vectr = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
vect_X = vectr.fit_transform(data['Extracted text'])

# NOTE(review): probability=True is not needed for the ensemble's hard vote
# and makes SVC fitting slower; it is kept in case the fitted SVM's
# predict_proba is used elsewhere -- confirm and drop if not.
svm_classifier = SVC(kernel='linear', probability=True)
logistic_classifier = LogisticRegression()

# NOTE(review): with two voters and voting='hard', disagreements are ties;
# check the scikit-learn tie-breaking rule is acceptable for these labels.
model = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('logistic', logistic_classifier),
    ],
    voting='hard',
)

# fit() returns the estimator itself, so clf and model are the same object.
clf = model.fit(vect_X, data['saliency'])
# clf.score(vect_X, data['saliency'])*100

# """# PREDICTION"""
# clf.predict(vectr.transform(['''thank you ''']))
# clf.predict(vectr.transform(['''Theres no trailers or nothing on the other side of me and its been facing away from my trailer straight''']))
# clf.predict(vectr.transform([''' I dont think that should really matter Um''']))