Spaces:
Runtime error
Runtime error
import re | |
contractions = { | |
"aint": "ain't", | |
"arent": "aren't", | |
"cant": "can't", | |
"couldve": "could've", | |
"couldnt": "couldn't", | |
"couldn'tve": "couldn't've", | |
"couldnt've": "couldn't've", | |
"didnt": "didn't", | |
"doesnt": "doesn't", | |
"dont": "don't", | |
"hadnt": "hadn't", | |
"hadnt've": "hadn't've", | |
"hadn'tve": "hadn't've", | |
"hasnt": "hasn't", | |
"havent": "haven't", | |
"hed": "he'd", | |
"hed've": "he'd've", | |
"he'dve": "he'd've", | |
"hes": "he's", | |
"howd": "how'd", | |
"howll": "how'll", | |
"hows": "how's", | |
"Id've": "I'd've", | |
"I'dve": "I'd've", | |
"Im": "I'm", | |
"Ive": "I've", | |
"isnt": "isn't", | |
"itd": "it'd", | |
"itd've": "it'd've", | |
"it'dve": "it'd've", | |
"itll": "it'll", | |
"let's": "let's", | |
"maam": "ma'am", | |
"mightnt": "mightn't", | |
"mightnt've": "mightn't've", | |
"mightn'tve": "mightn't've", | |
"mightve": "might've", | |
"mustnt": "mustn't", | |
"mustve": "must've", | |
"neednt": "needn't", | |
"notve": "not've", | |
"oclock": "o'clock", | |
"oughtnt": "oughtn't", | |
"ow's'at": "'ow's'at", | |
"'ows'at": "'ow's'at", | |
"'ow'sat": "'ow's'at", | |
"shant": "shan't", | |
"shed've": "she'd've", | |
"she'dve": "she'd've", | |
"she's": "she's", | |
"shouldve": "should've", | |
"shouldnt": "shouldn't", | |
"shouldnt've": "shouldn't've", | |
"shouldn'tve": "shouldn't've", | |
"somebody'd": "somebodyd", | |
"somebodyd've": "somebody'd've", | |
"somebody'dve": "somebody'd've", | |
"somebodyll": "somebody'll", | |
"somebodys": "somebody's", | |
"someoned": "someone'd", | |
"someoned've": "someone'd've", | |
"someone'dve": "someone'd've", | |
"someonell": "someone'll", | |
"someones": "someone's", | |
"somethingd": "something'd", | |
"somethingd've": "something'd've", | |
"something'dve": "something'd've", | |
"somethingll": "something'll", | |
"thats": "that's", | |
"thered": "there'd", | |
"thered've": "there'd've", | |
"there'dve": "there'd've", | |
"therere": "there're", | |
"theres": "there's", | |
"theyd": "they'd", | |
"theyd've": "they'd've", | |
"they'dve": "they'd've", | |
"theyll": "they'll", | |
"theyre": "they're", | |
"theyve": "they've", | |
"twas": "'twas", | |
"wasnt": "wasn't", | |
"wed've": "we'd've", | |
"we'dve": "we'd've", | |
"weve": "we've", | |
"werent": "weren't", | |
"whatll": "what'll", | |
"whatre": "what're", | |
"whats": "what's", | |
"whatve": "what've", | |
"whens": "when's", | |
"whered": "where'd", | |
"wheres": "where's", | |
"whereve": "where've", | |
"whod": "who'd", | |
"whod've": "who'd've", | |
"who'dve": "who'd've", | |
"wholl": "who'll", | |
"whos": "who's", | |
"whove": "who've", | |
"whyll": "why'll", | |
"whyre": "why're", | |
"whys": "why's", | |
"wont": "won't", | |
"wouldve": "would've", | |
"wouldnt": "wouldn't", | |
"wouldnt've": "wouldn't've", | |
"wouldn'tve": "wouldn't've", | |
"yall": "y'all", | |
"yall'll": "y'all'll", | |
"y'allll": "y'all'll", | |
"yall'd've": "y'all'd've", | |
"y'alld've": "y'all'd've", | |
"y'all'dve": "y'all'd've", | |
"youd": "you'd", | |
"youd've": "you'd've", | |
"you'dve": "you'd've", | |
"youll": "you'll", | |
"youre": "you're", | |
"youve": "you've", | |
} | |
manual_map = { | |
"none": "0", | |
"zero": "0", | |
"one": "1", | |
"two": "2", | |
"three": "3", | |
"four": "4", | |
"five": "5", | |
"six": "6", | |
"seven": "7", | |
"eight": "8", | |
"nine": "9", | |
"ten": "10", | |
} | |
articles = ["a", "an", "the"] | |
period_strip = re.compile("(?!<=\d)(\.)(?!\d)") | |
comma_strip = re.compile("(\d)(\,)(\d)") | |
punct = [ | |
";", | |
r"/", | |
"[", | |
"]", | |
'"', | |
"{", | |
"}", | |
"(", | |
")", | |
"=", | |
"+", | |
"\\", | |
"_", | |
"-", | |
">", | |
"<", | |
"@", | |
"`", | |
",", | |
"?", | |
"!", | |
] | |
def normalize_word(token): | |
_token = token | |
for p in punct: | |
if (p + " " in token or " " + p in token) or ( | |
re.search(comma_strip, token) != None | |
): | |
_token = _token.replace(p, "") | |
else: | |
_token = _token.replace(p, " ") | |
token = period_strip.sub("", _token, re.UNICODE) | |
_token = [] | |
temp = token.lower().split() | |
for word in temp: | |
word = manual_map.setdefault(word, word) | |
if word not in articles: | |
_token.append(word) | |
for i, word in enumerate(_token): | |
if word in contractions: | |
_token[i] = contractions[word] | |
token = " ".join(_token) | |
token = token.replace(",", "") | |
return token | |