import pandas as pd
from os import walk
from os import listdir
from os.path import isfile, join
import numpy as np
import re

from gensim.parsing import preprocessing
from gensim.parsing.preprocessing import strip_tags, strip_punctuation
from nltk.tokenize import word_tokenize, sent_tokenize
import math
from tqdm import tqdm


def remove_noise_text(txt):
    """Strip report boilerplate: lowercase the text, cut it at sign-off phrases,
    blank section headings, drop shorthand codes and physician names, and expand
    measurement abbreviations."""

    txt = txt.lower()
    txt = re.sub("primary site:", ' ', txt)

    # Truncate the report at common communication / sign-off phrases;
    # everything after them is discarded.
    txt = txt.split("findings were discussed with")[0]
    txt = txt.split("this study has been reviewed and interpreted")[0]
    txt = txt.split("this finding was communicated to")[0]
    txt = txt.split("important findings were identified")[0]
    txt = txt.split("these findings")[0]
    txt = txt.split("findings above were")[0]
    txt = txt.split("findings regarding")[0]
    txt = txt.split("were discussed")[0]
    txt = txt.split("these images were")[0]
    txt = txt.split("important finding")[0]

txt = re.sub("post-surgical changes:", ' ', txt) |
|
txt = re.sub("post surgical changes:", ' ', txt) |
|
txt = re.sub("primary site:", ' ', txt) |
|
txt = re.sub("primary site", ' ', txt) |
|
txt = re.sub("neck:", ' ', txt) |
|
txt = re.sub("post-treatment changes:", ' ', txt) |
|
txt = re.sub("post treatment changes:", ' ', txt) |
|
txt = re.sub("brain, orbits, spine and lungs:", ' ', txt) |
|
txt = re.sub("primary :", ' ', txt) |
|
txt = re.sub("neck:", ' ', txt) |
|
txt = re.sub("aerodigestive tract:", ' ', txt) |
|
txt = re.sub("calvarium, skull base, and spine:", ' ', txt) |
|
txt = re.sub("other:", ' ', txt) |
|
txt = re.sub("upper neck:", ' ', txt) |
|
txt = re.sub("perineural disease:", ' ', txt) |
|
txt = re.sub("technique:", ' ', txt) |
|
txt = re.sub("comparison:", ' ', txt) |
|
txt = re.sub("paranasal sinuses:", ' ', txt) |
|
txt = re.sub("included orbits:", ' ', txt) |
|
txt = re.sub("nasopharynx:", ' ', txt) |
|
txt = re.sub("tympanomastoid cavities:", ' ', txt) |
|
txt = re.sub("skull base and calvarium:", ' ', txt) |
|
txt = re.sub("included intracranial structures:", ' ', txt) |
|
txt = re.sub("impression:", ' ', txt) |
|
txt = re.sub("nodes:", ' ', txt) |
|
txt = re.sub("mri orbits:", ' ', txt) |
|
txt = re.sub("mri brain:", ' ', txt) |
|
txt = re.sub("brain:", ' ', txt) |
|
txt = re.sub("ct face w/:", ' ', txt) |
|
txt = re.sub("transspatial extension:", ' ', txt) |
|
txt = re.sub("thyroid bed:", ' ', txt) |
|
txt = re.sub("additional findings:", ' ', txt) |
|
txt = re.sub("series_image", ' ', txt) |
|
txt = re.sub("series image", ' ', txt) |
|
txt = re.sub("image series", ' ', txt) |
|
txt = re.sub("see synoptic report", ' ', txt) |
|
txt = re.sub("see report", ' ', txt) |
|
|
|
txt = re.sub("brstwo|brstmarun|brstwln|brlump|lnbx", ' ', txt) |
|
|
|
txt = re.sub("post_treatment", 'post treatment', txt) |
|
txt = re.sub("post-treatment", 'post treatment', txt) |
|
|
|
txt = re.sub("nonmasslike", 'non mass like', txt) |
|
txt = re.sub("non_mass_like", 'non mass like', txt) |
|
txt = re.sub("non-mass-like", 'non mass like', txt) |
|
txt = re.sub("statuspost", 'status post', txt) |
|
|
|
|
|
|
|
txt = re.sub("dr\\.\\s[^\\s]+", ' ', txt) |
|
|
|
txt = re.sub(" series | series|series ", "", txt) |
|
txt = re.sub(" cm | cm|cm ", " centimeters ", txt) |
|
txt = re.sub(" cc | cc|cc ", " cubic centimeters ", txt) |
|
txt = re.sub(" ct | ct|ct ", " carat metric ", txt) |
|
txt = re.sub(" mm | mm|mm ", " millimeters ", txt) |
|
|
|
txt = re.sub("status_post|o\'", '', txt) |
|
txt = re.sub("status post|clock|/|'/'", '', txt) |
|
txt = re.sub("statuspost", '', txt) |
|
txt = re.sub("brstwo|brlump|brstmarun|brwire|brstcap|", '', txt) |
|
|
|
txt = re.sub("\\(|\\)", ',', txt) |
|
txt = re.sub(",,", ',', txt) |
|
txt = re.sub(",\\.", '.', txt) |
|
txt = re.sub(", \\.", '.', txt) |
|
|
|
txt = re.sub(" ,", ', ', txt) |
|
txt = re.sub("a\\.", ' ', txt[0:5]) + txt[5:] |
|
txt = re.sub("b\\.", ' ', txt[0:5]) + txt[5:] |
|
txt = re.sub("c\\.", ' ', txt[0:5]) + txt[5:] |
|
txt = re.sub("d\\.", ' ', txt[0:5]) + txt[5:] |
|
txt = re.sub("e\\.", ' ', txt[0:5]) + txt[5:] |
|
txt = re.sub("f\\.", ' ', txt[0:5]) + txt[5:] |
|
|
|
|
|
|
|
txt = re.sub("dr\\.\\s[^\\s]+", '', txt) |
|
|
|
|
|
txt = re.sub(r'\s+', ' ', txt) |
|
txt = re.sub(' +', ' ', txt) |
|
|
|
txt = txt.rstrip().lstrip() |
|
|
|
return txt |
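
# Illustration only (hypothetical input, not project data): a call such as
#
#   remove_noise_text("Impression: 2 mm nodule. Findings were discussed with Dr. Smith.")
#
# lowercases the text, drops everything from "findings were discussed with"
# onward, blanks the "impression:" heading and expands " mm " to
# " millimeters ", returning roughly "2 millimeters nodule.".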


def add_bigrams(txt, fixed_bigrams):
    """Replace every spelling variant listed in fixed_bigrams with its canonical
    token (the first element of each entry)."""

    for b in fixed_bigrams:
        # Build one alternation over all variants and substitute the canonical form.
        pattern = "|".join(re.escape(str(x)) for x in b[1:])
        txt = re.sub(pattern, b[0], txt)

    return txt
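
# Illustration only: with the fixed_bigrams list defined in text_cleaning below,
# add_bigrams("lesion grade iii noted", fixed_bigrams) rewrites the "grade iii"
# spelling as the canonical token " gradethree " (the extra whitespace this
# leaves behind is collapsed later, inside text_cleaning).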


def extra_clean_text(clean_t, fixed_bigrams):
    """Apply the fixed bigram mapping plus a few extra token normalisations."""

    txt = add_bigrams(clean_t, fixed_bigrams)

    # Map HER2 spellings to a single token and spell out comparison symbols.
    replaces = [["her2|her 2|her two", " hertwo "],
                ["\\>", " greater "],
                ["\\<", " less "]]

    for sub in replaces:
        txt = re.sub(sub[0], sub[1], txt)

    return txt
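
# Illustration only: extra_clean_text("her 2 positive, size > 1 cm", fixed_bigrams)
# maps the HER2 spelling to " hertwo " and the ">" symbol to " greater ";
# grade mentions, if any, are normalised first by add_bigrams.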


def text_cleaning(data, min_length=2, extra_clean=True, remove_punctuation=False):
    """Clean a single report: remove noise, normalise grade mentions and filter
    short tokens with gensim."""

    # Canonical token first, then the spelling variants it replaces. Longer
    # grades are listed first so that "grade iii" is not partially matched by
    # the "grade i" pattern.
    fixed_bigrams = [[' gradethree ', 'grade 3', 'grade iii', 'grade III', 'grade three'],
                     [' gradetwo ', 'grade 2', 'grade ii', 'grade II', 'grade two'],
                     [' gradeone ', 'grade 1', 'grade i', 'grade I', 'grade one']]

    clean_t = remove_noise_text(data)
    if extra_clean:
        clean_t = extra_clean_text(clean_t, fixed_bigrams)

    if remove_punctuation:
        filters = [lambda x: x.lower(), strip_tags, strip_punctuation]
    else:
        filters = [lambda x: x.lower(), strip_tags]

    # Tokenise with gensim and keep only tokens of at least min_length characters.
    clean_t = " ".join(x for x in preprocessing.preprocess_string(clean_t, filters)
                       if len(x) >= min_length)

    clean_t = re.sub(r'\s+', ' ', clean_t)

    return clean_t
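
# Illustration only (hypothetical input): a call such as
#
#   text_cleaning("Neck: 3 cm mass, grade iii (left).", remove_punctuation=True)
#
# blanks the "neck:" heading, expands " cm " to " centimeters ", maps
# "grade iii" to "gradethree", strips punctuation via gensim and drops
# tokens shorter than two characters (so the bare "3" disappears),
# returning roughly "centimeters mass gradethree left".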


def pre_process(data, min_length=2, max_size=64, extra_clean=True, remove_punctuation=False):
    """Clean a report and also partition it into cleaned chunks of at most
    max_size words. Returns (full_cleaned_text, list_of_cleaned_chunks)."""

    data_pre_processed = text_cleaning(data, min_length=min_length, extra_clean=extra_clean,
                                       remove_punctuation=remove_punctuation)

    # Partition the raw data into chunks of at most max_size words.
    sentences = sent_tokenize(data)
    data_pre_processed_chunks, sample = [], ""

    if len(sentences) > 1:
        # Greedily pack whole sentences into a chunk until adding the next one
        # would exceed max_size words, then clean the chunk and start a new one.
        for index, sentence in enumerate(sentences):
            if len(sentence.split()) + len(sample.split()) <= max_size:
                sample += " " + sentence  # keep a space so sentence boundaries do not fuse
            else:
                data_pre_processed_chunks.append(
                    text_cleaning(sample, min_length=min_length, extra_clean=extra_clean,
                                  remove_punctuation=remove_punctuation))
                sample = sentence if index < len(sentences) - 1 else ""

        # Clean whatever is left: the last partial chunk, or the final sentence
        # when it did not fit into the previous chunk.
        if len(sample) == 0:
            clean_data = text_cleaning(sentences[-1], min_length=min_length, extra_clean=extra_clean,
                                       remove_punctuation=remove_punctuation)
        else:
            clean_data = text_cleaning(sample, min_length=min_length, extra_clean=extra_clean,
                                       remove_punctuation=remove_punctuation)

        data_pre_processed_chunks.append(clean_data)

    else:
        # A single sentence: fall back to fixed windows of max_size words.
        words = word_tokenize(data)
        lower_b, upper_b = 0, max_size
        for _ in range(math.ceil(len(words) / max_size)):
            sample = " ".join(w for w in words[lower_b:upper_b])
            lower_b, upper_b = upper_b, upper_b + max_size
            clean_data = text_cleaning(sample, min_length=min_length, extra_clean=extra_clean,
                                       remove_punctuation=remove_punctuation)
            data_pre_processed_chunks.append(clean_data)

    return data_pre_processed, data_pre_processed_chunks
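
# Minimal usage sketch (hypothetical report text, not project data):
#
#   report = ("Technique: MRI neck with contrast. Comparison: none. "
#             "Impression: 1.2 cm enhancing mass, grade III. "
#             "These findings were discussed with Dr. Example.")
#   full_text, chunks = pre_process(report, min_length=2, max_size=16)
#   # full_text -> one cleaned, lowercased string for the whole report
#   # chunks    -> cleaned strings, each built from at most ~16 words of raw text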


if __name__ == '__main__':
    # This module is intended to be imported; running it directly just exits
    # with a non-zero status.
    exit(1)