Thiago committed on
Commit · 6b767f8
1 Parent(s): f6b05c8
first commit
- README.md +29 -1
- app/.DS_Store +0 -0
- app/imgs/.DS_Store +0 -0
- app/imgs/app_1.png +0 -0
- app/imgs/app_2.png +0 -0
- app/imgs/doctor.png +0 -0
- app/imgs/emory_1.png +0 -0
- app/imgs/hybrid_system.png +0 -0
- app/imgs/icon.png +0 -0
- app/imgs/icons8-github-240.png +0 -0
- app/imgs/medical-checkup.png +0 -0
- app/imgs/pipeline.png +0 -0
- app/models/.DS_Store +0 -0
- app/models/all_labels_hierarchy/.gitignore +4 -0
- app/models/higher_order_hierarchy/.gitignore +4 -0
- app/src/__pycache__/app.cpython-38.pyc +0 -0
- app/src/__pycache__/config.cpython-37.pyc +0 -0
- app/src/__pycache__/config.cpython-38.pyc +0 -0
- app/src/__pycache__/download_models.cpython-37.pyc +0 -0
- app/src/__pycache__/pipeline.cpython-37.pyc +0 -0
- app/src/__pycache__/pipeline.cpython-38.pyc +0 -0
- app/src/__pycache__/pipeline.cpython-39.pyc +0 -0
- app/src/__pycache__/text_cleaning.cpython-37.pyc +0 -0
- app/src/__pycache__/text_cleaning.cpython-38.pyc +0 -0
- app/src/__pycache__/text_cleaning_transforerms.cpython-37.pyc +0 -0
- app/src/__pycache__/text_cleaning_transforerms.cpython-38.pyc +0 -0
- app/src/app.py +576 -0
- app/src/config.py +221 -0
- app/src/download_models.py +187 -0
- app/src/label_extraction.py +150 -0
- app/src/pipeline.py +670 -0
- app/src/text_cleaning.py +250 -0
- app/src/text_cleaning_transforerms.py +229 -0
- environment.yml +210 -0
- requirements.txt +180 -0
README.md
CHANGED
@@ -1 +1,29 @@
-#
+# Information Extraction from Breast Cancer Pathology Reports
+
+<!-- TOC -->
+
+- [Instruction Navigation](#information-extraction-from-breast-cancer-pathology-reports)
+- [Web System Application](#web-system-application)
+- [Model and Project Design](#model-and-project-design)
+
+<!-- /TOC -->
+
+
+## Web System Application
+
+We developed a web application that lets users test our proposed pipeline for predicting histopathology reports. Users can interact with the platform in two ways: 1) upload an Excel/CSV spreadsheet with a column containing the biopsy diagnosis (Part A, B, or C), or 2) enter a single biopsy diagnosis. An example of our web system is illustrated below:
+
+<table border=1>
+<tr align='center' >
+<td><img src="https://github.com/thiagosantos1/BreastPathologyClassificationSystem/blob/master/app/imgs/app_1.png" width="500" title="HCS App"></td>
+<td><img src="https://github.com/thiagosantos1/BreastPathologyClassificationSystem/blob/master/app/imgs/app_2.png" width="500" title="HCS App"></td>
+</tr>
+</table>
+
+## Model and Project Design
+<table border=1>
+<tr align='center' >
+<td><img src="https://github.com/thiagosantos1/BreastPathologyClassificationSystem/blob/master/app/imgs/pipeline.png" width="500" title="HCS App"></td>
+<td><img src="https://github.com/thiagosantos1/BreastPathologyClassificationSystem/blob/master/app/imgs/hybrid_system.png" width="400" title="HCS App"></td>
+</tr>
+</table>
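For the file-based workflow, the input only needs one column holding the diagnosis text. Below is a minimal sketch of how such a spreadsheet could be prepared; the column name `diagnosis` and the sample rows are illustrative assumptions, since the app lets you choose the prediction column after upload.

```python
# Minimal sketch of an input file for the web app.
# The column name "diagnosis" and the example rows are assumptions;
# the app's "Column For Prediction" selector lets you pick any column.
import pandas as pd

reports = pd.DataFrame({
    "diagnosis": [
        "Part A: invasive ductal carcinoma, grade ii.",
        "Part B: benign breast tissue with fibrocystic change.",
    ]
})
reports.to_excel("reports.xlsx", index=False)  # or reports.to_csv("reports.csv", index=False)
```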
app/.DS_Store
ADDED
Binary file (6.15 kB).
app/imgs/.DS_Store
ADDED
Binary file (6.15 kB).
app/imgs/app_1.png
ADDED
app/imgs/app_2.png
ADDED
app/imgs/doctor.png
ADDED
app/imgs/emory_1.png
ADDED
app/imgs/hybrid_system.png
ADDED
app/imgs/icon.png
ADDED
app/imgs/icons8-github-240.png
ADDED
app/imgs/medical-checkup.png
ADDED
app/imgs/pipeline.png
ADDED
app/models/.DS_Store
ADDED
Binary file (6.15 kB).
app/models/all_labels_hierarchy/.gitignore
ADDED
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
app/models/higher_order_hierarchy/.gitignore
ADDED
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
app/src/__pycache__/app.cpython-38.pyc
ADDED
Binary file (4.36 kB).
app/src/__pycache__/config.cpython-37.pyc
ADDED
Binary file (3.64 kB).
app/src/__pycache__/config.cpython-38.pyc
ADDED
Binary file (3.65 kB).
app/src/__pycache__/download_models.cpython-37.pyc
ADDED
Binary file (5.47 kB).
app/src/__pycache__/pipeline.cpython-37.pyc
ADDED
Binary file (20 kB).
app/src/__pycache__/pipeline.cpython-38.pyc
ADDED
Binary file (18.2 kB).
app/src/__pycache__/pipeline.cpython-39.pyc
ADDED
Binary file (11.7 kB).
app/src/__pycache__/text_cleaning.cpython-37.pyc
ADDED
Binary file (7.86 kB).
app/src/__pycache__/text_cleaning.cpython-38.pyc
ADDED
Binary file (8.01 kB).
app/src/__pycache__/text_cleaning_transforerms.cpython-37.pyc
ADDED
Binary file (6.09 kB).
app/src/__pycache__/text_cleaning_transforerms.cpython-38.pyc
ADDED
Binary file (6.5 kB).
app/src/app.py
ADDED
@@ -0,0 +1,576 @@
1 |
+
# Copyright (C) 2021, Mindee.
|
2 |
+
|
3 |
+
# This program is licensed under the Apache License version 2.
|
4 |
+
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
|
5 |
+
|
6 |
+
import os
|
7 |
+
import streamlit as st
|
8 |
+
import streamlit.components.v1 as components
|
9 |
+
import time
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
import pandas as pd
|
12 |
+
from pipeline import Pipeline
|
13 |
+
import html
|
14 |
+
from IPython.core.display import display, HTML
|
15 |
+
import json
|
16 |
+
from PIL import Image
|
17 |
+
from tqdm import tqdm
|
18 |
+
import logging
|
19 |
+
from htbuilder import HtmlElement, div, ul, li, br, hr, a, p, img, styles, classes, fonts
|
20 |
+
from htbuilder.units import percent, px
|
21 |
+
from htbuilder.funcs import rgba, rgb
|
22 |
+
import copy
|
23 |
+
from download_models import check_if_exist
|
24 |
+
import re
|
25 |
+
import numpy as np
|
26 |
+
from sklearn.manifold import TSNE
|
27 |
+
from sklearn.decomposition import PCA
|
28 |
+
import plotly.express as plotpx
|
29 |
+
import umap
|
30 |
+
|
31 |
+
def image(src_as_string, **style):
|
32 |
+
return img(src=src_as_string, style=styles(**style))
|
33 |
+
|
34 |
+
|
35 |
+
def link(link, text, **style):
|
36 |
+
return a(_href=link, _target="_blank", style=styles(**style))(text)
|
37 |
+
|
38 |
+
def update_highlight(current,old):
|
39 |
+
out = current
|
40 |
+
matches_background_new = [(m.start(0), m.end(0)) for m in re.finditer("background-color:rgba\\(234, 131, 4,", out)]
|
41 |
+
matches_background_old = [(m.start(0), m.end(0)) for m in re.finditer("background-color:rgba\\(234, 131, 4,", old)]
|
42 |
+
for x,y in zip(matches_background_old,matches_background_new):
|
43 |
+
try:
|
44 |
+
old_importance = re.search("\\d+\\.\\d+",old[x[1]:x[1]+20])
|
45 |
+
new_importance = re.search("\\d+\\.\\d+",current[y[1]:y[1]+20])
|
46 |
+
|
47 |
+
if int(out[y[1]]) ==0 and float(old[x[1]]) != 0:
|
48 |
+
out = out[0:y[1]] + str(old_importance.group(0)) + out[y[1]:]
|
49 |
+
return False,out
|
50 |
+
if float(out[y[1]]) !=0 and float(old[x[1]]) != 0:
|
51 |
+
if float(old[x[1]]) > float(out[y[1]]):
|
52 |
+
out = out[0:y[1]] + str(old_importance.group(0))[0] + out[y[1]:]
|
53 |
+
return False,out
|
54 |
+
except Exception as e:
|
55 |
+
return True, out
|
56 |
+
|
57 |
+
return True,out
|
58 |
+
|
59 |
+
def hide_menu():
|
60 |
+
|
61 |
+
footer_style = """<style>
|
62 |
+
footer {
|
63 |
+
visibility: hidden;
|
64 |
+
}
|
65 |
+
footer:after {
|
66 |
+
content:"An end-to-end Breast Pathology Classification System to infer Breast Cancer Diagnosis and Severity";
|
67 |
+
visibility: visible;
|
68 |
+
display: block;
|
69 |
+
position: center;
|
70 |
+
#background-color: red;
|
71 |
+
padding: 5px;
|
72 |
+
top: 2px;
|
73 |
+
}
|
74 |
+
</style>
|
75 |
+
"""
|
76 |
+
|
77 |
+
st.markdown(footer_style, unsafe_allow_html=True)
|
78 |
+
|
79 |
+
def main(myargs):
|
80 |
+
project_dir = os.path.dirname(os.path.abspath(__file__))
|
81 |
+
|
82 |
+
|
83 |
+
def add_content(columns):
|
84 |
+
if 'hg_df' in st.session_state:
|
85 |
+
columns[1].dataframe(st.session_state.hg_df)
|
86 |
+
if 'all_l' in st.session_state:
|
87 |
+
columns[2].dataframe(st.session_state.all_l)
|
88 |
+
|
89 |
+
if "highlight_samples" in st.session_state:
|
90 |
+
|
91 |
+
if "selected_indices" in st.session_state:
|
92 |
+
if len(st.session_state.selected_indices) >0:
|
93 |
+
out = ""
|
94 |
+
l = st.session_state.selected_indices
|
95 |
+
l.sort()
|
96 |
+
for ind in l:
|
97 |
+
out += st.session_state.highlight_samples[ind] + "<br><br>"
|
98 |
+
components.html(out,scrolling=True)
|
99 |
+
else:
|
100 |
+
components.html(st.session_state.highlight_samples[0])
|
101 |
+
else:
|
102 |
+
components.html(st.session_state.highlight_samples[0])
|
103 |
+
|
104 |
+
|
105 |
+
# Add Plot - Only for File version
|
106 |
+
if st.session_state['input_type'] == 'File' and "embeddings_all" in st.session_state and st.session_state.embeddings_plot in ["2D", "3D"]:
|
107 |
+
indices = [x for x in range(st.session_state.data_df[st.session_state.input_column].values.shape[0])]
|
108 |
+
if "selected_indices" in st.session_state:
|
109 |
+
if len(st.session_state.selected_indices) >=4:
|
110 |
+
l = st.session_state.selected_indices
|
111 |
+
l.sort()
|
112 |
+
indices = l
|
113 |
+
|
114 |
+
if st.session_state.data_df[st.session_state.input_column].values.shape[0] >=2:
|
115 |
+
sub_embeddings = st.session_state.embeddings_all[indices]
|
116 |
+
sentences = st.session_state.data_df[st.session_state.input_column].values[indices]
|
117 |
+
sentences_parses = []
|
118 |
+
break_size = 20
|
119 |
+
for data in sentences:
|
120 |
+
d = data.split()
|
121 |
+
size_sentence = len(d)
|
122 |
+
if len(d) >break_size:
|
123 |
+
out = ""
|
124 |
+
for lower_bound in range(0,size_sentence, break_size):
|
125 |
+
upper_bound = lower_bound + break_size if lower_bound + break_size <= size_sentence else size_sentence
|
126 |
+
out += " ".join(x for x in d[lower_bound:upper_bound]) + "<br>"
|
127 |
+
sentences_parses.append(out)
|
128 |
+
else:
|
129 |
+
sentences_parses.append(data)
|
130 |
+
|
131 |
+
|
132 |
+
|
133 |
+
prediction_label = st.session_state.hg_df["Prediction"].values[indices]
|
134 |
+
prediction_worst_label = []
|
135 |
+
for pred in prediction_label:
|
136 |
+
preds = pred.split(" && ")
|
137 |
+
if len(preds) ==1:
|
138 |
+
prediction_worst_label.extend(preds)
|
139 |
+
else:
|
140 |
+
worst_index = min([st.session_state.predictor.bert_model.config['worst_rank'].index(x) for x in preds])
|
141 |
+
prediction_worst_label.append(st.session_state.predictor.bert_model.config['worst_rank'][worst_index])
|
142 |
+
|
143 |
+
|
144 |
+
if st.session_state.embeddings_type == "PCA":
|
145 |
+
|
146 |
+
low_dim_embeddings = PCA(n_components=3).fit_transform(sub_embeddings)
|
147 |
+
elif st.session_state.embeddings_type == "TSNE":
|
148 |
+
low_dim_embeddings = TSNE(n_components=3,init="pca",perplexity=st.session_state.perplexity,learning_rate=st.session_state.learning_rate).fit_transform(sub_embeddings)
|
149 |
+
|
150 |
+
else:
|
151 |
+
n_neighbors = min(st.session_state.n_neighbors, len(sub_embeddings)-1 )
|
152 |
+
low_dim_embeddings = umap.UMAP(n_neighbors=n_neighbors, min_dist=st.session_state.min_dist,n_components=3).fit(sub_embeddings).embedding_
|
153 |
+
|
154 |
+
df_embeddings = pd.DataFrame(low_dim_embeddings)
|
155 |
+
df_embeddings = df_embeddings.rename(columns={0:'x',1:'y',2:'z'})
|
156 |
+
df_embeddings = df_embeddings.assign(severity=prediction_worst_label)
|
157 |
+
df_embeddings = df_embeddings.assign(text=sentences_parses)
|
158 |
+
df_embeddings = df_embeddings.assign(data_index=indices)
|
159 |
+
df_embeddings = df_embeddings.assign(all_predictions=prediction_label)
|
160 |
+
|
161 |
+
|
162 |
+
if st.session_state.embeddings_plot == "2D":
|
163 |
+
# 2D
|
164 |
+
plot = plotpx.scatter(
|
165 |
+
df_embeddings, x='x', y='y',
|
166 |
+
color='severity', labels={'color': 'severity'},
|
167 |
+
hover_data=['text','all_predictions','data_index'],title = 'BERT Embeddings Visualization - Please select rows (at least 4) to display specific examples'
|
168 |
+
)
|
169 |
+
|
170 |
+
else:
|
171 |
+
# 3D
|
172 |
+
plot = plotpx.scatter_3d(
|
173 |
+
df_embeddings, x='x', y='y', z='z',
|
174 |
+
color='severity', labels={'color': 'severity'},
|
175 |
+
hover_data=['text','all_predictions','data_index'],title = 'BERT Embeddings Visualization - Please select rows (at least 4) to display specific examples'
|
176 |
+
)
|
177 |
+
|
178 |
+
st.plotly_chart(plot,use_container_width=True,)
|
179 |
+
|
180 |
+
|
181 |
+
#worst_rank_ind = [classes.index(x) for x in worst_rank]
|
182 |
+
|
183 |
+
if 'bert_lime_output' in st.session_state and st.session_state.bert_lime:
|
184 |
+
if len(st.session_state.bert_lime_output) >0: # need to re-run prediction
|
185 |
+
st.markdown("BERT Interpretability")
|
186 |
+
components.html(st.session_state.bert_lime_output[0])
|
187 |
+
|
188 |
+
if 'json_output' in st.session_state and st.session_state.json_out:
|
189 |
+
|
190 |
+
st.markdown("Here are your analysis results in JSON format:")
|
191 |
+
out = {}
|
192 |
+
if "selected_indices" in st.session_state:
|
193 |
+
|
194 |
+
if len(st.session_state.selected_indices) >0:
|
195 |
+
l = st.session_state.selected_indices
|
196 |
+
l.sort()
|
197 |
+
for ind in l:
|
198 |
+
out['sample_'+str(ind)] = st.session_state.json_output['sample_'+str(ind)]
|
199 |
+
st.json(out)
|
200 |
+
else:
|
201 |
+
out['sample_'+str(0)] = st.session_state.json_output['sample_'+str(0)]
|
202 |
+
st.json(out)
|
203 |
+
else:
|
204 |
+
# Display JSON
|
205 |
+
out['sample_'+str(0)] = st.session_state.json_output['sample_'+str(0)]
|
206 |
+
st.json(out)
|
207 |
+
|
208 |
+
|
209 |
+
def delete_var_session(keys:list):
|
210 |
+
for key in keys:
|
211 |
+
if key in st.session_state:
|
212 |
+
del st.session_state[key]
|
213 |
+
|
214 |
+
im = Image.open(os.path.join(project_dir, "../imgs/icon.png"))
|
215 |
+
|
216 |
+
|
217 |
+
# Wide mode
|
218 |
+
st.set_page_config(page_title='HCSBC', layout = 'wide',page_icon=im,menu_items={
|
219 |
+
'Get Help': 'https://github.com/thiagosantos1/BreastPathologyClassificationSystem',
|
220 |
+
'Report a bug': "https://github.com/thiagosantos1/BreastPathologyClassificationSystem",
|
221 |
+
'About': "An end-to-end breast pathology classification system https://github.com/thiagosantos1/BreastPathologyClassificationSystem"
|
222 |
+
})
|
223 |
+
st.sidebar.image(os.path.join(project_dir,"../imgs/doctor.png"),use_column_width=False)
|
224 |
+
|
225 |
+
# Designing the interface
|
226 |
+
st.markdown("<h1 style='text-align: center; color: black;'>HCSBC: Hierarchical Classification System for Breast Cancer</h1>", unsafe_allow_html=True)
|
227 |
+
st.markdown("System Pipeline: Pathology Emory Pubmed BERT + 6 independent Machine Learning discriminators")
|
228 |
+
# For newline
|
229 |
+
st.write('\n')
|
230 |
+
# Instructions
|
231 |
+
st.markdown("*Hint: click on the top-right corner to enlarge it!*")
|
232 |
+
# Set the columns
|
233 |
+
|
234 |
+
cols = st.columns((1, 1, 1))
|
235 |
+
#cols = st.columns(4)
|
236 |
+
cols[0].subheader("Input Data")
|
237 |
+
cols[1].subheader("Severity Predictions")
|
238 |
+
cols[2].subheader("Diagnosis Predictions")
|
239 |
+
|
240 |
+
# Sidebar
|
241 |
+
# File selection
|
242 |
+
st.sidebar.title("Data Selection")
|
243 |
+
|
244 |
+
st.session_state['input_type'] = st.sidebar.radio("Input Selection", ('File', 'Text'), key="data_format")
|
245 |
+
if "prev_input_type" not in st.session_state:
|
246 |
+
st.session_state['prev_input_type'] = st.session_state.input_type
|
247 |
+
|
248 |
+
st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
|
249 |
+
|
250 |
+
|
251 |
+
# Disabling warning
|
252 |
+
st.set_option('deprecation.showfileUploaderEncoding', False)
|
253 |
+
|
254 |
+
|
255 |
+
if st.session_state['input_type'] == 'File':
|
256 |
+
if st.session_state['prev_input_type'] == 'Text':
|
257 |
+
delete_var_session(keys=["data_df","data_columns","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
|
258 |
+
st.session_state['prev_input_type'] = "File"
|
259 |
+
|
260 |
+
# Choose your own file
|
261 |
+
new_file = st.sidebar.file_uploader("Upload Document", type=['xlsx','csv'])
|
262 |
+
if 'uploaded_file' in st.session_state and st.session_state.uploaded_file != None and new_file != None:
|
263 |
+
if st.session_state.uploaded_file.name != new_file.name and st.session_state.uploaded_file.id != new_file.id:
|
264 |
+
delete_var_session(keys=["data_df","data_columns","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
|
265 |
+
|
266 |
+
st.session_state['uploaded_file'] = new_file
|
267 |
+
|
268 |
+
data_columns = ['Input']
|
269 |
+
if 'data_columns' not in st.session_state:
|
270 |
+
st.session_state['data_columns'] = data_columns
|
271 |
+
|
272 |
+
if st.session_state.uploaded_file is not None:
|
273 |
+
if 'data_df' not in st.session_state:
|
274 |
+
if st.session_state.uploaded_file.name.endswith('.xlsx'):
|
275 |
+
df = pd.read_excel(st.session_state.uploaded_file)
|
276 |
+
else:
|
277 |
+
df = pd.read_csv(st.session_state.uploaded_file)
|
278 |
+
|
279 |
+
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
|
280 |
+
df = df.fillna("NA")
|
281 |
+
data_columns = df.columns.values
|
282 |
+
st.session_state['data_df'] = df
|
283 |
+
st.session_state['data_columns'] = data_columns
|
284 |
+
else:
|
285 |
+
if st.session_state['prev_input_type'] == 'File':
|
286 |
+
delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
|
287 |
+
st.session_state['prev_input_type'] = "Text"
|
288 |
+
|
289 |
+
input_column = "Input"
|
290 |
+
data = st.sidebar.text_area("Please enter a breast cancer pathology diagnosis")
|
291 |
+
if "user_input" in st.session_state:
|
292 |
+
if data != st.session_state.user_input:
|
293 |
+
delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
|
294 |
+
|
295 |
+
st.session_state['user_input'] = data
|
296 |
+
if len(st.session_state.user_input.split()) >0:
|
297 |
+
st.session_state['data_df'] = pd.DataFrame([st.session_state['user_input']], columns =[input_column])
|
298 |
+
st.session_state['input_column'] = input_column
|
299 |
+
st.session_state['uploaded_file'] = True
|
300 |
+
else:
|
301 |
+
delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
|
302 |
+
|
303 |
+
|
304 |
+
if 'data_df' in st.session_state:
|
305 |
+
cols[0].dataframe(st.session_state.data_df)
|
306 |
+
|
307 |
+
|
308 |
+
if st.session_state['input_type'] == 'File':
|
309 |
+
# Columns selection
|
310 |
+
st.sidebar.write('\n')
|
311 |
+
st.sidebar.title("Column For Prediction")
|
312 |
+
input_column = st.sidebar.selectbox("Columns", st.session_state.data_columns)
|
313 |
+
|
314 |
+
st.session_state['input_column'] = input_column
|
315 |
+
|
316 |
+
|
317 |
+
st.sidebar.write('\n')
|
318 |
+
st.sidebar.title("Severity Model")
|
319 |
+
input_higher = st.sidebar.selectbox("Model", ["PathologyEmoryPubMedBERT"])
|
320 |
+
st.session_state['input_higher'] = input_higher
|
321 |
+
|
322 |
+
if "prev_input_higher" not in st.session_state:
|
323 |
+
st.session_state['prev_input_higher'] = st.session_state.input_higher
|
324 |
+
st.session_state['input_higher_exist'] = check_if_exist(st.session_state.input_higher)
|
325 |
+
st.session_state['load_new_higher_model'] = True
|
326 |
+
elif st.session_state.prev_input_higher != st.session_state.input_higher:
|
327 |
+
st.session_state['input_higher_exist'] = check_if_exist(st.session_state.input_higher)
|
328 |
+
st.session_state['prev_input_higher'] = st.session_state.input_higher
|
329 |
+
st.session_state['load_new_higher_model'] = True
|
330 |
+
delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
|
331 |
+
|
332 |
+
|
333 |
+
st.sidebar.write('\n')
|
334 |
+
st.sidebar.title("Diagnosis Model")
|
335 |
+
input_all_labels = st.sidebar.selectbox("Model", ['single_vectorizer', 'branch_vectorizer'])
|
336 |
+
st.session_state['input_all_labels'] = input_all_labels
|
337 |
+
|
338 |
+
if "prev_input_all_labels" not in st.session_state:
|
339 |
+
st.session_state['prev_input_all_labels'] = st.session_state.input_all_labels
|
340 |
+
st.session_state['input_all_labels_exist'] = check_if_exist(st.session_state.input_all_labels)
|
341 |
+
st.session_state['load_new_all_label_model'] = True
|
342 |
+
elif st.session_state.prev_input_all_labels != st.session_state.input_all_labels:
|
343 |
+
st.session_state['input_all_labels_exist'] = check_if_exist(st.session_state.input_all_labels)
|
344 |
+
st.session_state['prev_input_all_labels'] = st.session_state.input_all_labels
|
345 |
+
st.session_state['load_new_all_label_model'] = True
|
346 |
+
delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
|
347 |
+
|
348 |
+
|
349 |
+
# For newline
|
350 |
+
st.sidebar.write('\n')
|
351 |
+
st.sidebar.title("Analysis Options")
|
352 |
+
|
353 |
+
predictions, json_output, higher_order_pred,all_labels_pred,higher_order_prob,all_labels_prob = {},[],[],[],[],[]
|
354 |
+
hg_df, all_l,highlight_samples, bert_lime_output, embeddings_all= [],[],[],[],[]
|
355 |
+
|
356 |
+
|
357 |
+
if st.session_state['input_type'] == 'File':
|
358 |
+
embeddings_plot = st.sidebar.radio('Display embeddings plot',
|
359 |
+
['2D',
|
360 |
+
'3D',
|
361 |
+
'Dont Display'],index=1)
|
362 |
+
|
363 |
+
st.session_state['embeddings_plot'] = embeddings_plot
|
364 |
+
|
365 |
+
else:
|
366 |
+
st.session_state['embeddings_plot'] = 'Dont Display'
|
367 |
+
|
368 |
+
if st.session_state['input_type'] == 'File':
|
369 |
+
embeddings_type = st.sidebar.radio('Dimensionality Reduction',
|
370 |
+
['PCA',
|
371 |
+
'TSNE','UMAP'],index=0)
|
372 |
+
|
373 |
+
st.session_state['embeddings_type'] = embeddings_type
|
374 |
+
|
375 |
+
if st.session_state.embeddings_type == "TSNE":
|
376 |
+
perplexity = st.sidebar.slider("Perplexity", min_value=5, max_value=100, step=5, value=30)
|
377 |
+
st.session_state['perplexity'] = perplexity
|
378 |
+
|
379 |
+
learning_rate = st.sidebar.slider("Learning Rate", min_value=10, max_value=1000, step=10, value=100)
|
380 |
+
st.session_state['learning_rate'] = learning_rate
|
381 |
+
|
382 |
+
if st.session_state.embeddings_type == "UMAP":
|
383 |
+
n_neighbors = st.sidebar.slider("Neighbors", min_value=2, max_value=100, step=1, value=2)
|
384 |
+
st.session_state['n_neighbors'] = n_neighbors
|
385 |
+
|
386 |
+
min_dist = st.sidebar.slider("Minimal Distance", min_value=0.1, max_value=0.99, step=0.05, value=0.1)
|
387 |
+
st.session_state['min_dist'] = min_dist
|
388 |
+
|
389 |
+
json_out = st.sidebar.checkbox('Display Json',value = True,key='check3')
|
390 |
+
st.session_state['json_out'] = json_out
|
391 |
+
|
392 |
+
if st.session_state['input_type'] == 'Text':
|
393 |
+
bert_lime = st.sidebar.checkbox('Display BERT Interpretability',value = False,key='check3')
|
394 |
+
st.session_state['bert_lime'] = bert_lime
|
395 |
+
else:
|
396 |
+
st.session_state['bert_lime'] = False
|
397 |
+
|
398 |
+
|
399 |
+
# For newline
|
400 |
+
st.sidebar.write('\n')
|
401 |
+
st.sidebar.title("Prediction")
|
402 |
+
|
403 |
+
|
404 |
+
if st.sidebar.button("Run Prediction"):
|
405 |
+
|
406 |
+
if st.session_state.uploaded_file is None:
|
407 |
+
st.sidebar.write("Please upload a your data")
|
408 |
+
|
409 |
+
else:
|
410 |
+
st.session_state['input_all_labels_exist'] = check_if_exist(st.session_state.input_all_labels)
|
411 |
+
if not st.session_state.input_all_labels_exist:
|
412 |
+
st.sidebar.write("Please Download Model: " + str(st.session_state.input_all_labels))
|
413 |
+
|
414 |
+
st.session_state['input_higher_exist'] = check_if_exist(st.session_state.input_higher)
|
415 |
+
if not st.session_state.input_higher_exist:
|
416 |
+
st.sidebar.write("Please Download Model: " + str(st.session_state.input_higher))
|
417 |
+
|
418 |
+
if st.session_state.input_all_labels_exist and st.session_state.input_higher_exist:
|
419 |
+
if "predictor" not in st.session_state or st.session_state.load_new_higher_model or st.session_state.load_new_all_label_model:
|
420 |
+
with st.spinner('Loading model...'):
|
421 |
+
print("\n\tLoading Model")
|
422 |
+
st.session_state["predictor"] = Pipeline(bert_option=str(st.session_state.input_higher), branch_option=str(st.session_state.input_all_labels))
|
423 |
+
st.session_state['load_new_higher_model'] = False
|
424 |
+
st.session_state['load_new_all_label_model'] = False
|
425 |
+
|
426 |
+
with st.spinner('Transforming Data...'):
|
427 |
+
data = st.session_state.data_df[st.session_state.input_column].values
|
428 |
+
|
429 |
+
with st.spinner('Analyzing...'):
|
430 |
+
time.sleep(0.1)
|
431 |
+
prog_bar = st.progress(0)
|
432 |
+
logging.info("Running Predictions for data size of: " + str(len(data)))
|
433 |
+
logging.info("\n\tRunning Predictions with: " + str(st.session_state.input_higher) + str(st.session_state.input_all_labels))
|
434 |
+
for index in tqdm(range(len(data))):
|
435 |
+
d = data[index]
|
436 |
+
time.sleep(0.1)
|
437 |
+
prog_bar.progress(int( (100/len(data)) * (index+1) ))
|
438 |
+
# refactor json
|
439 |
+
preds,embeddings_output = st.session_state.predictor.run(d)
|
440 |
+
embeddings = embeddings_output.tolist()
|
441 |
+
embeddings_all.append(embeddings[0])
|
442 |
+
if st.session_state.bert_lime:
|
443 |
+
logging.info("Running BERT LIME Interpretability Predictions")
|
444 |
+
bert_lime_output.append(st.session_state.predictor.bert_interpretability(d))
|
445 |
+
|
446 |
+
predictions["sample_" + str(index)] = {}
|
447 |
+
for ind,pred in enumerate(preds):
|
448 |
+
predictions["sample_" + str(index)]["prediction_" + str(ind)] = pred
|
449 |
+
|
450 |
+
|
451 |
+
prog_bar.progress(100)
|
452 |
+
time.sleep(0.1)
|
453 |
+
|
454 |
+
for key,sample in predictions.items():
|
455 |
+
higher,all_p, prob_higher, prob_all = [],[],[],[]
|
456 |
+
for key,pred in sample.items():
|
457 |
+
for higher_order, sub_arr in pred.items():
|
458 |
+
higher.append(higher_order)
|
459 |
+
prob_higher.append(round(sub_arr["probability"], 2))
|
460 |
+
for label,v in sub_arr['labels'].items():
|
461 |
+
all_p.append(label)
|
462 |
+
prob_all.append(round(v["probability"], 2))
|
463 |
+
|
464 |
+
higher_order_pred.append(" && ".join(x for x in higher))
|
465 |
+
all_labels_pred.append(" && ".join(x for x in all_p))
|
466 |
+
|
467 |
+
higher_order_prob.append(" && ".join(str(x) for x in prob_higher))
|
468 |
+
all_labels_prob.append(" && ".join(str(x) for x in prob_all))
|
469 |
+
|
470 |
+
predictions_refact = copy.deepcopy(predictions)
|
471 |
+
|
472 |
+
for index in tqdm(range(len(data))):
|
473 |
+
highlights = ""
|
474 |
+
key = "sample_" + str(index)
|
475 |
+
for k,v in predictions[key].items():
|
476 |
+
for k_s, v_s in v.items():
|
477 |
+
predictions_refact["sample_" + str(index)]["data"] = v_s['data']
|
478 |
+
predictions_refact["sample_" + str(index)]["transformer_data"] = v_s['transformer_data']
|
479 |
+
predictions_refact["sample_" + str(index)]["discriminator_data"] = v_s['word_analysis']['discriminator_data']
|
480 |
+
highlight = v_s['word_analysis']['highlighted_html_text']
|
481 |
+
|
482 |
+
if len(highlights) >0:
|
483 |
+
done = False
|
484 |
+
merged = highlight
|
485 |
+
while not done:
|
486 |
+
done,merged = update_highlight(merged,highlights)
|
487 |
+
|
488 |
+
highlights = merged
|
489 |
+
else:
|
490 |
+
highlights = highlight
|
491 |
+
|
492 |
+
del predictions_refact[key][k][k_s]['data']
|
493 |
+
del predictions_refact[key][k][k_s]['transformer_data']
|
494 |
+
del predictions_refact[key][k][k_s]['word_analysis']['discriminator_data']
|
495 |
+
|
496 |
+
highlight_samples.append(highlights)
|
497 |
+
|
498 |
+
json_output = predictions_refact
|
499 |
+
|
500 |
+
hg_df = pd.DataFrame(list(zip(higher_order_pred, higher_order_prob)), columns =['Prediction', "Probability"])
|
501 |
+
all_l = pd.DataFrame(list(zip(all_labels_pred,all_labels_prob)), columns =['Prediction',"Probability"])
|
502 |
+
all_preds = pd.DataFrame(list(zip(higher_order_pred, all_labels_pred)), columns =['Severity Prediction',"Diagnosis Prediction"])
|
503 |
+
|
504 |
+
st.session_state['hg_df'] = hg_df
|
505 |
+
st.session_state['all_l'] = all_l
|
506 |
+
st.session_state['all_preds'] = all_preds
|
507 |
+
st.session_state['json_output'] = json_output
|
508 |
+
st.session_state['highlight_samples'] = highlight_samples
|
509 |
+
st.session_state['highlight_samples_df'] = pd.DataFrame(highlight_samples, columns =["HTML Word Importance"])
|
510 |
+
st.session_state['bert_lime_output'] = bert_lime_output
|
511 |
+
st.session_state['embeddings_all'] = np.asarray(embeddings_all)
|
512 |
+
|
513 |
+
if 'data_df' in st.session_state and 'json_output' in st.session_state:
|
514 |
+
st.markdown("<h1 style='text-align: center; color: purple;'>Model Analysis</h1>", unsafe_allow_html=True)
|
515 |
+
selected_indices = st.multiselect('Select Rows to Display Word Importance, Embeddings Visualization, and Json Analysis:', [x for x in range(len(st.session_state.data_df))])
|
516 |
+
st.session_state['selected_indices'] = selected_indices
|
517 |
+
|
518 |
+
add_content(cols)
|
519 |
+
|
520 |
+
|
521 |
+
if 'json_output' in st.session_state:
|
522 |
+
st.sidebar.write('\n')
|
523 |
+
st.sidebar.title("Save Results")
|
524 |
+
|
525 |
+
st.sidebar.write('\n')
|
526 |
+
st.sidebar.download_button(
|
527 |
+
label="Download Output Json",
|
528 |
+
data=str(st.session_state.json_output),
|
529 |
+
file_name="output.json",
|
530 |
+
)
|
531 |
+
st.sidebar.download_button(
|
532 |
+
label="Download Predictions",
|
533 |
+
data=st.session_state.all_preds.to_csv(),
|
534 |
+
file_name="predictions.csv",
|
535 |
+
)
|
536 |
+
st.sidebar.download_button(
|
537 |
+
label="Download Data + Predictions",
|
538 |
+
data = pd.concat([st.session_state.data_df, st.session_state.all_preds,st.session_state.highlight_samples_df], axis=1, join='inner').to_csv(),
|
539 |
+
file_name="data_predictions.csv",
|
540 |
+
)
|
541 |
+
|
542 |
+
st.sidebar.write('\n')
|
543 |
+
st.sidebar.title("Contact Me")
|
544 |
+
sub_colms = st.sidebar.columns([1, 1, 1])
|
545 |
+
sub_colms[0].markdown('''<a href="https://github.com/thiagosantos1/BreastPathologyClassificationSystem">
|
546 |
+
<img src="https://img.icons8.com/fluency/48/000000/github.png" /></a>''',unsafe_allow_html=True)
|
547 |
+
sub_colms[1].markdown('''<a href="https://twitter.com/intent/follow?original_referer=https%3A%2F%2Fgithub.com%2Ftsantos_maia&screen_name=tsantos_maia">
|
548 |
+
<img src="https://img.icons8.com/color/48/000000/twitter--v1.png" /></a>''',unsafe_allow_html=True)
|
549 |
+
sub_colms[2].markdown('''<a href="https://www.linkedin.com/in/thiagosantos-cs/">
|
550 |
+
<img src="https://img.icons8.com/color/48/000000/linkedin.png" /></a>''',unsafe_allow_html=True)
|
551 |
+
|
552 |
+
|
553 |
+
hide_menu()
|
554 |
+
|
555 |
+
|
556 |
+
|
557 |
+
|
558 |
+
if __name__ == '__main__':
|
559 |
+
|
560 |
+
myargs = [
|
561 |
+
"Made in ",
|
562 |
+
image('https://avatars3.githubusercontent.com/u/45109972?s=400&v=4',
|
563 |
+
width=px(25), height=px(25)),
|
564 |
+
" with ❤️ by ",
|
565 |
+
link("https://www.linkedin.com/in/thiagosantos-cs/", "@thiagosantos-cs"),
|
566 |
+
br(),
|
567 |
+
link("https://www.linkedin.com/in/thiagosantos-cs/", image('https://img.icons8.com/color/48/000000/twitter--v1.png')),
|
568 |
+
link("https://github.com/thiagosantos1/BreastPathologyClassificationSystem", image('https://img.icons8.com/fluency/48/000000/github.png')),
|
569 |
+
]
|
570 |
+
logging.basicConfig(
|
571 |
+
format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",
|
572 |
+
datefmt="%d/%m/%Y %H:%M:%S",
|
573 |
+
level=logging.INFO)
|
574 |
+
main(myargs)
|
575 |
+
|
576 |
+
|
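Outside the Streamlit UI, the same prediction path can be exercised directly. The sketch below mirrors the calls `app.py` makes (`Pipeline(...)` and `predictor.run(...)`); the model names come from the options shown in the sidebar, and the input text is only an example.

```python
# Minimal sketch of calling the pipeline the same way app.py does.
# Model names and the input text are examples; the models must already
# have been downloaded (see download_models.py).
from pipeline import Pipeline

predictor = Pipeline(bert_option="PathologyEmoryPubMedBERT",
                     branch_option="single_vectorizer")
preds, embeddings = predictor.run("Part A: invasive ductal carcinoma, grade ii.")
for pred in preds:
    # each prediction maps a higher-order class to its probability and branch labels
    print(pred)
```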
app/src/config.py
ADDED
@@ -0,0 +1,221 @@
1 |
+
"""
|
2 |
+
Input config for pipeline
|
3 |
+
"""
|
4 |
+
|
5 |
+
def config_file() -> dict:
|
6 |
+
config = {
|
7 |
+
"BERT_config": {
|
8 |
+
"model_emb": 'bert',
|
9 |
+
|
10 |
+
"model_option": {
|
11 |
+
"PathologyEmoryPubMedBERT": {
|
12 |
+
"model_folder":"../models/higher_order_hierarchy/PathologyEmoryPubMedBERT/"
|
13 |
+
},
|
14 |
+
"PathologyEmoryBERT": {
|
15 |
+
"model_folder":"../models/higher_order_hierarchy/PathologyEmoryBERT/"
|
16 |
+
},
|
17 |
+
"ClinicalBERT": {
|
18 |
+
"model_folder":"../models/higher_order_hierarchy/ClinicalBERT/"
|
19 |
+
},
|
20 |
+
"BlueBERT": {
|
21 |
+
"model_folder":"../models/higher_order_hierarchy/BlueBERT/"
|
22 |
+
},
|
23 |
+
"BioBERT": {
|
24 |
+
"model_folder":"../models/higher_order_hierarchy/BioBERT/"
|
25 |
+
},
|
26 |
+
"BERT": {
|
27 |
+
"model_folder":"../models/higher_order_hierarchy/BERT/"
|
28 |
+
},
|
29 |
+
|
30 |
+
},
|
31 |
+
"max_seq_length": "64",
|
32 |
+
"threshold_prediction":0.5,
|
33 |
+
"classes": ['Invasive breast cancer-IBC','Non-breast cancer-NBC','In situ breast cancer-ISC',
|
34 |
+
'Borderline lesion-BLL','High risk lesion-HRL','Benign-B','Negative'],
|
35 |
+
"worst_rank" : ['Invasive breast cancer-IBC', 'In situ breast cancer-ISC', 'High risk lesion-HRL',
|
36 |
+
'Borderline lesion-BLL','Benign-B','Non-breast cancer-NBC','Negative']
|
37 |
+
},
|
38 |
+
|
39 |
+
|
40 |
+
"ibc_config": {
|
41 |
+
|
42 |
+
"model_option": {
|
43 |
+
"single_tfidf": {
|
44 |
+
"path_model":"../models/all_labels_hierarchy/single_tfidf/classifiers",
|
45 |
+
"model": "ibc_xgboost_classifier.pkl",
|
46 |
+
"path_vectorizer":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
47 |
+
"vectorizer":"vectorizer_all_branches.pkl",
|
48 |
+
"path_bigrmas":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
49 |
+
"bigrams":"best_bigrams.csv",
|
50 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
51 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
52 |
+
},
|
53 |
+
|
54 |
+
"branch_tfidf": {
|
55 |
+
"path_model":"../models/all_labels_hierarchy/branch_tfidf/classifiers",
|
56 |
+
"model": "ibc_xgboost_classifier.pkl",
|
57 |
+
"path_vectorizer":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
58 |
+
"vectorizer":"ibc_vectorizer.pkl",
|
59 |
+
"path_bigrmas":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
60 |
+
"bigrams":"best_bigrams.csv",
|
61 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
62 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
63 |
+
}
|
64 |
+
},
|
65 |
+
|
66 |
+
"classes": ['apocrine carcinoma','grade i','grade ii','grade iii','invasive ductal carcinoma','invasive lobular carcinoma','medullary carcinoma','metaplastic carcinoma','mucinous carcinoma','tubular carcinoma','lymph node - metastatic']
|
67 |
+
|
68 |
+
},
|
69 |
+
|
70 |
+
"isc_config": {
|
71 |
+
"model_option": {
|
72 |
+
"single_tfidf": {
|
73 |
+
"path_model":"../models/all_labels_hierarchy/single_tfidf/classifiers",
|
74 |
+
"model": "isc_xgboost_classifier.pkl",
|
75 |
+
"path_vectorizer":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
76 |
+
"vectorizer":"vectorizer_all_branches.pkl",
|
77 |
+
"path_bigrmas":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
78 |
+
"bigrams":"best_bigrams.csv",
|
79 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
80 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
81 |
+
},
|
82 |
+
|
83 |
+
"branch_tfidf": {
|
84 |
+
"path_model":"../models/all_labels_hierarchy/branch_tfidf/classifiers",
|
85 |
+
"model": "isc_xgboost_classifier.pkl",
|
86 |
+
"path_vectorizer":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
87 |
+
"vectorizer":"isc_vectorizer.pkl",
|
88 |
+
"path_bigrmas":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
89 |
+
"bigrams":"best_bigrams.csv",
|
90 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
91 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
92 |
+
}
|
93 |
+
},
|
94 |
+
|
95 |
+
|
96 |
+
"classes": ['ductal carcinoma in situ','high','intermediate','intracystic papillary carcinoma','intraductal papillary carcinoma','low','pagets','fna - malignant']
|
97 |
+
|
98 |
+
},
|
99 |
+
|
100 |
+
"hrl_config": {
|
101 |
+
"model_option": {
|
102 |
+
"single_tfidf": {
|
103 |
+
"path_model":"../models/all_labels_hierarchy/single_tfidf/classifiers",
|
104 |
+
"model": "hrl_xgboost_classifier.pkl",
|
105 |
+
"path_vectorizer":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
106 |
+
"vectorizer":"vectorizer_all_branches.pkl",
|
107 |
+
"path_bigrmas":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
108 |
+
"bigrams":"best_bigrams.csv",
|
109 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
110 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
111 |
+
},
|
112 |
+
|
113 |
+
"branch_tfidf": {
|
114 |
+
"path_model":"../models/all_labels_hierarchy/branch_tfidf/classifiers",
|
115 |
+
"model": "hrl_xgboost_classifier.pkl",
|
116 |
+
"path_vectorizer":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
117 |
+
"vectorizer":"hrl_vectorizer.pkl",
|
118 |
+
"path_bigrmas":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
119 |
+
"bigrams":"best_bigrams.csv",
|
120 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
121 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
122 |
+
}
|
123 |
+
},
|
124 |
+
|
125 |
+
|
126 |
+
"classes": ['atypical ductal hyperplasia','atypical lobular hyperplasia','atypical papilloma','columnar cell change with atypia','flat epithelial atypia','hyperplasia with atypia','intraductal papilloma','lobular carcinoma in situ','microscopic papilloma','radial scar']
|
127 |
+
},
|
128 |
+
|
129 |
+
"bll_config": {
|
130 |
+
"model_option": {
|
131 |
+
"single_tfidf": {
|
132 |
+
"path_model":"../models/all_labels_hierarchy/single_tfidf/classifiers",
|
133 |
+
"model": "bll_xgboost_classifier.pkl",
|
134 |
+
"path_vectorizer":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
135 |
+
"vectorizer":"vectorizer_all_branches.pkl",
|
136 |
+
"path_bigrmas":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
137 |
+
"bigrams":"best_bigrams.csv",
|
138 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
139 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
140 |
+
},
|
141 |
+
|
142 |
+
"branch_tfidf": {
|
143 |
+
"path_model":"../models/all_labels_hierarchy/branch_tfidf/classifiers",
|
144 |
+
"model": "bll_xgboost_classifier.pkl",
|
145 |
+
"path_vectorizer":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
146 |
+
"vectorizer":"bll_vectorizer.pkl",
|
147 |
+
"path_bigrmas":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
148 |
+
"bigrams":"best_bigrams.csv",
|
149 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
150 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
151 |
+
}
|
152 |
+
},
|
153 |
+
|
154 |
+
|
155 |
+
"classes": ['atypical phyllodes', 'granular cell tumor', 'mucocele']
|
156 |
+
},
|
157 |
+
|
158 |
+
"benign_config": {
|
159 |
+
"model_option": {
|
160 |
+
"single_tfidf": {
|
161 |
+
"path_model":"../models/all_labels_hierarchy/single_tfidf/classifiers",
|
162 |
+
"model": "benign_xgboost_classifier.pkl",
|
163 |
+
"path_vectorizer":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
164 |
+
"vectorizer":"vectorizer_all_branches.pkl",
|
165 |
+
"path_bigrmas":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
166 |
+
"bigrams":"best_bigrams.csv",
|
167 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
168 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
169 |
+
},
|
170 |
+
|
171 |
+
"branch_tfidf": {
|
172 |
+
"path_model":"../models/all_labels_hierarchy/branch_tfidf/classifiers",
|
173 |
+
"model": "benign_xgboost_classifier.pkl",
|
174 |
+
"path_vectorizer":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
175 |
+
"vectorizer":"benign_vectorizer.pkl",
|
176 |
+
"path_bigrmas":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
177 |
+
"bigrams":"best_bigrams.csv",
|
178 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
179 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
180 |
+
}
|
181 |
+
},
|
182 |
+
|
183 |
+
|
184 |
+
"classes": ['apocrine metaplasia','biopsy site changes','columnar cell change without atypia','cyst','excisional or post-surgical change','fat necrosis','fibroadenoma','fibroadenomatoid','fibrocystic disease','fibromatoses','fibrosis','hamartoma','hemangioma','lactational change','lymph node - benign','myofibroblastoma','myxoma','phyllodes','pseudoangiomatous stromal hyperplasia','sclerosing adenosis','usual ductal hyperplasia','fna - benign','seroma']
|
185 |
+
},
|
186 |
+
|
187 |
+
"nbc_config": {
|
188 |
+
"model_option": {
|
189 |
+
"single_tfidf": {
|
190 |
+
"path_model":"../models/all_labels_hierarchy/single_tfidf/classifiers",
|
191 |
+
"model": "nbc_xgboost_classifier.pkl",
|
192 |
+
"path_vectorizer":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
193 |
+
"vectorizer":"vectorizer_all_branches.pkl",
|
194 |
+
"path_bigrmas":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
195 |
+
"bigrams":"best_bigrams.csv",
|
196 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/single_tfidf/vectorizers",
|
197 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
198 |
+
},
|
199 |
+
|
200 |
+
"branch_tfidf": {
|
201 |
+
"path_model":"../models/all_labels_hierarchy/branch_tfidf/classifiers",
|
202 |
+
"model": "nbc_xgboost_classifier.pkl",
|
203 |
+
"path_vectorizer":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
204 |
+
"vectorizer":"nbc_vectorizer.pkl",
|
205 |
+
"path_bigrmas":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
206 |
+
"bigrams":"best_bigrams.csv",
|
207 |
+
"path_phrase_bigrams":"../models/all_labels_hierarchy/branch_tfidf/vectorizers",
|
208 |
+
"phrase_bigrams" : "phrase_bigrams.pkl"
|
209 |
+
}
|
210 |
+
},
|
211 |
+
|
212 |
+
|
213 |
+
"classes": ['lymphoma', 'malignant(sarcomas)', 'non-breast metastasis']
|
214 |
+
},
|
215 |
+
}
|
216 |
+
|
217 |
+
return config
|
218 |
+
|
219 |
+
if __name__ == '__main__':
|
220 |
+
pass
|
221 |
+
|
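`config_file()` simply returns a nested dictionary, so downstream modules can look up model folders and class lists by key. A small usage sketch, with keys taken from the config above:

```python
# Minimal sketch of reading the pipeline configuration from config.py.
from config import config_file

cfg = config_file()
print(cfg["BERT_config"]["model_option"]["PathologyEmoryPubMedBERT"]["model_folder"])
print(cfg["BERT_config"]["classes"])   # higher-order severity classes
print(cfg["ibc_config"]["classes"])    # branch labels for invasive breast cancer
```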
app/src/download_models.py
ADDED
@@ -0,0 +1,187 @@
1 |
+
|
2 |
+
""" Download pre-trained models from Google drive. """
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
import zipfile
|
6 |
+
import logging
|
7 |
+
import requests
|
8 |
+
from tqdm import tqdm
|
9 |
+
import fire
|
10 |
+
import re
|
11 |
+
|
12 |
+
logging.basicConfig(
|
13 |
+
format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",
|
14 |
+
datefmt="%d/%m/%Y %H:%M:%S",
|
15 |
+
level=logging.INFO)
|
16 |
+
|
17 |
+
|
18 |
+
"", "", "", "","",""
|
19 |
+
|
20 |
+
|
21 |
+
MODEL_TO_URL = {
|
22 |
+
|
23 |
+
'PathologyEmoryPubMedBERT': 'https://drive.google.com/open?id=1l_el_mYXoTIQvGwKN2NZbp97E4svH4Fh',
|
24 |
+
'PathologyEmoryBERT': 'https://drive.google.com/open?id=11vzo6fJBw1RcdHVBAh6nnn8yua-4kj2IX',
|
25 |
+
'ClinicalBERT': 'https://drive.google.com/open?id=1UK9HqSspVneK8zGg7B93vIdTGKK9MI_v',
|
26 |
+
'BlueBERT': 'https://drive.google.com/open?id=1o-tcItErOiiwqZ-YRa3sMM3hGB4d3WkP',
|
27 |
+
'BioBERT': 'https://drive.google.com/open?id=1m7EkWkFBIBuGbfwg7j0R_WINNnYk3oS9',
|
28 |
+
'BERT': 'https://drive.google.com/open?id=1SB_AQAAsHkF79iSAaB3kumYT1rwcOJru',
|
29 |
+
|
30 |
+
'single_tfidf': 'https://drive.google.com/open?id=1-hxf7sKRtFGMOenlafdkeAr8_9pOz6Ym',
|
31 |
+
'branch_tfidf': 'https://drive.google.com/open?id=1pDSnwLFn3YzPRac9rKFV_FN9kdzj2Lb0'
|
32 |
+
}
|
33 |
+
|
34 |
+
"""
|
35 |
+
For large files, Google Drive requires a virus-scan confirmation.
|
36 |
+
This function is responsible for extracting the direct download link from that confirmation page.
|
37 |
+
"""
|
38 |
+
def get_url_from_gdrive_confirmation(contents):
|
39 |
+
url = ""
|
40 |
+
for line in contents.splitlines():
|
41 |
+
m = re.search(r'href="(\/uc\?export=download[^"]+)', line)
|
42 |
+
if m:
|
43 |
+
url = "https://docs.google.com" + m.groups()[0]
|
44 |
+
url = url.replace("&", "&")
|
45 |
+
break
|
46 |
+
m = re.search('id="downloadForm" action="(.+?)"', line)
|
47 |
+
if m:
|
48 |
+
url = m.groups()[0]
|
49 |
+
url = url.replace("&", "&")
|
50 |
+
break
|
51 |
+
m = re.search('"downloadUrl":"([^"]+)', line)
|
52 |
+
if m:
|
53 |
+
url = m.groups()[0]
|
54 |
+
url = url.replace("\\u003d", "=")
|
55 |
+
url = url.replace("\\u0026", "&")
|
56 |
+
break
|
57 |
+
m = re.search('<p class="uc-error-subcaption">(.*)</p>', line)
|
58 |
+
if m:
|
59 |
+
error = m.groups()[0]
|
60 |
+
raise RuntimeError(error)
|
61 |
+
if not url:
|
62 |
+
return None
|
63 |
+
return url
|
64 |
+
|
65 |
+
def download_file_from_google_drive(id, destination):
|
66 |
+
URL = "https://docs.google.com/uc?export=download"
|
67 |
+
|
68 |
+
session = requests.Session()
|
69 |
+
|
70 |
+
|
71 |
+
response = session.get(URL, params={ 'id' : id }, stream=True)
|
72 |
+
URL_new = get_url_from_gdrive_confirmation(response.text)
|
73 |
+
|
74 |
+
if URL_new != None:
|
75 |
+
URL = URL_new
|
76 |
+
response = session.get(URL, params={ 'id' : id }, stream=True)
|
77 |
+
|
78 |
+
token = get_confirm_token(response)
|
79 |
+
|
80 |
+
if token:
|
81 |
+
params = { 'id' : id, 'confirm' : token }
|
82 |
+
response = session.get(URL, params=params, stream=True)
|
83 |
+
|
84 |
+
save_response_content(response, destination)
|
85 |
+
|
86 |
+
def get_confirm_token(response):
|
87 |
+
for key, value in response.cookies.items():
|
88 |
+
if key.startswith('download_warning'):
|
89 |
+
return value
|
90 |
+
|
91 |
+
return None
|
92 |
+
|
93 |
+
def save_response_content(response, destination):
|
94 |
+
CHUNK_SIZE = 32768
|
95 |
+
|
96 |
+
with open(destination, "wb") as f:
|
97 |
+
for chunk in tqdm(response.iter_content(CHUNK_SIZE)):
|
98 |
+
if chunk: # filter out keep-alive new chunks
|
99 |
+
f.write(chunk)
|
100 |
+
|
101 |
+
def check_if_exist(model:str = "single_tfidf"):
|
102 |
+
|
103 |
+
if model =="single_vectorizer":
|
104 |
+
model = "single_tfidf"
|
105 |
+
if model =="branch_vectorizer":
|
106 |
+
model = "branch_tfidf"
|
107 |
+
|
108 |
+
project_dir = os.path.dirname(os.path.abspath(__file__))
|
109 |
+
if model != None:
|
110 |
+
if model in ['single_tfidf', 'branch_tfidf' ]:
|
111 |
+
path='models/all_labels_hierarchy/'
|
112 |
+
path_model = os.path.join(project_dir, "..", path, model,'classifiers')
|
113 |
+
path_vectorizer = os.path.join(project_dir, "..", path, model,'vectorizers')
|
114 |
+
if os.path.exists(path_model) and os.path.exists(path_vectorizer):
|
115 |
+
if len(os.listdir(path_model)) >0 and len(os.listdir(path_vectorizer)) >0:
|
116 |
+
return True
|
117 |
+
else:
|
118 |
+
path='models/higher_order_hierarchy/'
|
119 |
+
path_folder = os.path.join(project_dir, "..", path, model)
|
120 |
+
if os.path.exists(path_folder):
|
121 |
+
if len(os.listdir(path_folder + "/" )) >1:
|
122 |
+
return True
|
123 |
+
return False
|
124 |
+
|
125 |
+
def download_model(all_labels='single_tfidf', higher_order='PathologyEmoryPubMedBERT'):
|
126 |
+
project_dir = os.path.dirname(os.path.abspath(__file__))
|
127 |
+
|
128 |
+
path_all_labels='models/all_labels_hierarchy/'
|
129 |
+
path_higher_order='models/higher_order_hierarchy/'
|
130 |
+
|
131 |
+
def extract_model(path_file, name):
|
132 |
+
|
133 |
+
os.makedirs(os.path.join(project_dir, "..", path_file), exist_ok=True)
|
134 |
+
|
135 |
+
file_destination = os.path.join(project_dir, "..", path_file, name + '.zip')
|
136 |
+
|
137 |
+
file_id = MODEL_TO_URL[name].split('id=')[-1]
|
138 |
+
|
139 |
+
logging.info(f'Downloading {name} model (~1000MB zip archive)')
|
140 |
+
download_file_from_google_drive(file_id, file_destination)
|
141 |
+
|
142 |
+
logging.info('Extracting model from archive (~1300MB folder) and saving to ' + str(file_destination))
|
143 |
+
with zipfile.ZipFile(file_destination, 'r') as zip_ref:
|
144 |
+
zip_ref.extractall(path=os.path.dirname(file_destination))
|
145 |
+
|
146 |
+
logging.info('Removing archive')
|
147 |
+
os.remove(file_destination)
|
148 |
+
logging.info('Done.')
|
149 |
+
|
150 |
+
|
151 |
+
if higher_order != None:
|
152 |
+
if not check_if_exist(higher_order):
|
153 |
+
extract_model(path_higher_order, higher_order)
|
154 |
+
else:
|
155 |
+
logging.info('Model ' + str(higher_order) + ' already exists')
|
156 |
+
|
157 |
+
if all_labels!= None:
|
158 |
+
if not check_if_exist(all_labels):
|
159 |
+
extract_model(path_all_labels, all_labels)
|
160 |
+
else:
|
161 |
+
logging.info('Model ' + str(all_labels) + ' already exists')
|
162 |
+
|
163 |
+
|
164 |
+
|
165 |
+
|
166 |
+
def download(all_labels:str = "single_tfidf", higher_order:str = "PathologyEmoryPubMedBERT"):
|
167 |
+
"""
|
168 |
+
Input Options:
|
169 |
+
all_labels : single_tfidf, branch_tfidf
|
170 |
+
higher_order : PathologyEmoryPubMedBERT, PathologyEmoryBERT, ClinicalBERT, BlueBERT, BioBERT, BERT
|
171 |
+
"""
|
172 |
+
all_labels_options = [ "single_tfidf", "branch_tfidf"]
|
173 |
+
higher_order_option = [ "PathologyEmoryPubMedBERT", "PathologyEmoryBERT", "ClinicalBERT", "BlueBERT","BioBERT","BERT" ]
|
174 |
+
|
175 |
+
if all_labels not in all_labels_options or higher_order not in higher_order_option:
|
176 |
+
print("\n\tPlease provide a valid model for downloading")
|
177 |
+
print("\n\t\tall_labels: " + " ".join(x for x in all_labels_options))
|
178 |
+
print("\n\t\thigher_order: " + " ".join(x for x in higher_order))
|
179 |
+
exit()
|
180 |
+
|
181 |
+
download_model(all_labels,higher_order)
|
182 |
+
|
183 |
+
if __name__ == "__main__":
|
184 |
+
fire.Fire(download)
|
185 |
+
|
186 |
+
|
187 |
+
|
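Since `download()` is exposed through `fire.Fire`, the pre-trained models can be fetched from the command line or from Python. The sketch below uses the script's own defaults; model names must match the options listed in `download()`.

```python
# Minimal sketch of fetching the pre-trained models before running the pipeline.
# Equivalent CLI call (via python-fire), assuming it is run from app/src:
#   python3 download_models.py --all_labels single_tfidf --higher_order PathologyEmoryPubMedBERT
from download_models import download, check_if_exist

download(all_labels="single_tfidf", higher_order="PathologyEmoryPubMedBERT")
print(check_if_exist("single_tfidf"), check_if_exist("PathologyEmoryPubMedBERT"))
```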
app/src/label_extraction.py
ADDED
@@ -0,0 +1,150 @@
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from tqdm import tqdm
|
4 |
+
import re
|
5 |
+
import fire
|
6 |
+
import json
|
7 |
+
from tqdm import tqdm
|
8 |
+
import logging
|
9 |
+
from pipeline import Pipeline
|
10 |
+
import copy
|
11 |
+
from download_models import check_if_exist
|
12 |
+
|
13 |
+
"""
|
14 |
+
Install dependencies by running: pip3 install -r requirements.txt
|
15 |
+
|
16 |
+
Running command example:
|
17 |
+
python3 label_extraction.py --path_to_file data.xlsx --column_name report --save_predictions predictions.xlsx --save_json output.json
|
18 |
+
"""
|
19 |
+
|
20 |
+
def data_extraction(path_to_file:str, column_name:str, higher_model:str="clinicalBERT", all_label_model="single_tfidf", save_predictions:str=None, output_model_data=None,save_input=None, save_json:str=None):
|
21 |
+
|
22 |
+
"""
|
23 |
+
This program takes an Excel/CSV sheet and extracts the higher-order category and cancer characteristics from pathology reports
|
24 |
+
|
25 |
+
Input Options:
|
26 |
+
1) path_to_file - Path to an excel/csv with pathology diagnosis: String (Required)
|
27 |
+
2) column_name - Which column has the pathology diagnosis: String (Required)
|
28 |
+
3) higher_model - Which version of higher order model to use: String (Required)
|
29 |
+
4) all_label_model - Which version of all labels model to use: String (Required)
|
30 |
+
5) save_predictions - Path to save output: String (Optional)
|
31 |
+
6) output_model_data - Option to output model data to csv True/False (Optional)
|
32 |
+
7) save_input - Option to output the input fields True/False (Optional)
|
33 |
+
8) save_json - Path to save json analysis: String (Optional)
|
34 |
+
|
35 |
+
|
36 |
+
"""
|
37 |
+
|
38 |
+
data_orig = read_data(path_to_file)
|
39 |
+
data_orig = data_orig.fillna("NA")
|
40 |
+
data = data_orig.loc[:, ~data_orig.columns.str.contains('^Unnamed')][column_name].values
|
41 |
+
|
42 |
+
predictions, json_output, higher_order_pred,all_labels_pred = {},[],[],[]
|
43 |
+
|
44 |
+
if not check_if_exist(higher_model):
|
45 |
+
print("\n\t ##### Please Download Model: " + str(higher_model) + "#####")
|
46 |
+
exit()
|
47 |
+
if not check_if_exist(all_label_model):
|
48 |
+
print("\n\t ##### Please Download Model: " + str(all_label_model) + "#####")
|
49 |
+
exit()
|
50 |
+
|
51 |
+
model = Pipeline(bert_option=higher_model, branch_option=all_label_model)
|
52 |
+
|
53 |
+
logging.info("\nRunning Predictions for data size of: " + str(len(data)))
|
54 |
+
for index in tqdm(range(len(data))):
|
55 |
+
d = data[index]
|
56 |
+
# refactor json
|
57 |
+
preds,all_layer_hidden_states = model.run(d)
|
58 |
+
predictions["sample_" + str(index)] = {}
|
59 |
+
for ind,pred in enumerate(preds):
|
60 |
+
predictions["sample_" + str(index)]["prediction_" + str(ind)] = pred
|
61 |
+
|
62 |
+
for key,sample in predictions.items():
|
63 |
+
higher,all_p = [],[]
|
64 |
+
for key,pred in sample.items():
|
65 |
+
for higher_order, sub_arr in pred.items():
|
66 |
+
higher.append(higher_order)
|
67 |
+
for label,v in sub_arr['labels'].items():
|
68 |
+
all_p.append(label)
|
69 |
+
|
70 |
+
higher_order_pred.append(" && ".join(x for x in higher))
|
71 |
+
all_labels_pred.append(" && ".join(x for x in all_p))
|
72 |
+
|
73 |
+
|
74 |
+
predictions_refact = copy.deepcopy(predictions)
|
75 |
+
transformer_data, discriminator_data= [0 for x in range(len(data))], [0 for x in range(len(data))]
|
76 |
+
|
77 |
+
for index in tqdm(range(len(data))):
|
78 |
+
key = "sample_" + str(index)
|
79 |
+
for k,v in predictions[key].items():
|
80 |
+
for k_s, v_s in v.items():
|
81 |
+
predictions_refact["sample_" + str(index)]["data"] = v_s['data']
|
82 |
+
predictions_refact["sample_" + str(index)]["transformer_data"] = v_s['transformer_data']
|
83 |
+
predictions_refact["sample_" + str(index)]["discriminator_data"] = v_s['word_analysis']['discriminator_data']
|
84 |
+
transformer_data[index] = v_s['transformer_data']
|
85 |
+
discriminator_data[index] = v_s['word_analysis']['discriminator_data']
|
86 |
+
|
87 |
+
del predictions_refact[key][k][k_s]['data']
|
88 |
+
del predictions_refact[key][k][k_s]['transformer_data']
|
89 |
+
del predictions_refact[key][k][k_s]['word_analysis']['discriminator_data']
|
90 |
+
|
91 |
+
json_output = predictions_refact
|
92 |
+
|
93 |
+
|
94 |
+
if save_predictions!= None:
|
95 |
+
logging.info("Saving Predictions")
|
96 |
+
if output_model_data != None:
|
97 |
+
all_preds = pd.DataFrame(list(zip(higher_order_pred, all_labels_pred,transformer_data,discriminator_data,data)), columns =['Higher Order',"All Labels", 'Higher Order Model Data','All Labels Model Data',column_name])
|
98 |
+
else:
|
99 |
+
all_preds = pd.DataFrame(list(zip(higher_order_pred, all_labels_pred)), columns =['Higher Order',"All Labels"])
|
100 |
+
|
101 |
+
if save_input != None:
|
102 |
+
all_preds = pd.concat([data_orig, all_preds], axis=1)
|
103 |
+
try:
|
104 |
+
all_preds.to_excel(save_predictions)
|
105 |
+
except ValueError:
|
106 |
+
try:
|
107 |
+
all_preds.to_csv(save_predictions)
|
108 |
+
except ValueError as e:
|
109 |
+
logging.exception("Error while saving predictions " + str(e))
|
110 |
+
exit()
|
111 |
+
logging.info("Done")
|
112 |
+
|
113 |
+
if save_json!= None:
|
114 |
+
logging.info("Saving Json")
|
115 |
+
try:
|
116 |
+
with open(save_json, 'w') as f:
|
117 |
+
for k, v in json_output.items():
|
118 |
+
f.write(json.dumps({str(k): v}, default=str) + '\n')
|
119 |
+
|
120 |
+
except ValueError as e:
|
121 |
+
logging.exception("Error while saving json analysis " + str(e))
|
122 |
+
exit()
|
123 |
+
logging.info("Done")
|
124 |
+
|
125 |
+
|
126 |
+
def read_data(path_to_file):
|
127 |
+
|
128 |
+
try:
|
129 |
+
df = pd.read_excel(path_to_file)
|
130 |
+
return df
|
131 |
+
except ValueError:
|
132 |
+
try:
|
133 |
+
df = pd.read_csv(path_to_file)
|
134 |
+
return df
|
135 |
+
except ValueError as e:
|
136 |
+
logging.exception("### Error occurred while splitting document. Info: " + str(e))
|
137 |
+
exit()
|
138 |
+
|
139 |
+
|
140 |
+
|
141 |
+
def run():
|
142 |
+
fire.Fire(data_extraction)
|
143 |
+
|
144 |
+
if __name__ == '__main__':
|
145 |
+
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",datefmt="%d/%m/%Y %H:%M:%S",level=logging.INFO)
|
146 |
+
run()
|
147 |
+
|
148 |
+
|
149 |
+
|
150 |
+
|
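A small sketch, assuming the models have already been downloaded, of calling `data_extraction` directly instead of through the Fire CLI shown in the docstring; the file and column names below are placeholders:

```python
from label_extraction import data_extraction

# higher_model / all_label_model must be models that check_if_exist() can find locally
data_extraction(
    path_to_file="reports.xlsx",        # excel/csv with one report per row (placeholder name)
    column_name="report",               # column holding the biopsy diagnosis
    higher_model="PathologyEmoryPubMedBERT",
    all_label_model="single_tfidf",
    save_predictions="predictions.xlsx",
    save_json="output.json",
)
```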
app/src/pipeline.py
ADDED
@@ -0,0 +1,670 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
|
4 |
+
import text_cleaning_transforerms as tc
|
5 |
+
import text_cleaning
|
6 |
+
|
7 |
+
import logging
|
8 |
+
import torch
|
9 |
+
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
import numpy as np
|
12 |
+
import pandas as pd
|
13 |
+
import itertools
|
14 |
+
import json
|
15 |
+
import joblib
|
16 |
+
from gensim.models import phrases
|
17 |
+
|
18 |
+
import math
|
19 |
+
|
20 |
+
import xgboost
|
21 |
+
import re
|
22 |
+
import nltk
|
23 |
+
nltk.download('stopwords')
|
24 |
+
nltk.download('wordnet')
|
25 |
+
import html
|
26 |
+
|
27 |
+
from config import config_file
|
28 |
+
|
29 |
+
|
30 |
+
from lime import lime_text
|
31 |
+
from lime.lime_text import LimeTextExplainer
|
32 |
+
|
33 |
+
|
34 |
+
from transformers import AutoModelForSequenceClassification,AutoTokenizer
|
35 |
+
|
36 |
+
from nltk.tokenize import word_tokenize
|
37 |
+
|
38 |
+
|
39 |
+
"""
|
40 |
+
Cancer Severity Class.
|
41 |
+
|
42 |
+
export env_name="path"
|
43 |
+
"""
|
44 |
+
class BERT_Model(object):
|
45 |
+
def __init__(self, config,bert_option:str="clinicalBERT"):
|
46 |
+
|
47 |
+
try:
|
48 |
+
self.config = config
|
49 |
+
self.project_dir = os.path.dirname(os.path.abspath(__file__))
|
50 |
+
self.bert_option = bert_option
|
51 |
+
# check if a path was already added to the OS environment table
|
52 |
+
|
53 |
+
if "model_folder" in os.environ:
|
54 |
+
self.config['model_folder'] = os.environ['model_folder']
|
55 |
+
else:
|
56 |
+
self.config['model_folder'] = os.path.join(self.project_dir, self.config['model_option'][self.bert_option]['model_folder'])
|
57 |
+
|
58 |
+
self.initialize()
|
59 |
+
except Exception as e:
|
60 |
+
logging.exception("Error occurred while Initializing BERT Model, please double check you have a config file " +" Info: " + str(e))
|
61 |
+
exit()
|
62 |
+
|
63 |
+
def initialize(self):
|
64 |
+
# Set up logging
|
65 |
+
logging.basicConfig(
|
66 |
+
format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",
|
67 |
+
datefmt="%d/%m/%Y %H:%M:%S",
|
68 |
+
level=logging.INFO)
|
69 |
+
|
70 |
+
# Check for GPUs
|
71 |
+
if torch.cuda.is_available():
|
72 |
+
self.config["use_cuda"] = True
|
73 |
+
self.config["cuda_device"] = torch.cuda.current_device()
|
74 |
+
logging.info("Using GPU (`%s`)", torch.cuda.get_device_name())
|
75 |
+
else:
|
76 |
+
self.config["use_cuda"] = False
|
77 |
+
self.config["cuda_device"] = "cpu"
|
78 |
+
logging.info("Using CPU")
|
79 |
+
|
80 |
+
|
81 |
+
self.model = AutoModelForSequenceClassification.from_pretrained(self.config["model_folder"], num_labels=len(self.config['classes']),output_hidden_states=True).to(self.config["cuda_device"])
|
82 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.config["model_folder"])
|
83 |
+
|
84 |
+
|
85 |
+
def clean_data(self,text:str):
|
86 |
+
return tc.pre_process(text,max_size=int(self.config["max_seq_length"]),remove_punctuation=True )
|
87 |
+
|
88 |
+
def sigmoid(self,x):
|
89 |
+
return 1 / (1 + math.exp(-x))
|
90 |
+
|
91 |
+
"""
|
92 |
+
Convert output of multi-class to probabilities between 0-1
|
93 |
+
"""
|
94 |
+
def raw_to_probs(self,vector):
|
95 |
+
return [self.sigmoid(x) for x in vector]
|
96 |
+
|
97 |
+
|
98 |
+
"""
|
99 |
+
Given a threshold, convert a vector of probabilities into predictions (0 or 1)
|
100 |
+
"""
|
101 |
+
def _threshold(self, vector:list, threshold:float=0.5) -> list:
|
102 |
+
logit_vector = [1 if x >=threshold else 0 for x in vector]
|
103 |
+
return logit_vector
|
104 |
+
|
105 |
+
"""
|
106 |
+
Pre-Process the data according to the same strategy used during training
|
107 |
+
"""
|
108 |
+
def pre_process(self,texts:list)-> list:
|
109 |
+
transformer_clean_data,transformer_clean_data_chunks = [],[]
|
110 |
+
for index,t in enumerate(texts):
|
111 |
+
clean_data, clean_data_chunks = self.clean_data(t)
|
112 |
+
transformer_clean_data.append(clean_data)
|
113 |
+
transformer_clean_data_chunks.append(clean_data_chunks)
|
114 |
+
|
115 |
+
return transformer_clean_data,transformer_clean_data_chunks
|
116 |
+
|
117 |
+
|
118 |
+
"""
|
119 |
+
Given a list of texts, return the sentence embedding (CLS token from the last BERT layer)
|
120 |
+
"""
|
121 |
+
def get_embeddings(self,texts:list)-> list:
|
122 |
+
|
123 |
+
transformer_clean_data,_ = self.pre_process(texts)
|
124 |
+
|
125 |
+
inputs = self.tokenizer(transformer_clean_data, return_tensors="pt", padding=True).to(self.config["cuda_device"])
|
126 |
+
outputs = self.model(**inputs,output_hidden_states=True)
|
127 |
+
last_hidden_states = outputs[1][-1].detach().cpu().numpy()
|
128 |
+
embeddings_output = np.asarray(last_hidden_states[:, 0])
|
129 |
+
|
130 |
+
return embeddings_output
|
131 |
+
|
132 |
+
"""
|
133 |
+
Given a list of texts, run BERT prediction for each sample
|
134 |
+
If use_chunks is set to True (default), the data is split into chunks of max_size (set in config.py)
|
135 |
+
The final prediction for each sample is the merge of the predictions from every chunk
|
136 |
+
|
137 |
+
Returns:
|
138 |
+
* Predictions
|
139 |
+
* Probabilities
|
140 |
+
* Sentence Embedding (CLS token from last BERT layer)
|
141 |
+
* Pre-Processed data used for Prediction
|
142 |
+
"""
|
143 |
+
def predict(self,texts:list, use_chunks=True)-> list:
|
144 |
+
|
145 |
+
transformer_clean_data,transformer_clean_data_chunks = self.pre_process(texts)
|
146 |
+
# print(transformer_clean_data, transformer_clean_data_chunks)  # debug output, disabled
|
147 |
+
ids_chunks = []
|
148 |
+
# Flatten all chunks (2D list) into a 1D list (each chunk is fed separately to prediction)
|
149 |
+
if use_chunks:
|
150 |
+
|
151 |
+
flatten_chunks = [j for sub in transformer_clean_data_chunks for j in sub]
|
152 |
+
ids = [[x]*len(transformer_clean_data_chunks[x]) for x in range(len(transformer_clean_data_chunks))]
|
153 |
+
ids_chunks = [j for sub in ids for j in sub]
|
154 |
+
data = flatten_chunks.copy()
|
155 |
+
else:
|
156 |
+
data = transformer_clean_data.copy()
|
157 |
+
|
158 |
+
# print(data)  # debug output, disabled
|
159 |
+
inputs = self.tokenizer(data, return_tensors="pt", padding=True).to(self.config["cuda_device"])
|
160 |
+
outputs = self.model(**inputs,output_hidden_states=True)
|
161 |
+
|
162 |
+
# Post-process output if using chunks --> merge chunk predictions into one
|
163 |
+
if use_chunks:
|
164 |
+
raw_probs_chunks = outputs[0].detach().cpu().numpy()
|
165 |
+
probs_chunks = [self.raw_to_probs(x) for x in raw_probs_chunks]
|
166 |
+
probs = np.asarray([[0 for x in range(len(probs_chunks[0]))] for x in range(len(texts))],dtype=float)
|
167 |
+
for index, prob in enumerate(probs_chunks):
|
168 |
+
id_ = ids_chunks[index]
|
169 |
+
|
170 |
+
# if no predictions for such index yet, add (this is the base - avoid zero preds)
|
171 |
+
if np.sum(probs[id_])<=0:
|
172 |
+
probs[id_] = prob
|
173 |
+
else: # update to merge predictions
|
174 |
+
pred = np.asarray(self._threshold(vector=prob,threshold=self.config["threshold_prediction"]))
|
175 |
+
pos_pred_index = np.where(pred>0)[0]
|
176 |
+
if len(pos_pred_index)>0:
|
177 |
+
for pos in pos_pred_index:
|
178 |
+
probs[id_][pos] = prob[pos]
|
179 |
+
|
180 |
+
else:
|
181 |
+
raw_probs = outputs[0].detach().cpu().numpy()
|
182 |
+
probs = [self.raw_to_probs(x) for x in raw_probs]
|
183 |
+
|
184 |
+
predictions = [self._threshold(vector=pred,threshold=self.config["threshold_prediction"]) for pred in probs]
|
185 |
+
|
186 |
+
|
187 |
+
|
188 |
+
last_hidden_states = outputs[1][-1].detach().cpu().numpy()
|
189 |
+
embeddings_output = np.asarray(last_hidden_states[:, 0])
|
190 |
+
|
191 |
+
return predictions, probs, embeddings_output, transformer_clean_data
|
192 |
+
|
193 |
+
|
194 |
+
|
195 |
+
"""
|
196 |
+
Given a list of texts, execute the higher-order (branch) prediction
|
197 |
+
This function calls BERT predict, post-processes the predictions, and returns the final branch prediction
|
198 |
+
Returns:
|
199 |
+
* Branch Prediction
|
200 |
+
* Sentence Embedding (CLS token from last BERT layer)
|
201 |
+
"""
|
202 |
+
def branch_prediction(self,texts:list)-> list:
|
203 |
+
out_pred = []
|
204 |
+
|
205 |
+
predictions, probs, embeddings_output, transformer_clean_data = self.predict(texts,use_chunks=True)
|
206 |
+
|
207 |
+
try:
|
208 |
+
for index, preds in enumerate(probs):
|
209 |
+
preds = np.asarray(preds)
|
210 |
+
pos = np.where(preds > 0.5)[0]
|
211 |
+
pred = []
|
212 |
+
if len(pos) >0:
|
213 |
+
for ind in pos:
|
214 |
+
pred.append({self.config['classes'][ind]: {"probability":preds[ind], "data":texts[index], "transformer_data": transformer_clean_data[index] }})
|
215 |
+
else:
|
216 |
+
pred.append({"No Prediction": {"probability":0, "data":texts[index], "transformer_data": transformer_clean_data[index]}})
|
217 |
+
|
218 |
+
out_pred.append(pred)
|
219 |
+
except Exception as e:
|
220 |
+
logging.exception("Error occurred on BERT model prediction" +" Info: " + str(e))
|
221 |
+
exit()
|
222 |
+
|
223 |
+
return out_pred,embeddings_output
|
224 |
+
|
225 |
+
|
226 |
+
"""
|
227 |
+
Cancer Diagnosis Prediction Class.
|
228 |
+
This class is used to load each individual branch classifier
|
229 |
+
"""
|
230 |
+
class Branch_Classifier(object):
|
231 |
+
def __init__(self, config, branch_option:str="single_tfidf"):
|
232 |
+
self.config = config
|
233 |
+
self.branch_option = branch_option
|
234 |
+
self.project_dir = os.path.dirname(os.path.abspath(__file__))
|
235 |
+
|
236 |
+
try:
|
237 |
+
if "path_model" in os.environ:
|
238 |
+
self.config['path_model'] = os.environ['path_model']
|
239 |
+
else:
|
240 |
+
self.config['path_model'] = os.path.join(self.project_dir, self.config['model_option'][self.branch_option]['path_model'])
|
241 |
+
|
242 |
+
if "path_vectorizer" in os.environ:
|
243 |
+
self.config['path_vectorizer'] = os.environ['path_vectorizer']
|
244 |
+
else:
|
245 |
+
self.config['path_vectorizer'] = os.path.join(self.project_dir, self.config['model_option'][self.branch_option]['path_vectorizer'])
|
246 |
+
|
247 |
+
if "path_bigrmas" in os.environ:
|
248 |
+
self.config['path_bigrmas'] = os.environ['path_bigrmas']
|
249 |
+
else:
|
250 |
+
self.config['path_bigrmas'] = os.path.join(self.project_dir, self.config['model_option'][self.branch_option]['path_bigrmas'])
|
251 |
+
|
252 |
+
if "path_phrase_bigrams" in os.environ:
|
253 |
+
self.config['path_phrase_bigrams'] = os.environ['path_phrase_bigrams']
|
254 |
+
else:
|
255 |
+
self.config['path_phrase_bigrams'] = os.path.join(self.project_dir, self.config['model_option'][self.branch_option]['path_phrase_bigrams'])
|
256 |
+
|
257 |
+
except Exception as e:
|
258 |
+
logging.exception("Error occurred while reading config file. Please read config instructions" +" Info: " + str(e))
|
259 |
+
exit()
|
260 |
+
|
261 |
+
self.initialize()
|
262 |
+
|
263 |
+
|
264 |
+
def initialize(self):
|
265 |
+
|
266 |
+
try:
|
267 |
+
self.model = joblib.load(os.path.join(self.config['path_model'],self.config['model_option'][self.branch_option]['model']))
|
268 |
+
self.vectorizer = joblib.load(os.path.join(self.config['path_vectorizer'],self.config['model_option'][self.branch_option]['vectorizer']))
|
269 |
+
self.good_bigrams = pd.read_csv(os.path.join(self.config["path_bigrmas"],self.config['model_option'][self.branch_option]['bigrams']))['bigram'].to_list()
|
270 |
+
self.phrase_bigrams = phrases.Phrases.load(os.path.join(self.config["path_phrase_bigrams"],self.config['model_option'][self.branch_option]['phrase_bigrams']))
|
271 |
+
|
272 |
+
except Exception as e:
|
273 |
+
logging.exception("Error occurred while initializing models and vectorizer" +" Info: " + str(e))
|
274 |
+
exit()
|
275 |
+
|
276 |
+
"""
|
277 |
+
Only add specific Bi-grams (Pre-calculated during Training)
|
278 |
+
"""
|
279 |
+
def clean_bigram(self,data:list)-> list:
|
280 |
+
|
281 |
+
data_clean = []
|
282 |
+
|
283 |
+
for word in data:
|
284 |
+
if re.search("_",word) == None:
|
285 |
+
data_clean.append(word)
|
286 |
+
else: # gotta add the word without _ as well
|
287 |
+
if word in self.good_bigrams:
|
288 |
+
data_clean.append(word)
|
289 |
+
else:
|
290 |
+
data_clean.append(word.split("_")[0])
|
291 |
+
data_clean.append(word.split("_")[1])
|
292 |
+
|
293 |
+
return np.asarray(data_clean)
|
294 |
+
|
295 |
+
"""
|
296 |
+
Given a list of texts, pre-process and format the data
|
297 |
+
"""
|
298 |
+
def format_data(self,data:list)-> list:
|
299 |
+
try:
|
300 |
+
X = text_cleaning.text_cleaning(data, steam=False, lemma=True,single_input=True)[0]
|
301 |
+
|
302 |
+
### Add Bigrams and keep only the good ones(pre-selected)
|
303 |
+
X_bigrmas = self.phrase_bigrams[X]
|
304 |
+
data_clean = self.clean_bigram(X_bigrmas)
|
305 |
+
X_bigrams_clean = ' '.join(map(str, data_clean))
|
306 |
+
pre_processed = self.vectorizer.transform([X_bigrams_clean]).toarray(),X_bigrams_clean
|
307 |
+
|
308 |
+
except Exception as e:
|
309 |
+
logging.exception("Error occurred while formatting and cleaning data" +" Info: " + str(e))
|
310 |
+
exit()
|
311 |
+
|
312 |
+
return pre_processed
|
313 |
+
|
314 |
+
|
315 |
+
def html_escape(self,text):
|
316 |
+
return html.escape(text)
|
317 |
+
|
318 |
+
def predict(self, texts:list)-> list:
|
319 |
+
"""
|
320 |
+
Steps:
|
321 |
+
1) Run the predictions from higher-order
|
322 |
+
2) Based on the prediction, activate which branch(es) to send for final prediction (cancer characteristics)
|
323 |
+
3) For final prediction, create a word importance HTML for each input
|
324 |
+
"""
|
325 |
+
out_pred = {'predictions': {}, 'word_analysis':{},}
|
326 |
+
|
327 |
+
color = "234, 131, 4" # orange
|
328 |
+
try:
|
329 |
+
for t in texts:
|
330 |
+
text_tfidf,clean_data = self.format_data(t)
|
331 |
+
probs = self.model.predict_proba(text_tfidf).toarray()
|
332 |
+
predictions = self.model.predict(text_tfidf).toarray()
|
333 |
+
for index,preds in enumerate(predictions):
|
334 |
+
pos = np.where(preds > 0.5)[0]
|
335 |
+
pred = []
|
336 |
+
if len(pos) >0:
|
337 |
+
for ind in pos:
|
338 |
+
highlighted_html_text = []
|
339 |
+
weigts = self.model.classifiers_[ind].feature_importances_
|
340 |
+
word_weights = {}
|
341 |
+
words = clean_data.split()
|
342 |
+
min_new = 0
|
343 |
+
max_new = 100
|
344 |
+
min_old = np.min(weigts)
|
345 |
+
max_old = np.max(weigts)
|
346 |
+
for w in words:
|
347 |
+
found = False
|
348 |
+
for word, key in self.vectorizer.vocabulary_.items():
|
349 |
+
if w == word:
|
350 |
+
found = True
|
351 |
+
# rescale weights
|
352 |
+
weight = ( (max_new - min_new) / (max_old - min_old) * (weigts[key] - max_old) + max_new)
|
353 |
+
if weight <0.5:
|
354 |
+
weight = 0
|
355 |
+
|
356 |
+
|
357 |
+
if "_" in w: # add for each word
|
358 |
+
w1,w2 = w.split("_")
|
359 |
+
word_weights[w1] = weight
|
360 |
+
word_weights[w2] = weight
|
361 |
+
if w2 =="one":
|
362 |
+
word_weights["1"] = weight
|
363 |
+
word_weights["i"] = weight
|
364 |
+
if w2 =="two":
|
365 |
+
word_weights["2"] = weight
|
366 |
+
word_weights["ii"] = weight
|
367 |
+
if w2 =="three":
|
368 |
+
word_weights["3"] = weight
|
369 |
+
word_weights["iii"] = weight
|
370 |
+
else:
|
371 |
+
word_weights[w] = weight
|
372 |
+
if found == False: # some words aren't present in the model vocabulary
|
373 |
+
word_weights[w] = 0
|
374 |
+
|
375 |
+
words = word_tokenize(t.lower().replace("-", " - ").replace("_", " ").replace(".", " . ").replace(",", " , ").replace("(", " ( ").replace(")", " ) "))
|
376 |
+
for i,w in enumerate(words):
|
377 |
+
if w not in word_weights or w=='-' or w==',' or w=='.' or w=="(" or w==")":
|
378 |
+
word_weights[w] = 0
|
379 |
+
highlighted_html_text.append(w)
|
380 |
+
else:
|
381 |
+
weight = 0 if word_weights[w] <1 else word_weights[w]
|
382 |
+
highlighted_html_text.append('<span font-size:40px; ; style="background-color:rgba(' + color + ',' + str(weight) + ');">' + self.html_escape(w) + '</span>')
|
383 |
+
|
384 |
+
|
385 |
+
|
386 |
+
highlighted_html_text = ' '.join(highlighted_html_text)
|
387 |
+
#pred.append({ "predictions": {self.config['classes'][ind]: {"probability":probs[index][ind]}},"word_analysis": {"discriminator_data": clean_data,"word_importance": word_weights, "highlighted_html_text":highlighted_html_text}})
|
388 |
+
out_pred["predictions"][self.config['classes'][ind]] = {"probability":probs[index][ind]}
|
389 |
+
out_pred["word_analysis"] = {"discriminator_data": clean_data,"word_importance": word_weights, "highlighted_html_text":highlighted_html_text}
|
390 |
+
|
391 |
+
else:
|
392 |
+
out_pred["predictions"] = {"Unkown": {"probability":0.5}}
|
393 |
+
out_pred["word_analysis"] = {"discriminator_data": clean_data,"word_importance": {x:0 for x in t.split()}, "highlighted_html_text": " ".join(x for x in t.split())}
|
394 |
+
|
395 |
+
#pred.append({"predictions": {"Unkown": {"probability":0.5}}, "word_analysis": {"discriminator_data": clean_data,"word_importance": {x:0 for x in t.split()}, "highlighted_html_text": " ".join(x for x in t.split())}})
|
396 |
+
|
397 |
+
#out_pred.append(pred)
|
398 |
+
|
399 |
+
except Exception as e:
|
400 |
+
logging.exception("Error occurred on model prediction" +" Info: " + str(e))
|
401 |
+
exit()
|
402 |
+
|
403 |
+
return out_pred
|
404 |
+
|
405 |
+
|
406 |
+
class LIME_Interpretability(object):
|
407 |
+
|
408 |
+
"""
|
409 |
+
Class for LIME Analysis
|
410 |
+
|
411 |
+
"""
|
412 |
+
|
413 |
+
def __init__(self, label_colors = { "positive": "234, 131, 4", # orange
|
414 |
+
"negative":'65, 137, 225', # blue
|
415 |
+
}):
|
416 |
+
|
417 |
+
self.color_classes = label_colors
|
418 |
+
|
419 |
+
# function to normalize, if applicable
|
420 |
+
def __normalize_MinMax(self,arr, t_min=0, t_max=1):
|
421 |
+
norm_arr = []
|
422 |
+
diff = t_max - t_min
|
423 |
+
diff_arr = max(arr) - min(arr)
|
424 |
+
for i in arr:
|
425 |
+
temp = (((i - min(arr)) * diff) / diff_arr) + t_min
|
426 |
+
norm_arr.append(temp)
|
427 |
+
return norm_arr
|
428 |
+
|
429 |
+
|
430 |
+
def __html_escape(self,text):
|
431 |
+
return html.escape(text)
|
432 |
+
|
433 |
+
|
434 |
+
def __add_bigrams(self,txt):
|
435 |
+
fixed_bigrams = [ [' gradeone ', 'grade 1', 'grade i', 'grade I', 'grade one',],
|
436 |
+
[' gradetwo ', 'grade 2', 'grade ii', 'grade II', 'grade two', ],
|
437 |
+
[' gradethree ', 'grade 3' , 'grade iii', 'grade III', 'grade three']]
|
438 |
+
for b in fixed_bigrams:
|
439 |
+
sub = ""
|
440 |
+
not_first = False
|
441 |
+
for x in b[1:]:
|
442 |
+
if not_first:
|
443 |
+
sub += "|"
|
444 |
+
not_first = True
|
445 |
+
|
446 |
+
sub += str(x) + "|" + str(x) + " " + "|" + " " + str(x) + "|" + " " + str(x)
|
447 |
+
txt = re.sub(sub, b[0], txt)
|
448 |
+
# Removing multiple spaces
|
449 |
+
txt = re.sub(r'\s+', ' ', txt)
|
450 |
+
txt = re.sub(' +', ' ', txt)
|
451 |
+
return txt
|
452 |
+
|
453 |
+
def __highlight_full_data(self,lime_weights, data, exp_labels,class_names):
|
454 |
+
words_p = [x[0] for x in lime_weights if x[1]>0]
|
455 |
+
weights_p = np.asarray([x[1] for x in lime_weights if x[1] >0])
|
456 |
+
if len(weights_p) >1:
|
457 |
+
weights_p = self.__normalize_MinMax(weights_p, t_min=min(weights_p), t_max=1)
|
458 |
+
else:
|
459 |
+
weights_p = [1]
|
460 |
+
words_n = [x[0] for x in lime_weights if x[1]<0]
|
461 |
+
weights_n = np.asarray([x[1] for x in lime_weights if x[1] <0])
|
462 |
+
# weights_n = self.__normalize_MinMax(weights_n, t_min=max(weights_p), t_max=-0.8)
|
463 |
+
|
464 |
+
labels = exp_labels
|
465 |
+
pred = class_names[labels[0]]
|
466 |
+
corr_pred = class_names[labels[1]] # negative lime weights
|
467 |
+
|
468 |
+
# positive values
|
469 |
+
df_coeff = pd.DataFrame(
|
470 |
+
{'word': words_p,
|
471 |
+
'num_code': weights_p
|
472 |
+
})
|
473 |
+
word_to_coeff_mapping_p = {}
|
474 |
+
for row in df_coeff.iterrows():
|
475 |
+
row = row[1]
|
476 |
+
word_to_coeff_mapping_p[row[0]] = row[1]
|
477 |
+
|
478 |
+
# negative values
|
479 |
+
df_coeff = pd.DataFrame(
|
480 |
+
{'word': words_n,
|
481 |
+
'num_code': weights_n
|
482 |
+
})
|
483 |
+
|
484 |
+
word_to_coeff_mapping_n = {}
|
485 |
+
for row in df_coeff.iterrows():
|
486 |
+
row = row[1]
|
487 |
+
word_to_coeff_mapping_n[row[0]] = row[1]
|
488 |
+
|
489 |
+
max_alpha = 1
|
490 |
+
highlighted_text = []
|
491 |
+
data = re.sub("-"," ", data)
|
492 |
+
data = re.sub("/","", data)
|
493 |
+
for word in word_tokenize(self.__add_bigrams(data)):
|
494 |
+
if word.lower() in word_to_coeff_mapping_p or word.lower() in word_to_coeff_mapping_n:
|
495 |
+
if word.lower() in word_to_coeff_mapping_p:
|
496 |
+
weight = word_to_coeff_mapping_p[word.lower()]
|
497 |
+
else:
|
498 |
+
weight = word_to_coeff_mapping_n[word.lower()]
|
499 |
+
|
500 |
+
if weight >0:
|
501 |
+
color = self.color_classes["positive"]
|
502 |
+
else:
|
503 |
+
color = self.color_classes["negative"]
|
504 |
+
weight *= -1
|
505 |
+
weight *=10
|
506 |
+
|
507 |
+
highlighted_text.append('<span font-size:40px; ; style="background-color:rgba(' + color + ',' + str(weight) + ');">' + self.__html_escape(word) + '</span>')
|
508 |
+
|
509 |
+
else:
|
510 |
+
highlighted_text.append(word)
|
511 |
+
|
512 |
+
highlighted_text = ' '.join(highlighted_text)
|
513 |
+
|
514 |
+
return highlighted_text
|
515 |
+
|
516 |
+
|
517 |
+
def lime_analysis(self,model,data_original, data_clean, num_features=30, num_samples=50, top_labels=2,
|
518 |
+
class_names=['ibc', 'nbc', 'isc', 'bll', 'hrl', 'benign', 'negative']):
|
519 |
+
|
520 |
+
# LIME Predictor Function
|
521 |
+
def predict(texts):
|
522 |
+
results = []
|
523 |
+
for text in texts:
|
524 |
+
predictions, probs, embeddings_output, transformer_clean_data = model.predict([text],use_chunks=False)
|
525 |
+
results.append(probs[0])
|
526 |
+
|
527 |
+
return np.array(results)
|
528 |
+
|
529 |
+
explainer = LimeTextExplainer(class_names=class_names)
|
530 |
+
exp = explainer.explain_instance(data_clean, predict, num_features=num_features,
|
531 |
+
num_samples=num_samples, top_labels=top_labels)
|
532 |
+
l = exp.available_labels()
|
533 |
+
run_info = exp.as_list(l[0])
|
534 |
+
return self.__highlight_full_data(run_info, data_original, l,class_names)
|
535 |
+
|
536 |
+
|
537 |
+
"""
|
538 |
+
The pipeline is responsible for consolidating the output of all models (higher-order and all-labels hierarchy)
|
539 |
+
It takes a string as input and returns a JSON with higher-order (severity) and all-labels (diagnosis) predictions and their probability scores
|
540 |
+
"""
|
541 |
+
class Pipeline(object):
|
542 |
+
|
543 |
+
def __init__(self, bert_option:str="clinicalBERT", branch_option:str="single_tfidf"):
|
544 |
+
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",datefmt="%d/%m/%Y %H:%M:%S",level=logging.INFO)
|
545 |
+
|
546 |
+
if branch_option =="single_vectorizer":
|
547 |
+
self.branch_option = "single_tfidf"
|
548 |
+
elif branch_option =="branch_vectorizer":
|
549 |
+
self.branch_option = "branch_tfidf"
|
550 |
+
else:
|
551 |
+
self.branch_option=branch_option
|
552 |
+
|
553 |
+
self.bert_option=bert_option
|
554 |
+
|
555 |
+
try:
|
556 |
+
self.config = config_file()
|
557 |
+
self.BERT_config = self.config['BERT_config']
|
558 |
+
self.ibc_config = self.config['ibc_config']
|
559 |
+
self.isc_config = self.config['isc_config']
|
560 |
+
self.hrl_config = self.config['hrl_config']
|
561 |
+
self.bll_config = self.config['bll_config']
|
562 |
+
self.benign_config = self.config['benign_config']
|
563 |
+
self.nbc_config = self.config['nbc_config']
|
564 |
+
|
565 |
+
except Exception as e:
|
566 |
+
logging.exception("Error occurred while initializing models and vectorizer" +" Info: " + str(e))
|
567 |
+
exit()
|
568 |
+
|
569 |
+
self.lime_interpretability = LIME_Interpretability()
|
570 |
+
|
571 |
+
self.initialize()
|
572 |
+
|
573 |
+
|
574 |
+
def initialize(self):
|
575 |
+
try:
|
576 |
+
self.bert_model = BERT_Model(self.BERT_config, self.bert_option)
|
577 |
+
try:
|
578 |
+
self.ibc_branch = Branch_Classifier(self.ibc_config,branch_option=self.branch_option)
|
579 |
+
except Exception as e:
|
580 |
+
logging.exception("Error occurred while Initializing IBC branch Model, please double check you have a config file " +" Info: " + str(e))
|
581 |
+
exit()
|
582 |
+
|
583 |
+
try:
|
584 |
+
self.isc_branch = Branch_Classifier(self.isc_config,branch_option=self.branch_option)
|
585 |
+
except Exception as e:
|
586 |
+
logging.exception("Error occurred while Initializing isc branch Model, please double check you have a config file " +" Info: " + str(e))
|
587 |
+
exit()
|
588 |
+
|
589 |
+
try:
|
590 |
+
self.hrl_branch = Branch_Classifier(self.hrl_config,branch_option=self.branch_option)
|
591 |
+
except Exception as e:
|
592 |
+
logging.exception("Error occurred while Initializing hrl branch Model, please double check you have a config file " +" Info: " + str(e))
|
593 |
+
exit()
|
594 |
+
|
595 |
+
try:
|
596 |
+
self.bll_branch = Branch_Classifier(self.bll_config,branch_option=self.branch_option)
|
597 |
+
except Exception as e:
|
598 |
+
logging.exception("Error occurred while Initializing bll branch Model, please double check you have a config file " +" Info: " + str(e))
|
599 |
+
exit()
|
600 |
+
|
601 |
+
try:
|
602 |
+
self.benign_branch = Branch_Classifier(self.benign_config,branch_option=self.branch_option)
|
603 |
+
except Exception as e:
|
604 |
+
logging.exception("Error occurred while Initializing benign branch Model, please double check you have a config file " +" Info: " + str(e))
|
605 |
+
exit()
|
606 |
+
|
607 |
+
try:
|
608 |
+
self.nbc_branch = Branch_Classifier(self.nbc_config,branch_option=self.branch_option)
|
609 |
+
except Exception as e:
|
610 |
+
logging.exception("Error occurred while Initializing nbc branch Model, please double check you have a config file " +" Info: " + str(e))
|
611 |
+
exit()
|
612 |
+
|
613 |
+
self.all_label_models = [self.ibc_branch,self.nbc_branch,self.isc_branch,self.bll_branch,self.hrl_branch,self.benign_branch]
|
614 |
+
|
615 |
+
|
616 |
+
except Exception as e:
|
617 |
+
logging.exception("Error occurred while Initializing Pipeline, please double check you have a config file " +" Info: " + str(e))
|
618 |
+
exit()
|
619 |
+
|
620 |
+
|
621 |
+
"""
|
622 |
+
Run the entire pipeline
|
623 |
+
Steps:
|
624 |
+
1) First, we run the Severity Prediction (BERT)
|
625 |
+
2) Given each prediction for each sample, we then:
|
626 |
+
2.1) Run the corresponding Diagnose Branch Prediction
|
627 |
+
2.2) Merge every branch prediction
|
628 |
+
3) Merge Every Severity and Branch Prediction
|
629 |
+
|
630 |
+
Inputs:
|
631 |
+
* Text
|
632 |
+
|
633 |
+
Output:
|
634 |
+
* Predictions (predictions + probabilities)
|
635 |
+
* Sentence Embedding
|
636 |
+
"""
|
637 |
+
def run(self,input_text:str):
|
638 |
+
|
639 |
+
"""
|
640 |
+
First, get the severity prediction (higher order branch)
|
641 |
+
"""
|
642 |
+
predictions,embeddings_output = self.bert_model.branch_prediction([input_text])
|
643 |
+
predictions = predictions[0]
|
644 |
+
for pred in predictions:
|
645 |
+
for higher_order, sub_arr in pred.items():
|
646 |
+
# Check which branch it belongs to
|
647 |
+
if higher_order in ["Negative","No Prediction"]:
|
648 |
+
pred[higher_order]['labels'] = {higher_order: {"probability":sub_arr['probability']}}
|
649 |
+
pred[higher_order]["word_analysis"] = {"discriminator_data": "Not Used", "word_importance": {x:0 for x in input_text.split()}, "highlighted_html_text": " ".join(x for x in input_text.split())}
|
650 |
+
|
651 |
+
# For each Severity, run the corresponding Branch Prediction
|
652 |
+
else:
|
653 |
+
model = self.all_label_models[self.bert_model.config['classes'].index(higher_order)]
|
654 |
+
out_pred = model.predict([input_text])
|
655 |
+
|
656 |
+
pred[higher_order]['labels'] = out_pred['predictions']
|
657 |
+
pred[higher_order]['word_analysis'] = out_pred['word_analysis']
|
658 |
+
|
659 |
+
return predictions,embeddings_output
|
660 |
+
|
661 |
+
def bert_interpretability(self, input_text:str):
|
662 |
+
return self.lime_interpretability.lime_analysis(self.bert_model,input_text, self.bert_model.clean_data(input_text), class_names=self.bert_model.config['classes'])
|
663 |
+
|
664 |
+
|
665 |
+
if __name__ == '__main__':
|
666 |
+
exit()
|
667 |
+
|
668 |
+
|
669 |
+
|
670 |
+
|
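To make the flow above concrete, a minimal usage sketch, assuming the higher-order and all-labels models are available under `app/models/`; the input report is a made-up placeholder:

```python
from pipeline import Pipeline

pipe = Pipeline(bert_option="PathologyEmoryPubMedBERT", branch_option="single_tfidf")

# run() returns one dict per positive higher-order class plus the CLS sentence embedding
predictions, embedding = pipe.run("invasive ductal carcinoma, grade 2, left breast")
for pred in predictions:
    for higher_order, details in pred.items():
        print(higher_order, details["probability"], details["labels"])

# LIME-based highlighting of the higher-order prediction (returns an HTML string)
html_highlight = pipe.bert_interpretability("invasive ductal carcinoma, grade 2, left breast")
```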
app/src/text_cleaning.py
ADDED
@@ -0,0 +1,250 @@
1 |
+
from gensim.parsing import preprocessing
|
2 |
+
from gensim.parsing.preprocessing import strip_tags, strip_punctuation,strip_numeric,remove_stopwords
|
3 |
+
import re
|
4 |
+
from nltk.stem import PorterStemmer
|
5 |
+
import nltk
|
6 |
+
from nltk.corpus import stopwords
|
7 |
+
import pandas as pd
|
8 |
+
|
9 |
+
def remove_noise_text(txt):
|
10 |
+
|
11 |
+
txt = txt.lower()
|
12 |
+
txt = re.sub('right|left', '', txt) # remove right/left laterality mentions
|
13 |
+
txt = re.sub("primary site:", '', txt)
|
14 |
+
|
15 |
+
#txt = re.sub('post-surgical changes', ' ', txt.lower())
|
16 |
+
|
17 |
+
# Remove any mention of "Findings were discussed with ..."
|
18 |
+
txt = txt.split("findings were discussed with")[0]
|
19 |
+
|
20 |
+
# Remove any other occurrence of PI's information
|
21 |
+
txt = txt.split("this study has been reviewed and interpreted")[0]
|
22 |
+
txt = txt.split("this finding was communicated to")[0]
|
23 |
+
txt = txt.split("important findings were identified")[0]
|
24 |
+
txt = txt.split("these findings")[0]
|
25 |
+
txt = txt.split("findings above were")[0]
|
26 |
+
txt = txt.split("findings regarding")[0]
|
27 |
+
txt = txt.split("were discussed")[0]
|
28 |
+
txt = txt.split("these images were")[0]
|
29 |
+
txt = txt.split("important finding")[0]
|
30 |
+
|
31 |
+
# remove any section headers
|
32 |
+
txt = re.sub("post-surgical changes:", '', txt)
|
33 |
+
txt = re.sub("post surgical changes:", '', txt)
|
34 |
+
txt = re.sub("primary site:", '', txt)
|
35 |
+
txt = re.sub("primary site", '', txt)
|
36 |
+
txt = re.sub("neck:", '', txt)
|
37 |
+
txt = re.sub("post-treatment changes:", '', txt)
|
38 |
+
txt = re.sub("post treatment changes:", '', txt)
|
39 |
+
txt = re.sub("brain, orbits, spine and lungs:", '', txt)
|
40 |
+
txt = re.sub("primary :", '', txt)
|
41 |
+
txt = re.sub("neck:", '', txt)
|
42 |
+
txt = re.sub("aerodigestive tract:", '', txt)
|
43 |
+
txt = re.sub("calvarium, skull base, and spine:", '', txt)
|
44 |
+
txt = re.sub("other:", '', txt)
|
45 |
+
txt = re.sub("upper neck:", '', txt)
|
46 |
+
txt = re.sub("perineural disease:", '', txt)
|
47 |
+
txt = re.sub("technique:", '', txt)
|
48 |
+
txt = re.sub("comparison:", '', txt)
|
49 |
+
txt = re.sub("paranasal sinuses:", '', txt)
|
50 |
+
txt = re.sub("included orbits:", '', txt)
|
51 |
+
txt = re.sub("nasopharynx:", '', txt)
|
52 |
+
txt = re.sub("tympanomastoid cavities:", '', txt)
|
53 |
+
txt = re.sub("skull base and calvarium:", '', txt)
|
54 |
+
txt = re.sub("included intracranial structures:", '', txt)
|
55 |
+
txt = re.sub("abnormal enhancement:", '', txt)
|
56 |
+
txt = re.sub("lymph nodes:", '', txt)
|
57 |
+
txt = re.sub("impression:", '', txt)
|
58 |
+
txt = re.sub("nodes:", '', txt)
|
59 |
+
txt = re.sub("mri orbits:", '', txt)
|
60 |
+
txt = re.sub("mri brain:", '', txt)
|
61 |
+
txt = re.sub("brain:", '', txt)
|
62 |
+
txt = re.sub("ct face w/:", '', txt)
|
63 |
+
txt = re.sub("transspatial extension:", '', txt)
|
64 |
+
txt = re.sub("thyroid bed:", '', txt)
|
65 |
+
txt = re.sub("additional findings:", '', txt)
|
66 |
+
txt = re.sub("series_image", '', txt)
|
67 |
+
txt = re.sub("series image", '', txt)
|
68 |
+
txt = re.sub("image series", '', txt)
|
69 |
+
txt = re.sub("series", '', txt)
|
70 |
+
|
71 |
+
txt = re.sub(" mm | mm|mm ", " ", txt)
|
72 |
+
txt = re.sub(" series | series|series ", "", txt)
|
73 |
+
txt = re.sub(" cm | cm|cm ", " ", txt)
|
74 |
+
txt = re.sub(" cc | cc|cc ", " ", txt)
|
75 |
+
txt = re.sub(" ct | ct|ct ", " ", txt)
|
76 |
+
txt = re.sub(" mri | mri|mri ", " ", txt)
|
77 |
+
txt = re.sub(" see | see|see ", " ", txt)
|
78 |
+
txt = re.sub(" iia | iia|iia ", " ", txt)
|
79 |
+
txt = re.sub("comment", "", txt)
|
80 |
+
|
81 |
+
|
82 |
+
txt = re.sub("post treatment", '', txt)
|
83 |
+
txt = re.sub("post_treatment", '', txt)
|
84 |
+
txt = re.sub("post-treatment", '', txt)
|
85 |
+
txt = re.sub("findings suggest", '', txt)
|
86 |
+
txt = re.sub("findings", '', txt)
|
87 |
+
txt = re.sub("suggest", '', txt)
|
88 |
+
txt = re.sub("study reviewed", '', txt)
|
89 |
+
txt = re.sub("study", '', txt)
|
90 |
+
txt = re.sub("reviewed", '', txt)
|
91 |
+
txt = re.sub("please see", '', txt)
|
92 |
+
txt = re.sub("please", '', txt)
|
93 |
+
|
94 |
+
txt = re.sub("skull base", '', txt)
|
95 |
+
txt = re.sub("fdg avid", '', txt)
|
96 |
+
txt = re.sub("fdg aivity", '', txt)
|
97 |
+
txt = re.sub("please see chest ct for further evaluation of known lung mass", '', txt)
|
98 |
+
|
99 |
+
txt = re.sub("status_post", '', txt)
|
100 |
+
txt = re.sub("status post|clock|/|'/'", '', txt)
|
101 |
+
txt = re.sub("statuspost|:", '', txt)
|
102 |
+
txt = re.sub(" cm | cm|cm ", " centimeters ", txt)
|
103 |
+
txt = re.sub(" cc | cc|cc ", " cubic centimeters ", txt)
|
104 |
+
txt = re.sub(" ct | ct|ct ", " carat metric ", txt)
|
105 |
+
txt = re.sub(" mm | mm|mm ", " millimeters ", txt)
|
106 |
+
#txt = re.sub("(\\d*\\.\\d+)|(\\d+\\.[0-9 ]+)","",txt)
|
107 |
+
|
108 |
+
# in the worst case, just replace the name from PI to empty string
|
109 |
+
txt = re.sub("dr\\.\\s[^\\s]+", '', txt)
|
110 |
+
|
111 |
+
|
112 |
+
txt = re.sub('\\;', ' .', txt)
|
113 |
+
txt = re.sub('\\.', ' .', txt)
|
114 |
+
|
115 |
+
# Removing multiple spaces
|
116 |
+
txt = re.sub(r'\s+', ' ', txt)
|
117 |
+
|
118 |
+
|
119 |
+
return txt
|
120 |
+
|
121 |
+
def add_bigrams(txt, fixed_bigrams):
|
122 |
+
|
123 |
+
for b in fixed_bigrams:
|
124 |
+
sub = ""
|
125 |
+
not_first = False
|
126 |
+
for x in b[1:]:
|
127 |
+
if not_first:
|
128 |
+
sub += "|"
|
129 |
+
not_first = True
|
130 |
+
|
131 |
+
sub += str(x) + "|" + str(x) + " " + "|" + " " + str(x) + "|" + " " + str(x)
|
132 |
+
txt = re.sub(sub, b[0], txt)
|
133 |
+
|
134 |
+
return txt
|
135 |
+
|
136 |
+
|
137 |
+
def clean_text(txt_orig,filters,stop_words,non_stop_words,freq_words,fixed_bigrams,steam, lemma , clean, min_lenght, eightify=False):
|
138 |
+
txt = remove_noise_text(txt_orig)
|
139 |
+
|
140 |
+
#print("\n\t\tOriginal\n", txt)
|
141 |
+
txt = add_bigrams(txt, fixed_bigrams)
|
142 |
+
#print("\n\t\tCleaned\n", txt)
|
143 |
+
words = preprocessing.preprocess_string(txt, filters)
|
144 |
+
words = add_bigrams(" ".join(w for w in words), fixed_bigrams).split()
|
145 |
+
|
146 |
+
txt = " ".join(w for w in words)
|
147 |
+
|
148 |
+
# eightify: collapse every digit into the token "8" instead of spelling numbers out
|
149 |
+
#
|
150 |
+
if eightify:
|
151 |
+
replaces = [ ["her2|her 2|her two", " hertwo "], ["0", "8"], ["1", "8"], ["2", "8"], ["3", "8"],["4", "8"],
|
152 |
+
["5", "8"],["6", "8"] ,["7", "8"] ,["8", "8"] ,["9", "8"] ,
|
153 |
+
["\\>", " greather "], ["\\<", " less "]]
|
154 |
+
|
155 |
+
else:
|
156 |
+
replaces = [ ["her2|her 2|her two", " hertwo "], ["0", "zero "], ["1", "one "], ["2", "two "], ["3", "three "],["4", "four "],
|
157 |
+
["5", "five "],["6", "six "] ,["7", "seven "] ,["8", "eight "] ,["9", "nine " ] ,
|
158 |
+
["\\>", " greather "], ["\\<", " less "]]
|
159 |
+
|
160 |
+
|
161 |
+
for sub in replaces:
|
162 |
+
txt = re.sub(sub[0], sub[1], txt)
|
163 |
+
|
164 |
+
# Removing multiple spaces
|
165 |
+
txt = re.sub(r'\s+', ' ', txt)
|
166 |
+
|
167 |
+
words = txt.split()
|
168 |
+
|
169 |
+
if clean:
|
170 |
+
words = [w for w in words if (not w in stop_words and re.search("[a-z-A-Z]+\\w+",w) != None and (len(w) >min_lenght or w in non_stop_words) or w=='.') ]
|
171 |
+
else:
|
172 |
+
words = [w for w in words if (re.search("[a-z-A-Z]+\\w+",w) != None and (len(w) >min_lenght or w in non_stop_words) or w=='.')]
|
173 |
+
|
174 |
+
c_words = words.copy()
|
175 |
+
|
176 |
+
if steam:
|
177 |
+
porter = PorterStemmer()
|
178 |
+
c_words = [porter.stem(word) for word in c_words if not porter.stem(word) in freq_words and (len(porter.stem(word)) >min_lenght or word in non_stop_words or word=='.')]
|
179 |
+
|
180 |
+
if lemma:
|
181 |
+
lem = nltk.stem.wordnet.WordNetLemmatizer()
|
182 |
+
c_words = [lem.lemmatize(word) for word in c_words if not lem.lemmatize(word) in freq_words and (len(lem.lemmatize(word)) >min_lenght or word in non_stop_words or word=='.')]
|
183 |
+
|
184 |
+
return c_words
|
185 |
+
|
186 |
+
|
187 |
+
def text_cleaning(data, steam=False, lemma = True, clean=True, min_lenght=2, remove_punctuation=True,
|
188 |
+
freq_words_analysis=False, single_input=False,eightify=True):
|
189 |
+
|
190 |
+
clean_txt = []
|
191 |
+
|
192 |
+
|
193 |
+
freq_words = ["breast","biopsy","margin","dual","tissue","excision","change","core","identified",
|
194 |
+
"mastectomy","site","report","lesion","superior","anterior","inferior","medial",
|
195 |
+
"lateral","synoptic","evidence","slide", "brbx"]
|
196 |
+
|
197 |
+
# position 0 means the bigram output - 1:end means how they may come on text
|
198 |
+
fixed_bigrams = [ [' grade_one ', 'grade 1', 'grade i', 'grade I', 'grade one',],
|
199 |
+
[' grade_two ', 'grade 2', 'grade ii', 'grade II', 'grade two', ],
|
200 |
+
[' grade_three ', 'grade 3' , 'grade iii', 'grade III', 'grade three']]
|
201 |
+
|
202 |
+
|
203 |
+
if remove_punctuation:
|
204 |
+
filters = [lambda x: x.lower(), strip_tags, strip_punctuation]
|
205 |
+
else:
|
206 |
+
filters = [lambda x: x.lower(), strip_tags]
|
207 |
+
|
208 |
+
stop_words = set(stopwords.words('english'))
|
209 |
+
non_stop_words = ['no', 'than', 'not']
|
210 |
+
for x in non_stop_words:
|
211 |
+
stop_words.remove(x)
|
212 |
+
|
213 |
+
if single_input:
|
214 |
+
c_words = clean_text(data,filters,stop_words,non_stop_words,freq_words,fixed_bigrams,steam, lemma, clean, min_lenght,eightify=eightify)
|
215 |
+
if len(c_words)>0:
|
216 |
+
if c_words[0] =='.':
|
217 |
+
c_words = c_words[1:]
|
218 |
+
clean_txt.append(c_words)
|
219 |
+
|
220 |
+
else:
|
221 |
+
for i in range(data.shape[0]):
|
222 |
+
txt_orig = data.iloc[i].lower()
|
223 |
+
c_words = clean_text(txt_orig,filters,stop_words,non_stop_words,freq_words,fixed_bigrams,steam, lemma, clean, min_lenght,eightify=eightify)
|
224 |
+
if len(c_words)>0:
|
225 |
+
if c_words[0] =='.':
|
226 |
+
c_words = c_words[1:]
|
227 |
+
clean_txt.append(c_words)
|
228 |
+
|
229 |
+
|
230 |
+
if freq_words_analysis:
|
231 |
+
flatten_corpus = [j for sub in clean_txt for j in sub]
|
232 |
+
clean_txt = []
|
233 |
+
unique = list(set(flatten_corpus))
|
234 |
+
wordfreq = [flatten_corpus.count(p) for p in unique]
|
235 |
+
wordfreq = dict(list(zip(unique,wordfreq)))
|
236 |
+
|
237 |
+
freqdict = [(wordfreq[key], key) for key in wordfreq]
|
238 |
+
freqdict.sort()
|
239 |
+
freqdict.reverse()
|
240 |
+
|
241 |
+
df = pd.DataFrame(freqdict,columns = ['Frequency','Word'])
|
242 |
+
|
243 |
+
|
244 |
+
df.to_excel('../mammo_word_count.xls')
|
245 |
+
|
246 |
+
return clean_txt
|
247 |
+
|
248 |
+
if __name__ == '__main__':
|
249 |
+
exit()
|
250 |
+
|
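A short usage sketch of the discriminator-side cleaning above, assuming the NLTK stopwords and WordNet corpora are available (they are downloaded in pipeline.py); the sample sentence is a placeholder:

```python
from text_cleaning import text_cleaning

# single_input=True returns a list containing one token list
tokens = text_cleaning("Invasive ductal carcinoma, grade 2, margins negative.",
                       steam=False, lemma=True, single_input=True)[0]
print(tokens)  # tokens after noise removal, fixed-bigram merging and lemmatization
```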
app/src/text_cleaning_transforerms.py
ADDED
@@ -0,0 +1,229 @@
1 |
+
import pandas as pd
|
2 |
+
from os import walk
|
3 |
+
from os import listdir
|
4 |
+
from os.path import isfile, join
|
5 |
+
import numpy as np
|
6 |
+
import re
|
7 |
+
|
8 |
+
from gensim.parsing import preprocessing
|
9 |
+
from gensim.parsing.preprocessing import strip_tags, strip_punctuation
|
10 |
+
from nltk.tokenize import word_tokenize, sent_tokenize
|
11 |
+
import math
|
12 |
+
from tqdm import tqdm
|
13 |
+
|
14 |
+
def remove_noise_text(txt):
|
15 |
+
|
16 |
+
txt = txt.lower()
|
17 |
+
txt = re.sub("primary site:", ' ', txt)
|
18 |
+
|
19 |
+
#txt = re.sub('post-surgical changes', ' ', txt.lower())
|
20 |
+
|
21 |
+
# Remove any mention of "Findings were discussed with ..."
|
22 |
+
txt = txt.split("findings were discussed with")[0]
|
23 |
+
|
24 |
+
# Remove any other occurrence of PI's information
|
25 |
+
txt = txt.split("this study has been reviewed and interpreted")[0]
|
26 |
+
txt = txt.split("this finding was communicated to")[0]
|
27 |
+
txt = txt.split("important findings were identified")[0]
|
28 |
+
txt = txt.split("these findings")[0]
|
29 |
+
txt = txt.split("findings above were")[0]
|
30 |
+
txt = txt.split("findings regarding")[0]
|
31 |
+
txt = txt.split("were discussed")[0]
|
32 |
+
txt = txt.split("these images were")[0]
|
33 |
+
txt = txt.split("important finding")[0]
|
34 |
+
|
35 |
+
# remove any section headers
|
36 |
+
txt = re.sub("post-surgical changes:", ' ', txt)
|
37 |
+
txt = re.sub("post surgical changes:", ' ', txt)
|
38 |
+
txt = re.sub("primary site:", ' ', txt)
|
39 |
+
txt = re.sub("primary site", ' ', txt)
|
40 |
+
txt = re.sub("neck:", ' ', txt)
|
41 |
+
txt = re.sub("post-treatment changes:", ' ', txt)
|
42 |
+
txt = re.sub("post treatment changes:", ' ', txt)
|
43 |
+
txt = re.sub("brain, orbits, spine and lungs:", ' ', txt)
|
44 |
+
txt = re.sub("primary :", ' ', txt)
|
45 |
+
txt = re.sub("neck:", ' ', txt)
|
46 |
+
txt = re.sub("aerodigestive tract:", ' ', txt)
|
47 |
+
txt = re.sub("calvarium, skull base, and spine:", ' ', txt)
|
48 |
+
txt = re.sub("other:", ' ', txt)
|
49 |
+
txt = re.sub("upper neck:", ' ', txt)
|
50 |
+
txt = re.sub("perineural disease:", ' ', txt)
|
51 |
+
txt = re.sub("technique:", ' ', txt)
|
52 |
+
txt = re.sub("comparison:", ' ', txt)
|
53 |
+
txt = re.sub("paranasal sinuses:", ' ', txt)
|
54 |
+
txt = re.sub("included orbits:", ' ', txt)
|
55 |
+
txt = re.sub("nasopharynx:", ' ', txt)
|
56 |
+
txt = re.sub("tympanomastoid cavities:", ' ', txt)
|
57 |
+
txt = re.sub("skull base and calvarium:", ' ', txt)
|
58 |
+
txt = re.sub("included intracranial structures:", ' ', txt)
|
59 |
+
txt = re.sub("impression:", ' ', txt)
|
60 |
+
txt = re.sub("nodes:", ' ', txt)
|
61 |
+
txt = re.sub("mri orbits:", ' ', txt)
|
62 |
+
txt = re.sub("mri brain:", ' ', txt)
|
63 |
+
txt = re.sub("brain:", ' ', txt)
|
64 |
+
txt = re.sub("ct face w/:", ' ', txt)
|
65 |
+
txt = re.sub("transspatial extension:", ' ', txt)
|
66 |
+
txt = re.sub("thyroid bed:", ' ', txt)
|
67 |
+
txt = re.sub("additional findings:", ' ', txt)
|
68 |
+
txt = re.sub("series_image", ' ', txt)
|
69 |
+
txt = re.sub("series image", ' ', txt)
|
70 |
+
txt = re.sub("image series", ' ', txt)
|
71 |
+
txt = re.sub("see synoptic report", ' ', txt)
|
72 |
+
txt = re.sub("see report", ' ', txt)
|
73 |
+
|
74 |
+
txt = re.sub("brstwo|brstmarun|brstwln|brlump|lnbx", ' ', txt)
|
75 |
+
|
76 |
+
txt = re.sub("post_treatment", 'post treatment', txt)
|
77 |
+
txt = re.sub("post-treatment", 'post treatment', txt)
|
78 |
+
|
79 |
+
txt = re.sub("nonmasslike", 'non mass like', txt)
|
80 |
+
txt = re.sub("non_mass_like", 'non mass like', txt)
|
81 |
+
txt = re.sub("non-mass-like", 'non mass like', txt)
|
82 |
+
txt = re.sub("statuspost", 'status post', txt)
|
83 |
+
|
84 |
+
|
85 |
+
# in the worst case, just replace the name from PI to empty string
|
86 |
+
txt = re.sub("dr\\.\\s[^\\s]+", ' ', txt)
|
87 |
+
|
88 |
+
txt = re.sub(" series | series|series ", "", txt)
|
89 |
+
txt = re.sub(" cm | cm|cm ", " centimeters ", txt)
|
90 |
+
txt = re.sub(" cc | cc|cc ", " cubic centimeters ", txt)
|
91 |
+
txt = re.sub(" ct | ct|ct ", " carat metric ", txt)
|
92 |
+
txt = re.sub(" mm | mm|mm ", " millimeters ", txt)
|
93 |
+
|
94 |
+
txt = re.sub("status_post|o\'", '', txt)
|
95 |
+
txt = re.sub("status post|clock|/|'/'", '', txt)
|
96 |
+
txt = re.sub("statuspost", '', txt)
|
97 |
+
txt = re.sub("brstwo|brlump|brstmarun|brwire|brstcap|", '', txt)
|
98 |
+
|
99 |
+
txt = re.sub("\\(|\\)", ',', txt)
|
100 |
+
txt = re.sub(",,", ',', txt)
|
101 |
+
txt = re.sub(",\\.", '.', txt)
|
102 |
+
txt = re.sub(", \\.", '.', txt)
|
103 |
+
|
104 |
+
txt = re.sub(" ,", ', ', txt)
|
105 |
+
txt = re.sub("a\\.", ' ', txt[0:5]) + txt[5:]
|
106 |
+
txt = re.sub("b\\.", ' ', txt[0:5]) + txt[5:]
|
107 |
+
txt = re.sub("c\\.", ' ', txt[0:5]) + txt[5:]
|
108 |
+
txt = re.sub("d\\.", ' ', txt[0:5]) + txt[5:]
|
109 |
+
txt = re.sub("e\\.", ' ', txt[0:5]) + txt[5:]
|
110 |
+
txt = re.sub("f\\.", ' ', txt[0:5]) + txt[5:]
|
111 |
+
|
112 |
+
|
113 |
+
# in the worst case, just replace the name from PI to empty string
|
114 |
+
txt = re.sub("dr\\.\\s[^\\s]+", '', txt)
|
115 |
+
|
116 |
+
# Removing multiple spaces
|
117 |
+
txt = re.sub(r'\s+', ' ', txt)
|
118 |
+
txt = re.sub(' +', ' ', txt)
|
119 |
+
|
120 |
+
txt = txt.rstrip().lstrip()
|
121 |
+
|
122 |
+
return txt
|
123 |
+
|
124 |
+
|
125 |
+
def add_bigrams(txt, fixed_bigrams):
|
126 |
+
|
127 |
+
for b in fixed_bigrams:
|
128 |
+
sub = ""
|
129 |
+
not_first = False
|
130 |
+
for x in b[1:]:
|
131 |
+
if not_first:
|
132 |
+
sub += "|"
|
133 |
+
not_first = True
|
134 |
+
|
135 |
+
sub += str(x) + "|" + str(x) + " " + "|" + " " + str(x) + "|" + " " + str(x)
|
136 |
+
txt = re.sub(sub, b[0], txt)
|
137 |
+
|
138 |
+
|
139 |
+
return txt
|
140 |
+
|
141 |
+
def extra_clean_text(clean_t,fixed_bigrams):
|
142 |
+
|
143 |
+
txt = add_bigrams(clean_t, fixed_bigrams)
|
144 |
+
replaces = [ ["her2|her 2|her two", " hertwo "],
|
145 |
+
# ["0", "zero "], ["1", "one "], ["2", "two "], ["3", "three "],["4", "four "],
|
146 |
+
# ["5", "five "],["6", "six "] ,["7", "seven "] ,["8", "eight "] ,["9", "nine " ] ,
|
147 |
+
["\\>", " greather "], ["\\<", " less "]]
|
148 |
+
|
149 |
+
for sub in replaces:
|
150 |
+
txt = re.sub(sub[0], sub[1], txt)
|
151 |
+
|
152 |
+
return txt
|
153 |
+
|
154 |
+
|
155 |
+
def text_cleaning(data,min_lenght=2,extra_clean=True, remove_punctuation=False):
|
156 |
+
|
157 |
+
# position 0 means the bigram output - 1:end means how they may come on text
|
158 |
+
fixed_bigrams = [ [' gradeone ', 'grade 1', 'grade i', 'grade I', 'grade one',],
|
159 |
+
[' gradetwo ', 'grade 2', 'grade ii', 'grade II', 'grade two', ],
|
160 |
+
[' gradethree ', 'grade 3' , 'grade iii', 'grade III', 'grade three']]
|
161 |
+
|
162 |
+
clean_txt = []
|
163 |
+
|
164 |
+
clean_t = remove_noise_text(data)
|
165 |
+
if extra_clean:
|
166 |
+
clean_t = extra_clean_text(clean_t,fixed_bigrams)
|
167 |
+
if remove_punctuation:
|
168 |
+
filters = [lambda x: x.lower(), strip_tags, strip_punctuation]
|
169 |
+
else:
|
170 |
+
filters = [lambda x: x.lower(), strip_tags]
|
171 |
+
|
172 |
+
clean_t = " ".join(x for x in preprocessing.preprocess_string(clean_t, filters) if len(x) >=min_lenght)
|
173 |
+
|
174 |
+
|
175 |
+
# Removing multiple spaces
|
176 |
+
clean_t = re.sub(r'\s+', ' ', clean_t)
|
177 |
+
|
178 |
+
return clean_t
|
179 |
+
|
180 |
+
# set only_data = True if there is no need to get scores or if the dataset doesn't have a score
|
181 |
+
def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
|
182 |
+
|
183 |
+
|
184 |
+
data_pre_processed = text_cleaning(data,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
|
185 |
+
|
186 |
+
"""
|
187 |
+
Partion the data into max_size chunks
|
188 |
+
"""
|
189 |
+
sentences = sent_tokenize(data)
|
190 |
+
data_pre_processed_chunks,sample = [],""
|
191 |
+
|
192 |
+
# Were able to split into sentences
|
193 |
+
if len(sentences)>1:
|
194 |
+
for index,sentence in enumerate(sentences):
|
195 |
+
if len(sentence.split()) + len(sample.split()) <= max_size:
|
196 |
+
sample += sentence
|
197 |
+
else:
|
198 |
+
data_pre_processed_chunks.append(text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation))
|
199 |
+
sample = sentence if index < len(sentences)-1 else ""
|
200 |
+
|
201 |
+
if len(sample) ==0:
|
202 |
+
clean_data = text_cleaning(sentences[-1],min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
|
203 |
+
else:
|
204 |
+
clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
|
205 |
+
|
206 |
+
#if len(clean_data.split()) >3:
|
207 |
+
data_pre_processed_chunks.append(clean_data)
|
208 |
+
|
209 |
+
# Otherwise split into max_size-word chunks
|
210 |
+
else:
|
211 |
+
words = word_tokenize(data)
|
212 |
+
lower_b, upper_b = 0, max_size
|
213 |
+
for x in range(math.ceil(len(words)/max_size)):
|
214 |
+
sample = " ".join(x for x in words[lower_b:upper_b])
|
215 |
+
lower_b, upper_b = upper_b, upper_b+max_size
|
216 |
+
clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
|
217 |
+
#if len(clean_data.split()) >3:
|
218 |
+
data_pre_processed_chunks.append(clean_data)
|
219 |
+
|
220 |
+
# return the pre_processed of whoole text and chunks
|
221 |
+
return data_pre_processed,data_pre_processed_chunks
|
222 |
+
|
223 |
+
if __name__ == '__main__':
|
224 |
+
exit(1)
|
225 |
+
|
226 |
+
|
227 |
+
|
228 |
+
|
229 |
+
|
environment.yml
ADDED
@@ -0,0 +1,210 @@
name: pathology
channels:
  - pytorch
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=4.5=1_gnu
  - blas=1.0=mkl
  - ca-certificates=2021.10.26=h06a4308_2
  - certifi=2021.10.8=py38h06a4308_2
  - cudatoolkit=11.3.1=h2bc3f7f_2
  - intel-openmp=2022.0.1=h06a4308_3633
  - ld_impl_linux-64=2.35.1=h7274673_9
  - libffi=3.3=he6710b0_2
  - libgcc-ng=9.3.0=h5101ec6_17
  - libgomp=9.3.0=h5101ec6_17
  - libstdcxx-ng=9.3.0=hd4cf53a_17
  - libuv=1.40.0=h7b6447c_0
  - mkl=2022.0.1=h06a4308_117
  - ncurses=6.3=h7f8727e_2
  - openssl=1.1.1m=h7f8727e_0
  - pip=21.2.4=py38h06a4308_0
  - python=3.8.12=h12debd9_0
  - pytorch-mutex=1.0=cuda
  - readline=8.1.2=h7f8727e_1
  - setuptools=58.0.4=py38h06a4308_0
  - sqlite=3.37.0=hc218d9a_0
  - tk=8.6.11=h1ccaba5_0
  - typing_extensions=3.10.0.2=pyh06a4308_0
  - wheel=0.37.1=pyhd3eb1b0_0
  - xz=5.2.5=h7b6447c_0
  - zlib=1.2.11=h7f8727e_4
  - pip:
    - absl-py==1.0.0
    - aiohttp==3.8.1
    - aiosignal==1.2.0
    - altair==4.2.0
    - argon2-cffi==21.3.0
    - argon2-cffi-bindings==21.2.0
    - astor==0.8.1
    - asttokens==2.0.5
    - async-timeout==4.0.2
    - attrs==21.4.0
    - backcall==0.2.0
    - backports-zoneinfo==0.2.1
    - base58==2.1.1
    - black==22.1.0
    - bleach==4.1.0
    - blinker==1.4
    - cachetools==5.0.0
    - cffi==1.15.0
    - charset-normalizer==2.0.11
    - click==8.0.3
    - configparser==5.2.0
    - cycler==0.11.0
    - datasets==1.18.2
    - debugpy==1.5.1
    - decorator==5.1.1
    - defusedxml==0.7.1
    - dill==0.3.4
    - docker-pycreds==0.4.0
    - entrypoints==0.3
    - et-xmlfile==1.1.0
    - executing==0.8.2
    - filelock==3.4.2
    - fire==0.4.0
    - fonttools==4.29.0
    - frozenlist==1.3.0
    - fsspec==2022.1.0
    - gensim==4.1.2
    - gitdb==4.0.9
    - gitpython==3.1.26
    - google-auth==2.6.0
    - google-auth-oauthlib==0.4.6
    - grpcio==1.43.0
    - htbuilder==0.6.0
    - huggingface-hub==0.4.0
    - idna==3.3
    - imageio==2.14.1
    - importlib-metadata==4.10.1
    - importlib-resources==5.4.0
    - ipykernel==6.7.0
    - ipython==8.0.1
    - ipython-genutils==0.2.0
    - ipywidgets==7.6.5
    - iteration-utilities==0.11.0
    - jedi==0.18.1
    - jinja2==3.0.3
    - joblib==1.1.0
    - jsonschema==4.4.0
    - jupyter-client==7.1.2
    - jupyter-core==4.9.1
    - jupyterlab-pygments==0.1.2
    - jupyterlab-widgets==1.0.2
    - kiwisolver==1.3.2
    - lime==0.2.0.1
    - llvmlite==0.38.0
    - markdown==3.3.6
    - markupsafe==2.0.1
    - matplotlib==3.5.1
    - matplotlib-inline==0.1.3
    - mistune==0.8.4
    - multidict==6.0.2
    - multiprocess==0.70.12.2
    - mypy-extensions==0.4.3
    - nbclient==0.5.10
    - nbconvert==6.4.1
    - nbformat==5.1.3
    - nest-asyncio==1.5.4
    - networkx==2.6.3
    - nltk==3.6.7
    - notebook==6.4.8
    - numba==0.55.1
    - numpy==1.21.5
    - oauthlib==3.2.0
    - openpyxl==3.0.9
    - packaging==21.3
    - pandas==1.4.0
    - pandocfilters==1.5.0
    - parso==0.8.3
    - pathspec==0.9.0
    - pathtools==0.1.2
    - pexpect==4.8.0
    - pickleshare==0.7.5
    - pillow==9.0.0
    - platformdirs==2.4.1
    - plotly==5.5.0
    - prometheus-client==0.13.1
    - promise==2.3
    - prompt-toolkit==3.0.26
    - protobuf==3.19.4
    - psutil==5.9.0
    - ptyprocess==0.7.0
    - pure-eval==0.2.2
    - pyarrow==6.0.1
    - pyasn1==0.4.8
    - pyasn1-modules==0.2.8
    - pycparser==2.21
    - pydeck==0.7.1
    - pygments==2.11.2
    - pympler==1.0.1
    - pynndescent==0.5.6
    - pyparsing==3.0.7
    - pyrsistent==0.18.1
    - python-dateutil==2.8.2
    - pytz==2021.3
    - pytz-deprecation-shim==0.1.0.post0
    - pywavelets==1.2.0
    - pyyaml==6.0
    - pyzmq==22.3.0
    - regex==2022.1.18
    - requests==2.27.1
    - requests-oauthlib==1.3.1
    - rsa==4.8
    - sacremoses==0.0.47
    - scikit-image==0.19.1
    - scikit-learn==1.0.2
    - scikit-multilearn==0.2.0
    - scipy==1.7.3
    - semantic-version==2.8.5
    - send2trash==1.8.0
    - sentencepiece==0.1.96
    - sentry-sdk==1.5.4
    - seqeval==1.2.2
    - setuptools-rust==1.1.2
    - shortuuid==1.0.8
    - simpletransformers==0.63.4
    - six==1.16.0
    - smart-open==5.2.1
    - smmap==5.0.0
    - stack-data==0.1.4
    - streamlit==1.5.0
    - subprocess32==3.5.4
    - tenacity==8.0.1
    - tensorboard==2.8.0
    - tensorboard-data-server==0.6.1
    - tensorboard-plugin-wit==1.8.1
    - termcolor==1.1.0
    - terminado==0.13.1
    - testpath==0.5.0
    - threadpoolctl==3.1.0
    - tifffile==2021.11.2
    - tokenizers==0.11.4
    - toml==0.10.2
    - tomli==2.0.0
    - toolz==0.11.2
    - torch==1.9.0
    - torchvision==0.10.0
    - tornado==6.1
    - tqdm==4.62.3
    - traitlets==5.1.1
    - transformers==4.16.2
    - tzdata==2021.5
    - tzlocal==4.1
    - umap-learn==0.5.2
    - urllib3==1.26.8
    - validators==0.18.2
    - wandb==0.12.9
    - watchdog==2.1.6
    - wcwidth==0.2.5
    - webencodings==0.5.1
    - werkzeug==2.0.2
    - widgetsnbextension==3.5.2
    - xgboost==1.4.2
    - xxhash==2.0.2
    - yarl==1.7.2
    - yaspin==2.1.0
    - zipp==3.7.0
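Assuming the standard Conda workflow (not stated in the commit itself), this file can be used to recreate the `pathology` environment with `conda env create -f environment.yml` followed by `conda activate pathology`.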
requirements.txt
ADDED
@@ -0,0 +1,180 @@
Flask==1.1.2
Flask_Login==0.5.0
Flask_SQLAlchemy==2.5.1
Flask_WTF==0.15.1
WTForms==2.3.3
absl-py==1.0.0
aiohttp==3.8.1
aiosignal==1.2.0
altair==4.2.0
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
astor==0.8.1
asttokens==2.0.5
async-timeout==4.0.2
attrs==21.4.0
backcall==0.2.0
backports-zoneinfo==0.2.1
base58==2.1.1
black==22.1.0
bleach==4.1.0
blinker==1.4
cachetools==5.0.0
cffi==1.15.0
charset-normalizer==2.0.11
click==8.0.3
configparser==5.2.0
cycler==0.11.0
datasets==1.18.2
debugpy==1.5.1
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.4
docker-pycreds==0.4.0
entrypoints==0.3
et-xmlfile==1.1.0
executing==0.8.2
filelock==3.4.2
fire==0.4.0
fonttools==4.29.0
frozenlist==1.3.0
fsspec==2022.1.0
gensim==4.1.2
gitdb==4.0.9
gitpython==3.1.26
google-auth==2.6.0
google-auth-oauthlib==0.4.6
grpcio==1.43.0
htbuilder==0.6.0
huggingface-hub==0.4.0
idna==3.3
imageio==2.14.1
importlib-metadata==4.10.1
importlib-resources==5.4.0
ipykernel==6.7.0
ipython==8.0.1
ipython-genutils==0.2.0
ipywidgets==7.6.5
iteration-utilities==0.11.0
jedi==0.18.1
jinja2==3.0.3
joblib==1.1.0
jsonschema==4.4.0
jupyter-client==7.1.2
jupyter-core==4.9.1
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.2
kiwisolver==1.3.2
lime==0.2.0.1
llvmlite==0.38.0
markdown==3.3.6
markupsafe==2.0.1
matplotlib==3.5.1
matplotlib-inline==0.1.3
mistune==0.8.4
multidict==6.0.2
multiprocess==0.70.12.2
mypy-extensions==0.4.3
nbclient==0.5.10
nbconvert==6.4.1
nbformat==5.1.3
nest-asyncio==1.5.4
networkx==2.6.3
nltk==3.6.7
notebook==6.4.8
numba==0.55.1
numpy==1.21.5
oauthlib==3.2.0
openpyxl==3.0.9
packaging==21.3
pandas==1.4.0
pandocfilters==1.5.0
parso==0.8.3
pathspec==0.9.0
pathtools==0.1.2
pexpect==4.8.0
pickleshare==0.7.5
pillow==9.0.0
platformdirs==2.4.1
plotly==5.5.0
prometheus-client==0.13.1
promise==2.3
prompt-toolkit==3.0.26
protobuf==3.19.4
psutil==5.9.0
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==6.0.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
pydeck==0.7.1
pygments==2.11.2
pympler==1.0.1
pynndescent==0.5.6
pyparsing==3.0.7
pyrsistent==0.18.1
python-dateutil==2.8.2
pytz==2021.3
pytz-deprecation-shim==0.1.0.post0
pywavelets==1.2.0
pyyaml==6.0
pyzmq==22.3.0
regex==2022.1.18
requests==2.27.1
requests-oauthlib==1.3.1
rsa==4.8
sacremoses==0.0.47
scikit-image==0.19.1
scikit-learn==1.0.2
scikit-multilearn==0.2.0
scipy==1.7.3
semantic-version==2.8.5
send2trash==1.8.0
sentencepiece==0.1.96
sentry-sdk==1.5.4
seqeval==1.2.2
setuptools-rust==1.1.2
shortuuid==1.0.8
simpletransformers==0.63.4
six==1.16.0
smart-open==5.2.1
smmap==5.0.0
stack-data==0.1.4
streamlit==1.5.0
subprocess32==3.5.4
tenacity==8.0.1
tensorboard==2.8.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
termcolor==1.1.0
terminado==0.13.1
testpath==0.5.0
threadpoolctl==3.1.0
tifffile==2021.11.2
tokenizers==0.11.4
toml==0.10.2
tomli==2.0.0
toolz==0.11.2
torch==1.9.0
torchvision==0.10.0
tornado==6.1
tqdm==4.62.3
traitlets==5.1.1
transformers==4.16.2
tzdata==2021.5
tzlocal==4.1
umap-learn==0.5.2
urllib3==1.26.8
validators==0.18.2
wandb==0.12.9
watchdog==2.1.6
wcwidth==0.2.5
webencodings==0.5.1
werkzeug==2.0.2
widgetsnbextension==3.5.2
xgboost==1.4.2
xxhash==2.0.2
yarl==1.7.2
yaspin==2.1.0
zipp==3.7.0
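For deployments without Conda, the same pip-level dependencies, plus the Flask packages listed at the top, can be installed with the standard `pip install -r requirements.txt`; this is an assumption about the intended workflow rather than something stated in the commit.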