# Hugging Face Space page header (status: Paused)
# import pandas_profiling as pp
import json
import os

import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from tensorflow.python.framework import tensor_shape
from transformers import AutoTokenizer

# ---
# Warm-up: load the three clinical terminology code sets from the Hugging Face
# Hub and try a few dataset tools before the main app is defined.
# NOTE: everything below runs at import time and needs network access.
# ---

# LOINC
datasetLOINC = load_dataset("awacke1/LOINC-CodeSet-Value-Description.csv", split="train")
# SNOMED
datasetSNOMED = load_dataset("awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv", split="train")
# eCQM
dataseteCQM = load_dataset("awacke1/eCQM-Code-Value-Semantic-Set.csv", split="train")

# Tokenize every LOINC description in batches with the BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
dataset = datasetLOINC.map(lambda examples: tokenizer(examples["Description"]), batched=True)

# Show the first tokenized record as a sanity check.
JSONOBJ2 = dataset[0]
print(JSONOBJ2)

# Prefix filter demo: LOINC rows whose description starts with "Allergy".
sw = datasetLOINC.filter(lambda example: example["Description"].startswith("Allergy"))
print(sw)

print(datasetLOINC)
print(datasetSNOMED)
print(dataseteCQM)

# ---
# Main Stage - Begin!
# ---
# Hugging Face access token taken from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Terminology names offered by the checkbox group, radio and dropdown inputs.
CHOICES = ["SNOMED", "LOINC", "CQM"]

# Sample JSON payload; in this file it is only referenced from commented-out code.
JSONOBJ = """{"items":{"item":[{"id": "0001","type": null,"is_good": false,"ppu": 0.55,"batters":{"batter":[{ "id": "1001", "type": "Regular" },{ "id": "1002", "type": "Chocolate" },{ "id": "1003", "type": "Blueberry" },{ "id": "1004", "type": "Devil's Food" }]},"topping":[{ "id": "5001", "type": "None" },{ "id": "5002", "type": "Glazed" },{ "id": "5005", "type": "Sugar" },{ "id": "5007", "type": "Powdered Sugar" },{ "id": "5006", "type": "Chocolate with Sprinkles" },{ "id": "5003", "type": "Chocolate" },{ "id": "5004", "type": "Maple" }]}]}}"""
def concatenate_text(examples):
    """Join a record's code, description and clinical focus into one text field.

    Args:
        examples: Mapping with string values under the keys ``"Code"``,
            ``"Description"`` and ``"Purpose: Clinical Focus"``.

    Returns:
        A dict ``{"text": ...}`` holding the three values joined by ``" \\n "``.

    Raises:
        KeyError: If one of the three keys is missing.
        TypeError: If one of the values is not a string.
    """
    parts = (
        examples["Code"],
        examples["Description"],
        examples["Purpose: Clinical Focus"],
    )
    return {"text": " \n ".join(parts)}
def cls_pooling(model_output):
    """Return the hidden state at position 0 of the second axis for every item.

    Args:
        model_output: Object exposing ``last_hidden_state``, an array/tensor
            indexable as ``[:, 0]``.
            NOTE(review): presumably shaped (batch, sequence, hidden), so
            index 0 would be the first token — confirm against the model used.

    Returns:
        ``model_output.last_hidden_state[:, 0]``.
    """
    return model_output.last_hidden_state[:, 0]
def get_embeddings(text_list):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The text is tokenized with padding and truncation into TensorFlow
    tensors, passed through ``model``, and reduced with ``cls_pooling``.

    NOTE: this function reads the module globals ``tokenizer`` and ``model``;
    both must be bound (and belong to the same checkpoint) before it is called.

    Args:
        text_list: Text to embed, passed straight to ``tokenizer``.
            NOTE(review): presumably a string or a list of strings — confirm
            against the tokenizer in use.

    Returns:
        The result of ``cls_pooling`` applied to the model output.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="tf"
    )
    # Unpack the encoding into a plain dict so it can be splatted as kwargs.
    model_inputs = dict(encoded_input.items())
    model_output = model(**model_inputs)
    return cls_pooling(model_output)
# Checkpoint whose tokenizer and model are used to embed the clinical text.
_EMBEDDING_CKPT = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

# Columns retained from the LOINC matches; every other column is dropped.
_LOINC_COLUMNS_TO_KEEP = [
    "Value Set Name",
    "Code",
    "Description",
    "Purpose: Clinical Focus",
    "Code System OID",
]

# Flipped to True once the embedding tokenizer/model are bound as module globals.
_embedding_model_loaded = False


def _load_embedding_model():
    """Load the embedding tokenizer and model once and bind them as module globals.

    ``get_embeddings`` reads the module-level names ``tokenizer`` and
    ``model``. Binding them here (rather than as locals of ``fn``) is what
    makes that lookup succeed, and keeps tokenizer and model on the same
    checkpoint.
    """
    global tokenizer, model, _embedding_model_loaded
    if _embedding_model_loaded:
        return
    from transformers import AutoTokenizer, TFAutoModel

    tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_CKPT)
    model = TFAutoModel.from_pretrained(_EMBEDDING_CKPT, from_pt=True)
    _embedding_model_loaded = True


def _embed_clinical_text(matches_df):
    """Build the text/embedding dataset from the DataFrame of LOINC matches."""
    from datasets import Dataset

    exploded = matches_df.explode("Purpose: Clinical Focus", ignore_index=True)
    clinical_dataset = Dataset.from_pandas(exploded)
    # Keep only rows whose description has more than 15 whitespace-separated words.
    clinical_dataset = clinical_dataset.map(lambda x: {"c_length": len(x["Description"].split())})
    clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
    clinical_dataset = clinical_dataset.map(concatenate_text)

    _load_embedding_model()
    return clinical_dataset.map(
        lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]})


def _label_confidences(num, slider1, slider2):
    """Return each input's share of the three-way total for the Label output."""
    total = num + slider1 + slider2
    if total == 0:
        # Avoid ZeroDivisionError when the inputs cancel out.
        return {"positive": 0.0, "negative": 0.0, "neutral": 0.0}
    return {
        "positive": num / total,
        "negative": slider1 / total,
        "neutral": slider2 / total,
    }


def fn(text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
       video, audio1, audio2, file, df1, df2,):
    """Gradio callback: filter the terminology datasets and build all outputs.

    NOTE(review): the dataset prefix filters are hard-coded ("Allergy" for
    LOINC, "Hospital" for SNOMED, "Telephone" for eCQM). ``text1``/``text2``
    are only echoed in the text output; wire them into the filters if the
    search is meant to be user-driven.

    Args:
        text1: First text input; echoed when ``single_checkbox`` is truthy.
        text2: Second text input; echoed otherwise.
        num: Number used as the "positive" weight of the label output.
        slider1: Number used as the "negative" weight.
        slider2: Number used as the "neutral" weight.
        single_checkbox: Selects which of ``text1``/``text2`` is echoed.
        checkboxes: Iterable of strings appended to the echoed text.
        radio: Value embedded in the HTML button label.
        dropdown: Accepted for the interface signature; unused.
        im1: Image array returned flipped vertically; ``None`` falls back to
            the bundled cheetah image path.
        im2: Accepted for the interface signature; unused.
        im3: Accepted for the interface signature; unused.
        im4: Accepted for the interface signature; unused.
        video: Returned as-is; ``None`` falls back to the bundled video path.
        audio1: Pair whose second element is returned flipped; ``None`` falls
            back to the bundled audio path.
        audio2: Accepted for the interface signature; unused.
        file: Accepted for the interface signature; unused.
        df1: Returned unchanged as the first dataframe output.
        df2: Returned unchanged as the timeseries output.

    Returns:
        A 13-tuple in output order: text, label scores, audio, image, video,
        two highlighted-text lists, the LOINC matches as a list of records,
        an HTML snippet, the titanic CSV path, ``df1``, a random 4x4 integer
        array, and ``df2``.
    """
    # --- LOINC -------------------------------------------------------------
    loinc_matches = datasetLOINC.filter(
        lambda example: example["Description"].startswith("Allergy"))

    # Drop only columns that actually exist and are not in the keep-list; a
    # symmetric difference would also try to remove keep-columns that the
    # dataset does not have.
    surplus_columns = [
        name for name in loinc_matches.column_names
        if name not in _LOINC_COLUMNS_TO_KEEP
    ]
    if surplus_columns:
        loinc_matches = loinc_matches.remove_columns(surplus_columns)
    loinc_matches.set_format("pandas")
    loinc_df = loinc_matches[:]

    # Embeddings are computed but not consumed yet.
    # TODO: add a FAISS index on the "embeddings" column
    # (embeddings_dataset.add_faiss_index) and query it with the embedded
    # search term via get_nearest_examples(..., k=5).
    embeddings_dataset = _embed_clinical_text(loinc_df)  # noqa: F841

    # --- SNOMED and CQM ----------------------------------------------------
    snomed_matches = datasetSNOMED.filter(
        lambda example: example["Description"].startswith("Hospital"))
    cqm_matches = dataseteCQM.filter(
        lambda example: example["Description"].startswith("Telephone"))

    print(loinc_matches)
    print(snomed_matches)
    print(cqm_matches)

    base_dir = os.path.dirname(__file__)

    if audio1 is not None:
        audio_out = (audio1[0], np.flipud(audio1[1]))
    else:
        audio_out = os.path.join(base_dir, "files/cantina.wav")

    image_out = np.flipud(im1) if im1 is not None else os.path.join(base_dir, "files/cheetah1.jpg")
    video_out = video if video is not None else os.path.join(base_dir, "files/world.mp4")

    tagged_tokens = [
        ("The", "art"),
        ("quick brown", "adj"),
        ("fox", "nn"),
        ("jumped", "vrb"),
        ("testing testing testing", None),
        ("over", "prp"),
        ("the", "art"),
        ("testing", None),
        ("lazy", "adj"),
        ("dogs", "nn"),
        (".", "punc"),
    ] + [(f"test {x}", f"test {x}") for x in range(10)]

    scored_tokens = [
        ("The testing testing testing", None),
        ("over", 0.6),
        ("the", 0.2),
        ("testing", None),
        ("lazy", -0.1),
        ("dogs", 0.4),
        (".", 0),
    ] + [("test", x / 10) for x in range(-10, 10)]

    # Serialize through pandas so the JSON output receives plain records;
    # Dataset.to_json(path_or_buf="None") would write a file named "None"
    # and return a byte count instead.
    loinc_records = json.loads(loinc_df.to_json(orient="records"))

    return (
        (text1 if single_checkbox else text2)
        + ", selected:"
        + ", ".join(checkboxes),  # Text
        _label_confidences(num, slider1, slider2),  # Label
        audio_out,  # Audio
        image_out,  # Image
        video_out,  # Video
        tagged_tokens,  # HighlightedText
        scored_tokens,  # HighlightedText
        loinc_records,  # JSON
        f"<button style='background-color: red'>Click Me: {radio}</button>",  # HTML
        os.path.join(base_dir, "files/titanic.csv"),  # File
        df1,  # Dataframe
        np.random.randint(0, 10, (4, 4)),  # Dataframe
        df2,  # Timeseries
    )
# Directory containing this script; the bundled example media live in "files/".
_HERE = os.path.dirname(__file__)

# One example row, in the same order as the interface inputs below.
_EXAMPLE_ROW = [
    "Allergy",
    "Admission",
    10,
    12,
    4,
    True,
    ["SNOMED", "LOINC", "CQM"],
    "SNOMED",
    "bar",
    os.path.join(_HERE, "files/cheetah1.jpg"),
    os.path.join(_HERE, "files/cheetah1.jpg"),
    os.path.join(_HERE, "files/cheetah1.jpg"),
    os.path.join(_HERE, "files/cheetah1.jpg"),
    os.path.join(_HERE, "files/world.mp4"),
    os.path.join(_HERE, "files/cantina.wav"),
    os.path.join(_HERE, "files/cantina.wav"),
    os.path.join(_HERE, "files/titanic.csv"),
    [[1, 2, 3], [3, 4, 5]],
    os.path.join(_HERE, "files/time.csv"),
]

# The interface wires `fn` to 19 input components and 13 output components;
# the component order must match fn's parameter order and return tuple.
demo = gr.Interface(
    fn,
    inputs=[
        gr.Textbox(value="Allergy", label="Textbox"),
        gr.Textbox(lines=3, value="Bathing", placeholder="Type here..", label="Textbox 2"),
        gr.Number(label="Number", value=42),
        gr.Slider(10, 20, value=15, label="Slider: 10 - 20"),
        gr.Slider(maximum=20, step=0.04, label="Slider: step @ 0.04"),
        gr.Checkbox(label="Check for NER Match on Submit"),
        gr.CheckboxGroup(label="Clinical Terminology to Check", choices=CHOICES, value=CHOICES[0:2]),
        gr.Radio(label="Preferred Terminology Output", choices=CHOICES, value=CHOICES[2]),
        gr.Dropdown(label="Dropdown", choices=CHOICES),
        gr.Image(label="Image"),
        gr.Image(label="Image w/ Cropper"),
        gr.Image(label="Sketchpad"),
        gr.Image(label="Webcam", source="webcam"),
        gr.Video(label="Video"),
        gr.Audio(label="Audio"),
        gr.Audio(label="Microphone", source="microphone"),
        gr.File(label="File"),
        gr.Dataframe(label="Filters", headers=["Name", "Age", "Gender"]),
        gr.Timeseries(x="time", y=["price", "value"], colors=["pink", "purple"]),
    ],
    outputs=[
        gr.Textbox(label="Textbox"),
        gr.Label(label="Label"),
        gr.Audio(label="Audio"),
        gr.Image(label="Image"),
        gr.Video(label="Video"),
        gr.HighlightedText(label="HighlightedText", color_map={"punc": "pink", "test 0": "blue"}),
        gr.HighlightedText(label="HighlightedText", show_legend=True),
        gr.JSON(label="JSON"),
        gr.HTML(label="HTML"),
        gr.File(label="File"),
        gr.Dataframe(label="Dataframe"),
        gr.Dataframe(label="Numpy"),
        gr.Timeseries(x="time", y=["price", "value"], label="Timeseries"),
    ],
    # The same example row is listed three times.
    examples=[_EXAMPLE_ROW] * 3,
    theme="default",
    title="⚗️🧠🔬🧬 Clinical Terminology Auto Mapper AI 👩⚕️🩺⚕️🙋",
    cache_examples=False,
    description="Clinical Terminology Auto Mapper AI",
    article="Learn more at [Yggdrasil](https://github.com/AaronCWacker/Yggdrasil)",
    # live=True,
)
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)