Commit 27d40b9
ACMCMC committed
Parent(s): 7833461

UI Changes
Files changed:
- app.py +72 -24
- database.ipynb +83 -2
- utils.py +7 -11
app.py
CHANGED
@@ -3,58 +3,106 @@ from streamlit_agraph import agraph, Node, Edge, Config
 import os
 from sqlalchemy import create_engine, text
 import pandas as pd
+import time
+from utils import (
+    get_all_diseases_name,
+    get_most_similar_diseases_from_uri,
+    get_uri_from_name,
+    get_diseases_related_to_a_textual_description,
+    get_similarities_among_diseases_uris,
+    augment_the_set_of_diseaces,
+    get_clinical_trials_related_to_diseases,
+    get_clinical_records_by_ids
+)
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer


+begin = st.container()
+
+
+username = "demo"
+password = "demo"
+hostname = os.getenv("IRIS_HOSTNAME", "localhost")
+port = "1972"
+namespace = "USER"
 CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
 engine = create_engine(CONNECTION_STRING)

+begin.write("# Klìnic")
+
+description_input = begin.text_input(
+    label="Enter the disease description 👇",
+    placeholder="A disease that causes memory loss and other cognitive impairments.",
+)
+if begin.button("Analyze 🔎"):
     # 1. Embed the textual description that the user entered using the model
     # 2. Get 5 diseases with the highest cosine silimarity from the DB
     encoder = SentenceTransformer("allenai-specter")
+    diseases_related_to_the_user_text = get_diseases_related_to_a_textual_description(
+        description_input, encoder
+    )
+    # for disease_label in diseases_related_to_the_user_text:
     #     st.text(disease_label)
     # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
+    diseases_uris = [disease["uri"] for disease in diseases_related_to_the_user_text]
     get_similarities_among_diseases_uris(diseases_uris)
     print(diseases_related_to_the_user_text)
     # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
     # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
+    augmented_set_of_diseases = augment_the_set_of_diseaces(diseases_uris)
+    print(augmented_set_of_diseases)
     # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
+    clinical_trials_related_to_the_diseases = get_clinical_trials_related_to_diseases(
+        augmented_set_of_diseases, encoder
+    )
+    print(f'clinical_trials_related_to_the_diseases: {clinical_trials_related_to_the_diseases}')
+    json_of_clinical_trials = get_clinical_records_by_ids(
+        [trial["nct_id"] for trial in clinical_trials_related_to_the_diseases]
+    )
+    print(f'json_of_clinical_trials: {json_of_clinical_trials}')
     # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
     # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
+    graph_of_diseases = agraph(
+        nodes=[
+            Node(id="A", label="Node A", size=10),
+            Node(id="B", label="Node B", size=10),
+            Node(id="C", label="Node C", size=10),
+            Node(id="D", label="Node D", size=10),
+            Node(id="E", label="Node E", size=10),
+            Node(id="F", label="Node F", size=10),
+            Node(id="G", label="Node G", size=10),
+            Node(id="H", label="Node H", size=10),
+            Node(id="I", label="Node I", size=10),
+            Node(id="J", label="Node J", size=10),
+        ],
+        edges=[
+            Edge(source="A", target="B"),
+            Edge(source="B", target="C"),
+            Edge(source="C", target="D"),
+            Edge(source="D", target="E"),
+            Edge(source="E", target="F"),
+            Edge(source="F", target="G"),
+            Edge(source="G", target="H"),
+            Edge(source="H", target="I"),
+            Edge(source="I", target="J"),
+        ],
+        config=Config(height=500, width=500),
+    )
     # TODO: also when user clicks enter

+    begin.write(":red[Here should be the graph]") # TODO remove
     chart_data = pd.DataFrame(
         np.random.randn(20, 3), columns=["a", "b", "c"]
     ) # TODO remove
+    begin.scatter_chart(chart_data) # TODO remove

+    begin.write("## Disease Overview")
     disease_overview = ":red[lorem ipsum]" # TODO
+    begin.write(disease_overview)

+    begin.write("## Clinical Trials Details")
     trials = []
     # TODO replace mock data
     with open("mock_trial.json") as f:
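The numbered comments in app.py lay out the retrieval pipeline: embed the user's free-text description, rank diseases by cosine similarity of their embeddings, filter and augment that set, then fetch related clinical trials. A minimal sketch of steps 1-4 without the IRIS database follows; the in-memory dictionary, the example URIs, and the similarity threshold are illustrative, and only the encoder name comes from the code above.

# Sketch of steps 1-4 from app.py, using an in-memory stand-in for Test.EntityEmbeddings.
import numpy as np
from sentence_transformers import SentenceTransformer

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    # Cosine similarity between two embedding vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

encoder = SentenceTransformer("allenai-specter")

# Hypothetical disease embeddings keyed by URI (the real app reads these from IRIS)
disease_embeddings = {
    "http://example.org/disease/alzheimer": encoder.encode("Alzheimer disease, memory loss, dementia"),
    "http://example.org/disease/parkinson": encoder.encode("Parkinson disease, tremor, movement disorder"),
}

# Step 1: embed the textual description entered by the user
description = "A disease that causes memory loss and other cognitive impairments."
query = encoder.encode(description)

# Step 2: rank diseases by cosine similarity to the description
ranked = sorted(
    ((uri, cosine(query, emb)) for uri, emb in disease_embeddings.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Step 4: keep only the top matches that are similar enough (threshold is illustrative)
selected = [uri for uri, score in ranked[:5] if score > 0.8]
print(selected)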
database.ipynb
CHANGED
@@ -288,9 +288,90 @@
 },
 {
  "cell_type": "code",
+ "execution_count": 22,
  "metadata": {},
+ "outputs": [
+  {
+   "data": {
+    "text/html": [
+     "<div>\n",
+     "<style scoped>\n",
+     " .dataframe tbody tr th:only-of-type {\n",
+     " vertical-align: middle;\n",
+     " }\n",
+     "\n",
+     " .dataframe tbody tr th {\n",
+     " vertical-align: top;\n",
+     " }\n",
+     "\n",
+     " .dataframe thead th {\n",
+     " text-align: right;\n",
+     " }\n",
+     "</style>\n",
+     "<table border=\"1\" class=\"dataframe\">\n",
+     " <thead>\n",
+     " <tr style=\"text-align: right;\">\n",
+     " <th></th>\n",
+     " <th>desease_condition</th>\n",
+     " <th>embeddings</th>\n",
+     " <th>nct_id</th>\n",
+     " </tr>\n",
+     " </thead>\n",
+     " <tbody>\n",
+     " <tr>\n",
+     " <th>0</th>\n",
+     " <td>marijuana abuse, substance-related disorders, ...</td>\n",
+     " <td>-0.8323991298675537, 1.47855544090271, 0.00130...</td>\n",
+     " <td>NCT03055377</td>\n",
+     " </tr>\n",
+     " <tr>\n",
+     " <th>1</th>\n",
+     " <td>tuberculosis, latent tuberculosis, infections,...</td>\n",
+     " <td>-0.43443307280540466, 0.9625586271286011, -0.1...</td>\n",
+     " <td>NCT03042754</td>\n",
+     " </tr>\n",
+     " <tr>\n",
+     " <th>2</th>\n",
+     " <td>heart failure, heart diseases, cardiovascular ...</td>\n",
+     " <td>-0.5791705250740051, 0.13008448481559753, 0.13...</td>\n",
+     " <td>NCT03035123</td>\n",
+     " </tr>\n",
+     " <tr>\n",
+     " <th>3</th>\n",
+     " <td>lymphoma, neoplasms by histologic type, neopla...</td>\n",
+     " <td>-0.1608569175004959, 0.8489153981208801, -0.55...</td>\n",
+     " <td>NCT02272751</td>\n",
+     " </tr>\n",
+     " <tr>\n",
+     " <th>4</th>\n",
+     " <td>anemia, hematologic diseases</td>\n",
+     " <td>0.21379394829273224, 0.17073844373226166, -0.1...</td>\n",
+     " <td>NCT00931606</td>\n",
+     " </tr>\n",
+     " </tbody>\n",
+     "</table>\n",
+     "</div>"
+    ],
+    "text/plain": [
+     " desease_condition \\\n",
+     "0 marijuana abuse, substance-related disorders, ... \n",
+     "1 tuberculosis, latent tuberculosis, infections,... \n",
+     "2 heart failure, heart diseases, cardiovascular ... \n",
+     "3 lymphoma, neoplasms by histologic type, neopla... \n",
+     "4 anemia, hematologic diseases \n",
+     "\n",
+     " embeddings nct_id \n",
+     "0 -0.8323991298675537, 1.47855544090271, 0.00130... NCT03055377 \n",
+     "1 -0.43443307280540466, 0.9625586271286011, -0.1... NCT03042754 \n",
+     "2 -0.5791705250740051, 0.13008448481559753, 0.13... NCT03035123 \n",
+     "3 -0.1608569175004959, 0.8489153981208801, -0.55... NCT02272751 \n",
+     "4 0.21379394829273224, 0.17073844373226166, -0.1... NCT00931606 "
+    ]
+   },
+   "metadata": {},
+   "output_type": "display_data"
+  }
+ ],
  "source": [
   "# Load knowledge graph\n",
   "clinical_trials = pd.read_csv(\"clinical_trials_embeddings.csv\")\n",
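The cell output above shows the shape of clinical_trials_embeddings.csv: a desease_condition column with comma-separated condition labels, an embeddings column stored as a comma-separated string of floats, and an nct_id. A small sketch of turning that embeddings column back into numeric vectors; the column names and CSV path come from the output above, while the helper and the new embedding_vector column name are illustrative.

# Sketch: parse the comma-separated embedding strings shown above into numpy arrays.
import numpy as np
import pandas as pd

clinical_trials = pd.read_csv("clinical_trials_embeddings.csv")

def parse_embedding(value: str) -> np.ndarray:
    # Each cell looks like "-0.832..., 1.478..., 0.001..." (see the dataframe above)
    return np.array([float(x) for x in value.split(",")])

clinical_trials["embedding_vector"] = clinical_trials["embeddings"].apply(parse_embedding)
print(clinical_trials[["nct_id", "embedding_vector"]].head())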
utils.py
CHANGED
@@ -123,16 +123,16 @@ def get_similarities_among_diseases_uris(
     return data


+def augment_the_set_of_diseaces(diseases: List[str]) -> str:
+    print(diseases)
     for i in range(15-len(diseases)):
         with engine.connect() as conn:
             with conn.begin():
                 sql = f"""
                 SELECT TOP 1 e2.uri AS new_disease, (SUM(VECTOR_COSINE(e1.embedding, e2.embedding))/ {len(diseases)}) AS score
                 FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2
+                WHERE e1.uri IN ({','.join([f"'{disease}'" for disease in diseases])})
+                AND e2.uri NOT IN ({','.join([f"'{disease}'" for disease in diseases])})
                 AND e2.label != 'nan'
                 GROUP BY e2.label
                 ORDER BY score DESC
@@ -156,9 +156,7 @@ def get_diseases_related_to_a_textual_description(
 ) -> List[str]:
     # Embed the description using sentence-transformers
     description_embedding = get_embedding(description, encoder)
-    print(f"Size of the embedding: {len(description_embedding)}")
     string_representation = str(description_embedding.tolist())[1:-1]
-    print(f"String representation: {string_representation}")

     with engine.connect() as conn:
         with conn.begin():
@@ -172,27 +170,25 @@ def get_diseases_related_to_a_textual_description(

     return [{"uri": row[0], "distance": row[1]} for row in data]

+def get_clinical_trials_related_to_diseases(
     diseases: List[str], encoder
 ) -> List[str]:
     # Embed the diseases using sentence-transformers
     diseases_string = ", ".join(diseases)
     disease_embedding = get_embedding(diseases_string, encoder)
-    print(f"Size of the embedding: {len(disease_embedding)}")
     string_representation = str(disease_embedding.tolist())[1:-1]
-    print(f"String representation: {string_representation}")

     with engine.connect() as conn:
         with conn.begin():
             sql = f"""
+            SELECT TOP 5 d.nct_id, VECTOR_COSINE(d.embedding, TO_VECTOR('{string_representation}', DOUBLE)) AS distance
             FROM Test.ClinicalTrials d
             ORDER BY distance DESC
             """
             result = conn.execute(text(sql))
             data = result.fetchall()

+    return [{"nct_id": row[0], "distance": row[1]} for row in data]


 if __name__ == "__main__":
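get_clinical_trials_related_to_diseases interpolates the embedding string directly into the SQL and ranks Test.ClinicalTrials by VECTOR_COSINE against TO_VECTOR of that literal. A standalone sketch of the same lookup follows; it reuses the engine and get_embedding defined in utils.py, the SQL mirrors the pattern above, and the function name, k parameter, and score alias are illustrative.

# Sketch of a standalone top-k trial lookup following the SQL pattern in utils.py.
# Assumes `engine` and `get_embedding` are importable from utils.py as defined above.
from sqlalchemy import text
from utils import engine, get_embedding

def top_k_trials(description: str, encoder, k: int = 5) -> list:
    # Embed the query text and serialize it the same way utils.py does
    embedding = get_embedding(description, encoder)
    vector_literal = str(embedding.tolist())[1:-1]  # "v1, v2, ..." as in utils.py
    sql = f"""
        SELECT TOP {k} d.nct_id,
               VECTOR_COSINE(d.embedding, TO_VECTOR('{vector_literal}', DOUBLE)) AS score
        FROM Test.ClinicalTrials d
        ORDER BY score DESC
    """
    with engine.connect() as conn:
        rows = conn.execute(text(sql)).fetchall()
    return [{"nct_id": row[0], "score": row[1]} for row in rows]

Called as top_k_trials(description_input, encoder), this returns records of the same shape ({"nct_id": ..., ...}) that app.py passes on to get_clinical_records_by_ids.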