Upload run_on_your_notes_and_trials.ipynb
run_on_your_notes_and_trials.ipynb  +1174 -0
ADDED
@@ -0,0 +1,1174 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e88793d7-e431-47dd-9964-0a633b94062b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f29a5b89-3b48-4217-8dfc-cca8222e2d1e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/homes10/klkehl/miniconda3/envs/vllm2/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
"  from .autonotebook import tqdm as notebook_tqdm\n",
"2025-01-16 18:44:13,978\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
]
}
],
"source": [
"from vllm import LLM, SamplingParams\n",
"import pandas as pd\n",
"import numpy as np\n",
"import torch.nn.functional as F\n",
"import torch\n",
"from transformers import AutoTokenizer\n",
"from transformers import AutoModelForCausalLM\n",
"import re\n",
"import os\n",
"from transformers import pipeline, AutoModel\n",
"from torch.nn import functional as F\n",
"import torch.nn as nn\n",
"from torch.utils.data import DataLoader\n",
"from torch.nn import LSTM, Linear, Embedding, Conv1d, MaxPool1d, GRU, LSTMCell, Dropout, Module, Sequential, ReLU\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "779db7b2-7bdb-4dea-968e-6bec3b1c892c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d394de92-98cf-40e2-aa08-4e4f60f195bc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"device(type='cuda')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"device"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "889585e0-3c94-485a-932a-b1cec935b1b3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "a41f8e56-e779-4540-a484-a8ec622be396",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 3,
"id": "099b73f1-94ce-4908-8951-0041ede61ee8",
"metadata": {},
"outputs": [],
"source": [
"# here, pull in your raw patient clinical notes, imaging reports, and pathology reports\n",
"# your input file should contain at minimum columns like ['mrn', 'date', 'text']; one row per clinical document\n",
"# you can combine notes from multiple patients into one input file as long as there is an mrn field\n",
"# this notebook expects MRNs to be called 'dfci_mrn', dates to be called 'date', and clinical text to be called 'text', so rename your columns accordingly\n",
"#all_reports = pd.read_csv(\"your_patient_notes_file_here.csv\")\n"
]
},
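{
"cell_type": "code",
"execution_count": null,
"id": "added-load-notes-sketch",
"metadata": {},
"outputs": [],
"source": [
"# A minimal loading sketch (added for illustration; the filename and the source\n",
"# column names here are hypothetical -- substitute your own):\n",
"# my_notes = pd.read_csv('my_notes.csv')\n",
"# all_reports = my_notes.rename(columns={'mrn': 'dfci_mrn', 'note_date': 'date', 'note_text': 'text'})\n",
"# all_reports['date'] = pd.to_datetime(all_reports['date'])\n"
]
},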
{
"cell_type": "code",
"execution_count": 4,
"id": "02d6736c-5df1-4507-b5c4-b29ac6d8ba0e",
"metadata": {},
"outputs": [],
"source": [
"# this is how i pull reports for patients at dfci, commented out for public use\n",
"\n",
"# prefix = '/data/clin_notes_outcomes/pan_dfci_2024/derived_data/'\n",
"\n",
"# # pull in our large corpus of historical electronic health records data\n",
"# imaging = pd.read_parquet(prefix + 'all_imaging_reports.parquet')\n",
"# medonc = pd.read_parquet(prefix + 'all_clinical_notes.parquet')\n",
"# path = pd.read_parquet(prefix + 'all_path_reports.parquet')\n",
"\n",
"# all_reports = pd.concat([imaging, medonc, path], axis=0).sort_values(by=['dfci_mrn','date']).reset_index(drop=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a44f0c77-840f-455b-bb62-055b21493324",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"id": "aef31b68-98ac-4d51-a8d0-4adbcd2b42ff",
"metadata": {},
"outputs": [],
"source": [
"all_reports = all_reports.sort_values(by=['dfci_mrn','date']).reset_index(drop=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d4f43b9c-0e6e-4907-a786-c1ae82ce240c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 622 entries, 1627657 to 13607361\n",
"Data columns (total 9 columns):\n",
" #   Column         Non-Null Count  Dtype         \n",
"---  ------         --------------  -----         \n",
" 0   dfci_mrn       622 non-null    int64         \n",
" 1   date           622 non-null    datetime64[ns]\n",
" 2   text           622 non-null    object        \n",
" 3   scan_type      283 non-null    object        \n",
" 4   split          622 non-null    object        \n",
" 5   note_type      622 non-null    object        \n",
" 6   department     268 non-null    object        \n",
" 7   provider_type  268 non-null    object        \n",
" 8   path_type      71 non-null     object        \n",
"dtypes: datetime64[ns](1), int64(1), object(7)\n",
"memory usage: 48.6+ KB\n"
]
}
],
"source": [
"# these are the fields in the raw DFCI data, yours will differ\n",
"ten_sample_patients = all_reports.dfci_mrn.sample(n=10)\n",
"all_reports = all_reports[all_reports.dfci_mrn.isin(ten_sample_patients)]\n",
"all_reports.info()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fc5d06b4-8762-4462-8b1b-2cdbbd0a8cf3",
"metadata": {},
"outputs": [],
"source": [
"# the next set of cells extracts useful information from each clinical note in your dataset, yielding one long history document for each patient"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8828ae7b-7bbc-4aa7-afe1-ace1cf36df27",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2451805/729519135.py:37: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
"  themodel.load_state_dict(torch.load('./tiny_bert_tagger_synthetic.pt'))\n"
]
},
{
"data": {
"text/plain": [
"TagModel(\n",
"  (bert): BertModel(\n",
"    (embeddings): BertEmbeddings(\n",
"      (word_embeddings): Embedding(30522, 128, padding_idx=0)\n",
"      (position_embeddings): Embedding(512, 128)\n",
"      (token_type_embeddings): Embedding(2, 128)\n",
"      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)\n",
"      (dropout): Dropout(p=0.1, inplace=False)\n",
"    )\n",
"    (encoder): BertEncoder(\n",
"      (layer): ModuleList(\n",
"        (0-1): 2 x BertLayer(\n",
"          (attention): BertAttention(\n",
"            (self): BertSdpaSelfAttention(\n",
"              (query): Linear(in_features=128, out_features=128, bias=True)\n",
"              (key): Linear(in_features=128, out_features=128, bias=True)\n",
"              (value): Linear(in_features=128, out_features=128, bias=True)\n",
"              (dropout): Dropout(p=0.1, inplace=False)\n",
"            )\n",
"            (output): BertSelfOutput(\n",
"              (dense): Linear(in_features=128, out_features=128, bias=True)\n",
"              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)\n",
"              (dropout): Dropout(p=0.1, inplace=False)\n",
"            )\n",
"          )\n",
"          (intermediate): BertIntermediate(\n",
"            (dense): Linear(in_features=128, out_features=512, bias=True)\n",
"            (intermediate_act_fn): GELUActivation()\n",
"          )\n",
"          (output): BertOutput(\n",
"            (dense): Linear(in_features=512, out_features=128, bias=True)\n",
"            (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)\n",
"            (dropout): Dropout(p=0.1, inplace=False)\n",
"          )\n",
"        )\n",
"      )\n",
"    )\n",
"    (pooler): BertPooler(\n",
"      (dense): Linear(in_features=128, out_features=128, bias=True)\n",
"      (activation): Tanh()\n",
"    )\n",
"  )\n",
"  (prediction_heads): ModuleList(\n",
"    (0-8): 9 x Sequential(\n",
"      (0): Linear(in_features=128, out_features=128, bias=True)\n",
"      (1): ReLU()\n",
"      (2): Linear(in_features=128, out_features=1, bias=True)\n",
"    )\n",
"  )\n",
")"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid_tags_list = ['is_tagged','cancer_type','stage_at_diagnosis','treatment','cancer_burden','cancer_status','adverse_event','comorbidity','biomarker']\n",
"best_f1_thresholds = [-1.2996799,\n",
"                      1.8744006,\n",
"                      -0.90340906,\n",
"                      -1.3298296,\n",
"                      -1.3740511,\n",
"                      -0.97108084,\n",
"                      -1.0886533,\n",
"                      -1.9212211,\n",
"                      -0.7184834]\n",
"\n",
"\n",
"class TagModel(nn.Module):\n",
"\n",
"    def __init__(self, num_tags, device):\n",
"        super(TagModel, self).__init__()\n",
"\n",
"        self.bert = AutoModel.from_pretrained('prajjwal1/bert-tiny').to(device)\n",
"\n",
"        self.prediction_heads = nn.ModuleList([Sequential(Linear(128, 128), ReLU(), Linear(128,1)).to(device) for x in range(0, num_tags)])\n",
"\n",
"    def forward(self, x_text_tensor, x_attention_mask):\n",
"\n",
"        main = self.bert(x_text_tensor, x_attention_mask)\n",
"        # [CLS] embedding for each excerpt\n",
"        main = main.last_hidden_state[:,0,:].squeeze(1)\n",
"\n",
"        outputs = [x(main) for x in self.prediction_heads]\n",
"\n",
"        return outputs\n",
"\n",
"num_valid_tags = len(valid_tags_list)\n",
"themodel = TagModel(num_valid_tags, device)\n",
"themodel.load_state_dict(torch.load('./tiny_bert_tagger_synthetic.pt'))\n",
"themodel.to(device)\n",
"\n",
"themodel.eval()"
]
},
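{
"cell_type": "code",
"execution_count": null,
"id": "added-tagger-shape-sketch",
"metadata": {},
"outputs": [],
"source": [
"# A quick shape check (added sketch, not part of the original pipeline): run one\n",
"# toy excerpt through the tagger to see that it returns one logit tensor per tag\n",
"# head -- nine heads, each of shape [batch_size, 1].\n",
"# toy_tok = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')\n",
"# enc = toy_tok(['Patient with metastatic lung adenocarcinoma.'], padding='max_length', max_length=128, truncation=True, return_tensors='pt')\n",
"# with torch.no_grad():\n",
"#     logits = themodel(enc.input_ids.to(device), enc.attention_mask.to(device))\n",
"# print(len(logits), logits[0].shape)  # 9, torch.Size([1, 1])\n"
]
},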
{
"cell_type": "code",
"execution_count": 9,
"id": "e6922167-1ea6-4515-8a05-2c52d5e2715e",
"metadata": {},
"outputs": [],
"source": [
"from torch.utils import data\n",
"from transformers import AutoTokenizer\n",
"\n",
"class UnlabeledTagDataset(data.Dataset):\n",
"    def __init__(self, pandas_dataset, valid_tags_list):\n",
"        self.data = pandas_dataset.copy().reset_index(drop=True)\n",
"        self.indices = self.data.index.unique()\n",
"        self.tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny', max_length=128, truncation_side='left')\n",
"        self.valid_tags_list = valid_tags_list\n",
"\n",
"    def __len__(self):\n",
"        # how many notes in the dataset\n",
"        return len(self.indices)\n",
"\n",
"    def __getitem__(self, index):\n",
"        # get data for notes corresponding to indices passed\n",
"        this_index = self.indices[index]\n",
"        pand = self.data.loc[this_index, :]\n",
"\n",
"        encoded = self.tokenizer(pand['excerpt'], padding='max_length', max_length=128, truncation=True)\n",
"\n",
"        x_text_tensor = torch.tensor(encoded.input_ids, dtype=torch.long)\n",
"        x_attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)\n",
"\n",
"        return x_text_tensor, x_attention_mask\n",
"\n",
"def extract_relevant_text_from_patient(patient_frame_original, valid_tags_list, best_f1_thresholds, tagger_model):\n",
"    num_valid_tags = len(valid_tags_list)\n",
"    patient_frame = patient_frame_original.copy()\n",
"    patient_frame['date'] = pd.to_datetime(patient_frame.date)\n",
"    patient_frame = patient_frame.sort_values(by='date').reset_index()\n",
"    chunk_frames = []\n",
"    for i in range(0, patient_frame.shape[0]):\n",
"        # split each note into sentence-level excerpts\n",
"        chunks = re.sub(\"\\n|\\r\", \" \", patient_frame.iloc[i].text.strip())\n",
"        chunks = re.sub(r'\\s+', \" \", chunks)\n",
"        chunks = \"<excerpt break>\" + re.sub(\"\\\\. \", \"<excerpt break>\", chunks) + \"<excerpt break>\"\n",
"        chunks = pd.Series(chunks.split(\"<excerpt break>\")).str.strip()\n",
"        chunks = chunks[chunks != '']\n",
"\n",
"        chunk_frame = pd.DataFrame({'date':patient_frame.iloc[i].date, 'note_type':patient_frame.iloc[i].note_type, 'excerpt':chunks})\n",
"        chunk_frames.append(chunk_frame)\n",
"\n",
"    if len(chunk_frames) > 0:\n",
"        chunk_frames = pd.concat(chunk_frames, axis=0)\n",
"        chunk_frames = chunk_frames.drop_duplicates(subset=['excerpt'], keep='first')\n",
"\n",
"        no_shuffle_valid_dataset = data.DataLoader(UnlabeledTagDataset(chunk_frames, valid_tags_list), batch_size=32, shuffle=False, num_workers=0)\n",
"\n",
"        output_prediction_lists = [[] for x in range(num_valid_tags)]\n",
"        for batch in no_shuffle_valid_dataset:\n",
"            x_text_ids = batch[0].to(device)\n",
"            x_attention_mask = batch[1].to(device)\n",
"            with torch.no_grad():\n",
"                predictions = tagger_model(x_text_ids, x_attention_mask)\n",
"\n",
"            for j in range(num_valid_tags):\n",
"                output_prediction_lists[j].append(predictions[j].squeeze(1).detach().cpu().numpy())\n",
"\n",
"        output_prediction_lists = [np.concatenate(x) for x in output_prediction_lists]\n",
"\n",
"        output = chunk_frames.copy()\n",
"        for x in range(num_valid_tags):\n",
"            output['outcome_' + str(x) + '_logit'] = output_prediction_lists[x]\n",
"\n",
"        # keep only excerpts the 'is_tagged' head flags as containing relevant content\n",
"        output = output[output.outcome_0_logit > best_f1_thresholds[0]]\n",
"\n",
"        output = output.groupby(['date','note_type'])['excerpt'].agg('. '.join).reset_index()\n",
"        output = output[~output.excerpt.isnull()]\n",
"        output['date_text'] = output['date'].astype(str) + \" \" + output['note_type'] + \" \" + output['excerpt']\n",
"        return \"\\n\".join(output.date_text.tolist())\n",
"    else:\n",
"        return \"\"\n"
]
},
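{
"cell_type": "code",
"execution_count": null,
"id": "added-extraction-toy-example",
"metadata": {},
"outputs": [],
"source": [
"# A toy worked example (added sketch; the data below is synthetic, not from the\n",
"# original notebook). It exercises extract_relevant_text_from_patient on a\n",
"# two-note patient to show the output format: one newline-joined string of\n",
"# 'date note_type excerpt' lines (possibly empty if no excerpt passes threshold).\n",
"# toy_frame = pd.DataFrame({\n",
"#     'date': ['2020-01-01', '2020-02-01'],\n",
"#     'note_type': ['medonc', 'imaging'],\n",
"#     'text': ['Patient has stage IV lung adenocarcinoma. Started carboplatin.',\n",
"#              'CT chest shows decrease in size of lung mass.'],\n",
"# })\n",
"# print(extract_relevant_text_from_patient(toy_frame, valid_tags_list, best_f1_thresholds, themodel))\n"
]
},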
{
"cell_type": "code",
"execution_count": null,
"id": "3ce166f0-2d21-47a3-85f0-bb0d96e28dc5",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d1b9785a-2ecd-454d-a6be-26d106f0b827",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# this generates a data frame with one row per patient, and a patient_long_text column with a bunch of relevant text extracted from each patient's notes\n",
"\n",
"patient_list = []\n",
"unique_patients = all_reports.groupby('dfci_mrn').first().reset_index()[['dfci_mrn']]\n",
"for i in range(unique_patients.shape[0]):\n",
"    unique_patient = unique_patients.iloc[[i]]\n",
"    patient_frame = all_reports[all_reports.dfci_mrn == unique_patient.dfci_mrn.iloc[0]]\n",
"    if patient_frame.shape[0] > 0:\n",
"        # this next line is used for retrospective analysis to restrict input text to text predating a treatment start\n",
"        #patient_frame = patient_frame[pd.to_datetime(patient_frame.date) < patient_frame.treatment_start_date.iloc[0]]\n",
"        unique_patient['patient_long_text'] = extract_relevant_text_from_patient(patient_frame, valid_tags_list, best_f1_thresholds, themodel)\n",
"        patient_list.append(unique_patient)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "86428fae-c91c-4b1b-88da-344dec8d6074",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 10 entries, 0 to 9\n",
"Data columns (total 2 columns):\n",
" #   Column             Non-Null Count  Dtype \n",
"---  ------             --------------  ----- \n",
" 0   dfci_mrn           10 non-null     int64 \n",
" 1   patient_long_text  10 non-null     object\n",
"dtypes: int64(1), object(1)\n",
"memory usage: 240.0+ bytes\n"
]
}
],
"source": [
"long_histories = pd.concat(patient_list, axis=0)\n",
"long_histories.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "40398c74-1f19-40e0-aefa-f5116eca7d7f",
"metadata": {},
"outputs": [],
"source": [
"# now you have long histories for each patient\n",
"# delete tiny bert tagging model to make room on GPU for llama\n",
"del themodel"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77e81c05-2f29-4bff-9c06-3ff832c50e3b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0d98783-0f20-444f-8d14-cf059641bafb",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 13,
"id": "d1ce57c1-3932-4fd9-8def-db554535c914",
"metadata": {},
"outputs": [],
"source": [
"# now get ready to use llama to summarize patient histories and extract trial spaces"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d1dd378-4c37-4ade-92ac-7bd982d1355e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4f074561-80ae-44a0-8085-9cac394b80a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 01-16 18:49:14 awq_marlin.py:97] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.\n",
"INFO 01-16 18:49:14 config.py:905] Defaulting to use mp for distributed inference\n",
"WARNING 01-16 18:49:14 arg_utils.py:957] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False.\n",
"INFO 01-16 18:49:14 config.py:1021] Chunked prefill is enabled with max_num_batched_tokens=512.\n",
"INFO 01-16 18:49:14 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', speculative_config=None, tokenizer='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=120000, download_dir='../meta_ai/', load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4, num_scheduler_steps=1, chunked_prefill_enabled=True multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, mm_processor_kwargs=None)\n",
"WARNING 01-16 18:49:14 multiproc_gpu_executor.py:127] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.\n",
"WARNING 01-16 18:49:14 multiproc_gpu_executor.py:53] Reducing Torch parallelism from 32 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.\n",
"INFO 01-16 18:49:14 custom_cache_manager.py:17] Setting Triton cache manager to: vllm.triton_utils.custom_cache_manager:CustomCacheManager\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:49:19 multiproc_worker_utils.py:215] Worker ready; awaiting tasks\n",
"INFO 01-16 18:49:20 utils.py:1008] Found nccl from library libnccl.so.2\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:49:20 utils.py:1008] Found nccl from library libnccl.so.2\n",
"INFO 01-16 18:49:20 pynccl.py:63] vLLM is using nccl==2.20.5\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:49:20 pynccl.py:63] vLLM is using nccl==2.20.5\n",
"INFO 01-16 18:49:20 custom_all_reduce_utils.py:242] reading GPU P2P access cache from /homes10/klkehl/.cache/vllm/gpu_p2p_access_cache_for_2,3.json\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:49:20 custom_all_reduce_utils.py:242] reading GPU P2P access cache from /homes10/klkehl/.cache/vllm/gpu_p2p_access_cache_for_2,3.json\n",
"INFO 01-16 18:49:20 shm_broadcast.py:241] vLLM message queue communication handle: Handle(connect_ip='127.0.0.1', local_reader_ranks=[1], buffer=<vllm.distributed.device_communicators.shm_broadcast.ShmRingBuffer object at 0x7f4951076fc0>, local_subscribe_port=52437, remote_subscribe_port=None)\n",
"INFO 01-16 18:49:20 model_runner.py:1056] Starting to load model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4...\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:49:20 model_runner.py:1056] Starting to load model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4...\n",
"INFO 01-16 18:49:21 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:49:21 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading safetensors checkpoint shards:   0% Completed | 0/9 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards:  11% Completed | 1/9 [00:17<02:23, 17.88s/it]\n",
"Loading safetensors checkpoint shards:  22% Completed | 2/9 [01:02<03:57, 33.89s/it]\n",
"Loading safetensors checkpoint shards:  33% Completed | 3/9 [01:49<03:58, 39.79s/it]\n",
"Loading safetensors checkpoint shards:  44% Completed | 4/9 [02:36<03:32, 42.58s/it]\n",
"Loading safetensors checkpoint shards:  56% Completed | 5/9 [03:23<02:56, 44.20s/it]\n",
"Loading safetensors checkpoint shards:  67% Completed | 6/9 [03:56<02:00, 40.29s/it]\n",
"Loading safetensors checkpoint shards:  78% Completed | 7/9 [04:43<01:25, 42.53s/it]\n",
"Loading safetensors checkpoint shards:  89% Completed | 8/9 [05:30<00:43, 43.99s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 9/9 [06:16<00:00, 44.57s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 9/9 [06:16<00:00, 41.83s/it]\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 01-16 18:55:45 model_runner.py:1067] Loading model weights took 18.5818 GB\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:55:47 model_runner.py:1067] Loading model weights took 18.5807 GB\n",
"INFO 01-16 18:55:48 distributed_gpu_executor.py:57] # GPU blocks: 17638, # CPU blocks: 1638\n",
"INFO 01-16 18:55:48 distributed_gpu_executor.py:61] Maximum concurrency for 120000 tokens per request: 2.35x\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:55:53 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:55:53 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
"INFO 01-16 18:55:53 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
"INFO 01-16 18:55:53 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
"INFO 01-16 18:56:21 custom_all_reduce.py:233] Registering 5635 cuda graph addresses\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:56:21 custom_all_reduce.py:233] Registering 5635 cuda graph addresses\n",
"\u001b[1;36m(VllmWorkerProcess pid=2453819)\u001b[0;0m INFO 01-16 18:56:21 model_runner.py:1523] Graph capturing finished in 29 secs.\n",
"INFO 01-16 18:56:21 model_runner.py:1523] Graph capturing finished in 28 secs.\n"
]
}
],
"source": [
"# load llama\n",
"# modify this depending on your GPU setup and where you want to download the LLM\n",
"# requires vllm\n",
"import os\n",
"from vllm import LLM\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"2,3\"\n",
"llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, download_dir = \"../meta_ai/\", gpu_memory_utilization=0.80, max_model_len=120000)"
]
},
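{
"cell_type": "code",
"execution_count": null,
"id": "added-smaller-gpu-alternative",
"metadata": {},
"outputs": [],
"source": [
"# A hedged alternative (added note, not from the original notebook): if two large\n",
"# GPUs are unavailable, a smaller instruction-tuned model can stand in for the\n",
"# 70B model, at some cost in summary quality. Same vLLM API, single GPU, and a\n",
"# shorter context window, so the truncation budget below would need lowering too.\n",
"# llama = LLM(model='meta-llama/Llama-3.1-8B-Instruct', tensor_parallel_size=1,\n",
"#             gpu_memory_utilization=0.80, max_model_len=32768)\n"
]
},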
{
"cell_type": "code",
"execution_count": null,
"id": "2a2882e7-a180-48e1-a50b-ec95730e08ce",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "485300a3-4661-4db5-b656-7aad11e9e8d3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3eb4a246-27a7-4fa9-915d-17f463e23171",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"id": "418b71f0-9c4a-4edc-b810-23db5b4e40f4",
"metadata": {},
"outputs": [],
"source": [
"# generate summaries for our patients"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ba00cbee-7b60-44c4-95e9-9da113de6de4",
"metadata": {},
"outputs": [],
"source": [
"def summarize_patients(patient_texts, llama_model):\n",
"\n",
"    tokenizer = llama_model.get_tokenizer()\n",
"\n",
"    prompts = []\n",
"    for the_patient in patient_texts:\n",
"\n",
"        patient_text_tokens = tokenizer(the_patient, add_special_tokens=False).input_ids\n",
"        # if a history exceeds the context budget, keep the first and last 57,500 tokens\n",
"        if len(patient_text_tokens) > 115000:\n",
"            first_part = patient_text_tokens[:57500]\n",
"            # Slice the last `slice_size` elements\n",
"            last_part = patient_text_tokens[-57500:]\n",
"            # Concatenate the two slices and decode back to text, so the truncated\n",
"            # version is what actually goes into the prompt below\n",
"            patient_text_tokens = first_part + last_part\n",
"            the_patient = tokenizer.decode(patient_text_tokens)\n",
"\n",
"        messages = [{'role':'system', 'content': \"\"\"You are an experienced clinical oncology history summarization bot.\n",
"        Your job is to construct a summary of the cancer history for a patient based on an excerpt of the patient's electronic health record. The text in the excerpt is provided in chronological order. \n",
"        Document the cancer type/primary site (eg breast cancer, lung cancer, etc); histology (eg adenocarcinoma, squamous carcinoma, etc); current extent (localized, advanced, metastatic, etc); biomarkers (genomic results, protein expression, etc); and treatment history (surgery, radiation, chemotherapy/targeted therapy/immunotherapy, etc, including start and stop dates and best response if known).\n",
"        Do not consider localized basal cell or squamous carcinomas of the skin, or colon polyps, to be cancers for your purposes.\n",
"        Do not include the patient's name, but do include relevant dates whenever documented, including dates of diagnosis and start/stop dates of each treatment.\n",
"        If a patient has a history of more than one cancer, document the cancers one at a time.\n",
"        \"\"\"}, \n",
"            {'role':'user', 'content': \"The excerpt is:\\n\" + the_patient + \"\"\"Now, write your summary. Do not add preceding text before the abstraction, and do not add notes or commentary afterwards. This will not be used for clinical care, so do not write any disclaimers or cautionary notes.\"\"\"}\n",
"        ]\n",
"\n",
"        prompts.append(messages)\n",
"\n",
"    newprompts = []\n",
"    for messages in prompts:\n",
"        template_prompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)\n",
"        newprompts.append(template_prompt)\n",
"\n",
"    responses = llama_model.generate(\n",
"        newprompts, \n",
"        SamplingParams(\n",
"            temperature=0.0,\n",
"            top_p=0.2,\n",
"            max_tokens=4096,\n",
"            repetition_penalty=1.2,\n",
"            stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # stop at Llama 3 end-of-turn\n",
"        ))\n",
"\n",
"    response_texts = [x.outputs[0].text for x in responses]\n",
"\n",
"    return responses, response_texts\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "7de591f3-154f-4991-83db-0b45979e856a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|█████████| 10/10 [00:47<00:00,  4.71s/it, est. speed input: 925.94 toks/s, output: 42.00 toks/s]\n"
]
}
],
"source": [
"long_histories['patient_summary'] = summarize_patients(long_histories.patient_long_text.tolist(), llama)[1]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "9605f785-8667-441f-838c-5dcf9eac19a9",
"metadata": {},
"outputs": [],
"source": [
"# now we turn attention to the clinical trials we want to match against\n",
"# assume you have a dataset of trials, each with an eligibility_criteria text field as from clinicaltrials.gov\n",
"# here, i just used a download from ct.gov for trials relating to cancer\n",
"trials = pd.read_csv('ctgov_cancer_trials.csv')"
]
},
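{
"cell_type": "code",
"execution_count": null,
"id": "added-fillna-guard",
"metadata": {},
"outputs": [],
"source": [
"# Added guard (illustrative, not in the original pipeline): any NaN among the\n",
"# three text fields would make the concatenated trial_text in the next cell NaN\n",
"# for that trial, so fill missing fields with empty strings first.\n",
"for col in ['title', 'brief_summary', 'eligibility_criteria']:\n",
"    trials[col] = trials[col].fillna('')\n"
]
},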
{
"cell_type": "code",
"execution_count": 19,
"id": "b13620e6-cdde-4668-a992-1e8fddf4ab8a",
"metadata": {},
"outputs": [],
"source": [
"# ultimately you want to have a raw trial_text field that combines the trial title, summary, and eligibility criteria text from ct.gov\n",
"trials['trial_text'] = trials['title'] + \"\\n\" + trials['brief_summary'] + \"\\n\" + trials['eligibility_criteria']"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ee5d88da-8867-4aae-892a-799dbffeb8c9",
"metadata": {},
"outputs": [],
"source": [
"# now summarize the trials of interest to you based on the trial_text field\n",
"def summarize_trials_multi_cohort(eligibility_texts, llama_model):\n",
"\n",
"    tokenizer = llama_model.get_tokenizer()\n",
"    prompts = []\n",
"    for trial in eligibility_texts:\n",
"        messages = [\n",
"            {'role':'system', 'content': \"\"\"You are an expert clinical oncologist with an encyclopedic knowledge of cancer and its treatments.\n",
"            Your job is to review a clinical trial document and extract a list of structured clinical spaces that are eligible for that trial.\n",
"            A clinical space is defined as a unique combination of cancer primary site, histology, which treatments a patient must have received, which treatments a patient must not have received, cancer burden (eg presence of metastatic disease), and tumor biomarkers (such as germline or somatic gene mutations or alterations, or protein expression on tumor) that a patient must have or must not have; that renders a patient eligible for the trial.\n",
"            Trials often specify that a particular treatment is excluded only if it was given within a short period of time, for example 14 days, one month, etc, prior to trial start. Do not include this type of time-specific treatment eligibility criteria in your output at all.\n",
"            Some trials have only one space, while others have several. Do not output a space that contains multiple cancer types and/or histologies. Instead, generate separate spaces for each cancer type/histology combination.\n",
"            For biomarkers, if the trial specifies whether the biomarker will be assessed during screening, note that.\n",
"            Spell out cancer types; do not abbreviate them. For example, write \"non-small cell lung cancer\" rather than \"NSCLC\".\n",
"            Structure your output like this, as a list of spaces, with spaces separated by newlines, as below:\n",
"            1. Cancer type allowed: <cancer_type_allowed>. Histology allowed: <histology_allowed>. Cancer burden allowed: <cancer_burden_allowed>. Prior treatment required: <prior_treatments_required>. Prior treatment excluded: <prior_treatments_excluded>. Biomarkers required: <biomarkers_required>. Biomarkers excluded: <biomarkers_excluded>.\n",
"            2. Cancer type allowed: <cancer_type_allowed>, etc.\n",
"            If a particular concept is not mentioned in the trial text, do not include it in your definition of trial space(s).\n",
"            \"\"\"}, \n",
"\n",
"            {'role':'user', 'content': \"Here is a clinical trial document: \\n\" + trial + \"\\n\" + \"\"\"Now, generate your list of the trial space(s), formatted as above.\n",
"            Do not provide any introductory, explanatory, concluding, or disclaimer text.\n",
"            Reminder: Treatment history is an important component of trial space definitions, but treatment history requirements that are described as applying only in a given period of time prior to trial treatment MUST BE IGNORED.\"\"\"\n",
"            }\n",
"        ]\n",
"\n",
"        prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))\n",
"\n",
"    responses = llama_model.generate(\n",
"        prompts, \n",
"        SamplingParams(\n",
"            temperature=0.0,\n",
"            top_p=0.9,\n",
"            max_tokens=3096,\n",
"            stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # stop at Llama 3 end-of-turn\n",
"        ))\n",
"\n",
"    response_texts = [x.outputs[0].text for x in responses]\n",
"\n",
"    return responses, response_texts"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "f05c9d94-9190-4d29-bcf8-0800fbb37f42",
"metadata": {},
"outputs": [],
"source": [
"# this runs the trial summarization/space extraction\n",
"# i have a premade trial spaces file, so this is commented out\n",
"# trials['spaces'] = summarize_trials_multi_cohort(trials.trial_text.tolist(), llama)[1]\n",
"#trials.to_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "14d40b23-fcfa-4fe6-a0ff-c58d9750cab2",
"metadata": {},
"outputs": [],
"source": [
"trials = pd.read_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8ee87e63-ef82-43a5-b243-0de4561a8bd0",
"metadata": {},
"outputs": [],
"source": [
"# now parse the extracted trial spaces to get one row per space (can be one or more rows per trial)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "9cc06840-5647-4524-a7bf-a1ad53a07b7c",
"metadata": {},
"outputs": [],
"source": [
"frames = []\n",
"for i in range(trials.shape[0]):\n",
"    cohorts = pd.Series(trials.iloc[i].spaces.split(\"\\n\"))\n",
"    cohorts = cohorts[~((cohorts.isnull()) | (cohorts == \"\\n\") | (cohorts == ''))].reset_index(drop=True)\n",
"    frame = pd.DataFrame(np.repeat(trials.iloc[[i]], len(cohorts), axis=0), columns=trials.columns)\n",
"    frame['this_space'] = cohorts\n",
"    frame['space_number'] = frame.index\n",
"    frames.append(frame)\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "541669eb-f92e-49f3-9a36-b6625448c1a4",
"metadata": {},
"outputs": [],
"source": [
"cohort_level_trials = pd.concat(frames, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "51a04e84-7483-4398-b4a0-d0cdab790609",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 38276 entries, 0 to 0\n",
"Data columns (total 10 columns):\n",
" #   Column                Non-Null Count  Dtype \n",
"---  ------                --------------  ----- \n",
" 0   Unnamed: 0.1          38276 non-null  object\n",
" 1   Unnamed: 0            38276 non-null  object\n",
" 2   nct_id                38276 non-null  object\n",
" 3   title                 38276 non-null  object\n",
" 4   brief_summary         38276 non-null  object\n",
" 5   eligibility_criteria  38276 non-null  object\n",
" 6   trial_text            38276 non-null  object\n",
" 7   spaces                38276 non-null  object\n",
" 8   this_space            38276 non-null  object\n",
" 9   space_number          38276 non-null  int64 \n",
"dtypes: int64(1), object(9)\n",
"memory usage: 3.2+ MB\n"
]
}
],
"source": [
"cohort_level_trials.info()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "648f0e1e-ef81-4983-8f03-1fbdb138f649",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"this_space\n",
"True     38140\n",
"False      136\n",
"Name: count, dtype: int64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# keep only lines that look like numbered space definitions\n",
"cohort_level_trials.this_space.str[0].isin(['1','2','3','4','5','6','7','8','9']).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "9ea048c1-c4ef-4202-a9be-a4658c4f1058",
"metadata": {},
"outputs": [],
"source": [
"trial_spaces = cohort_level_trials[cohort_level_trials.this_space.str[0].isin(['1','2','3','4','5','6','7','8','9'])]"
]
},
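{
"cell_type": "code",
"execution_count": null,
"id": "added-strip-enumeration-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Optional cleanup sketch (added; the premade spaces file loaded below is used\n",
"# as-is): strip the leading '1. ', '2. ' enumeration from each space string so\n",
"# downstream text contains only the space definition itself.\n",
"# trial_spaces['this_space'] = trial_spaces['this_space'].str.replace(r'^\\d+\\.\\s*', '', regex=True)\n"
]
},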
{
"cell_type": "code",
"execution_count": null,
"id": "852aee9d-ad97-4374-932f-6cae378dde2a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 29,
"id": "00d2220a-627a-4b67-be28-c42561c3c964",
"metadata": {},
"outputs": [],
"source": [
"# if you want to save the extracted individual trial 'spaces' do this\n",
"#trial_spaces.to_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "d8d351c0-26fd-47f5-98f7-843198909733",
"metadata": {},
"outputs": [],
"source": [
"# this trial dataframe now has one row per trial 'space'; i have pre-generated it\n",
"trial_spaces = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd726671-8517-47f7-a306-fd28ae0ce25d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 31,
"id": "70dd7f46-8edb-4ff2-81d0-b94e64816ac5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.23it/s]\n"
]
}
],
"source": [
"# now embed patients and trial spaces\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"# lazily using cpu here\n",
"embedding_model = SentenceTransformer('ksg-dfci/TrialSpace', trust_remote_code=True, device='cpu')\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "9a08f0ae-ee9b-475c-b2af-c46be89d71d5",
"metadata": {},
"outputs": [],
"source": [
"with torch.no_grad():\n",
"    patient_embeddings = embedding_model.encode(long_histories.patient_summary, convert_to_tensor=True)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "aad4c966-f70a-48f7-b58c-d0932f9a9010",
"metadata": {},
"outputs": [],
"source": [
"# here's where we embed trial spaces\n",
"# this only needs to be run once to generate and save trial embeddings, or for a short list of trials you can run it every time\n",
"# here it is commented out, since I'll just load the previously generated embeddings\n",
"\n",
"# with torch.no_grad():\n",
"#     trial_space_embeddings = embedding_model.encode(trial_spaces.this_space.tolist(), convert_to_tensor=True)\n",
"\n",
"# from safetensors.torch import save_file\n",
"# output_trial_file = {\"space_embeddings\": trial_space_embeddings}\n",
"# save_file(output_trial_file, \"trial_space_embeddings.safetensors\")\n",
"\n",
"# trial_space_embeddings.shape"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "3690cf7b-7df8-46eb-86c0-1e9efdaa1f43",
"metadata": {},
"outputs": [],
"source": [
"# load trial space embeddings, should have same number of embeddings as there are in the trial spaces dataset\n",
"from safetensors import safe_open\n",
"with safe_open(\"trial_space_embeddings.safetensors\", framework=\"pt\", device='cpu') as f:\n",
"    trial_space_embeddings = f.get_tensor(\"space_embeddings\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "7f1d5a59-d458-4cb8-8015-492ca1e31de5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(torch.Size([38140, 1024]), (38140, 10))"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trial_space_embeddings.shape, trial_spaces.shape"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "f541566e-1027-4de3-b5d9-6a04b1371cd2",
"metadata": {},
"outputs": [],
"source": [
"# now let's find the top ten trial 'spaces' for each patient based on cosine similarity\n",
"\n",
"output_list = []\n",
"for i, patient_summary in enumerate(long_histories.patient_summary):\n",
"    patient_embedding = patient_embeddings[i, :]\n",
"    similarities = F.cosine_similarity(patient_embedding, trial_space_embeddings)\n",
"    sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)\n",
"    relevant_spaces = trial_spaces.iloc[sorted_indices[0:10].cpu().numpy()]\n",
"    output = pd.DataFrame({'patient_summary':patient_summary, 'this_space':relevant_spaces.this_space, 'nct_id':relevant_spaces.nct_id, \n",
"                           'trial_title':relevant_spaces.title, 'trial_brief_summary':relevant_spaces.brief_summary,\n",
"                           'trial_text':relevant_spaces.trial_text})\n",
"    output_list.append(output)\n",
"\n",
"output = pd.concat(output_list, axis=0).reset_index(drop=True)\n",
"output['patient_summary'] = output.patient_summary.str.strip()"
]
},
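{
"cell_type": "code",
"execution_count": null,
"id": "added-topk-alternative",
"metadata": {},
"outputs": [],
"source": [
"# Equivalent retrieval with torch.topk (added sketch, same result as the sort\n",
"# above): when only the k best matches are needed, topk avoids fully sorting all\n",
"# ~38,000 similarity scores per patient.\n",
"# sims = F.cosine_similarity(patient_embeddings[0], trial_space_embeddings)\n",
"# top_scores, top_idx = torch.topk(sims, k=10)\n",
"# trial_spaces.iloc[top_idx.cpu().numpy()]\n"
]
},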
{
"cell_type": "code",
"execution_count": 37,
"id": "6664ebe1-34a6-4184-985b-ef13f4a39369",
"metadata": {},
"outputs": [],
"source": [
"# now run 'trial checker' classifier to double check the top (10) matches we have pulled"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "ceaca6ce-f7af-4156-a185-2b506f57e469",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n"
]
}
],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"roberta-large\")\n",
"\n",
"checker_pipe = pipeline(\n",
"    'text-classification', \n",
"    'ksg-dfci/TrialChecker', \n",
"    tokenizer=tokenizer, \n",
"    truncation=True, \n",
"    padding='max_length', \n",
"    max_length=512\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "9d9bc3cf-9f2f-41a2-a15f-0b552a4007c6",
"metadata": {},
"outputs": [],
"source": [
"output['pt_trial_pair'] = (output['this_space'] + \"\\nNow here is the patient summary:\" + output['patient_summary'])\n",
"\n",
"classifier_results = checker_pipe(output['pt_trial_pair'].tolist())\n",
"output['trial_checker_result'] = [x['label'] for x in classifier_results]\n",
"output['trial_checker_score'] = [x['score'] for x in classifier_results]"
]
},
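{
"cell_type": "code",
"execution_count": null,
"id": "added-checker-filter-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Downstream filtering sketch (added; the positive label string below is an\n",
"# assumption -- inspect classifier_results to confirm the label names this\n",
"# checkpoint actually emits):\n",
"# plausible = output[output.trial_checker_result == 'POSITIVE']\n",
"# plausible = plausible.sort_values('trial_checker_score', ascending=False)\n"
]
},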
{
"cell_type": "code",
"execution_count": 40,
"id": "450939bd-44cc-4dc7-900a-255a46f2c3ab",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 100 entries, 0 to 99\n",
"Data columns (total 9 columns):\n",
" #   Column                Non-Null Count  Dtype  \n",
"---  ------                --------------  -----  \n",
" 0   patient_summary       100 non-null    object \n",
" 1   this_space            100 non-null    object \n",
" 2   nct_id                100 non-null    object \n",
" 3   trial_title           100 non-null    object \n",
" 4   trial_brief_summary   100 non-null    object \n",
" 5   trial_text            100 non-null    object \n",
" 6   pt_trial_pair         100 non-null    object \n",
" 7   trial_checker_result  100 non-null    object \n",
" 8   trial_checker_score   100 non-null    float64\n",
"dtypes: float64(1), object(8)\n",
"memory usage: 7.2+ KB\n"
]
}
],
"source": [
"output.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f00270f2-7ce1-42e7-8f16-986acdde76f8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"output"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a905f88e-479e-4e4c-8492-827f7768d91c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}