In [1]:
import pandas as pd
import numpy as np
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '2'


In [2]:
# Table of trial "space" line items; each row carries the trial's nct_id plus the
# `this_space` text that is embedded and matched further down.
TRIAL_SPACES_CSV = 'ctgov_all_trials_trial_space_lineitems_10-31-24.csv'
trial_spaces = pd.read_csv(TRIAL_SPACES_CSV)

In [3]:
# Print column names, dtypes, non-null counts and memory usage of the loaded table.
trial_spaces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38140 entries, 0 to 38139
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0.1          38140 non-null  int64 
 1   Unnamed: 0            38140 non-null  int64 
 2   nct_id                38140 non-null  object
 3   title                 38140 non-null  object
 4   brief_summary         38140 non-null  object
 5   eligibility_criteria  38140 non-null  object
 6   trial_text            38140 non-null  object
 7   spaces                38140 non-null  object
 8   this_space            38140 non-null  object
 9   space_number          38140 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 2.9+ MB


In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Sentence-embedding model loaded from the local 'reranker_round2.model' path onto the GPU.
# NOTE(review): trust_remote_code=True allows custom code shipped with the checkpoint
# to run at load time -- keep it only for checkpoints from a trusted source.
embedding_model = SentenceTransformer(
    'reranker_round2.model',
    trust_remote_code=True,
    device='cuda',
)

  from tqdm.autonotebook import tqdm, trange
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.86s/it]


In [5]:
# Build-or-reuse cache for the trial-space embeddings.
# Encoding every row of `trial_spaces.this_space` is the expensive step, so it only
# runs when the safetensors file (loaded later with `safe_open`) is not on disk yet.
# When the file already exists this cell does nothing besides defining the path.
TRIAL_EMBEDDINGS_PATH = "trial_space_embeddings.safetensors"

if not os.path.exists(TRIAL_EMBEDDINGS_PATH):
    from safetensors.torch import save_file

    with torch.no_grad():
        trial_space_embeddings = embedding_model.encode(
            trial_spaces.this_space.tolist(), convert_to_tensor=True
        )

    # Stored under the "space_embeddings" key, which is the key read back on load.
    # .contiguous() is a no-op for an already contiguous tensor; save_file rejects
    # non-contiguous ones.
    save_file(
        {"space_embeddings": trial_space_embeddings.contiguous()},
        TRIAL_EMBEDDINGS_PATH,
    )
    print(tuple(trial_space_embeddings.shape))

In [6]:
from safetensors import safe_open

# Read the precomputed trial-space embedding tensor from disk, placing it on device 0.
with safe_open("trial_space_embeddings.safetensors", framework="pt", device=0) as embeddings_file:
    trial_space_embeddings = embeddings_file.get_tensor("space_embeddings")

In [7]:
from transformers import pipeline, AutoTokenizer

# Tokenizer is taken from the "roberta-large" checkpoint; the classifier itself is
# loaded from the local ./roberta-checker directory.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Inputs are truncated and padded to a fixed length of 512 tokens.
pipe = pipeline(
    'text-classification',
    model='./roberta-checker',
    tokenizer=tokenizer,
    truncation=True,
    padding='max_length',
    max_length=512,
    device='cuda',
)


In [8]:
# Free-text summary of the example patient; it is embedded below and compared
# against every trial-space embedding.
patient_summary = "metastatic lung adenocarcinoma, PD-L1 75%, KRAS G12C mutant, prior pembrolizumab, prior carboplatin/pemetrexed"

In [9]:
# Embed the patient summary with the same model used for the trial spaces.
# A one-element list is passed, so the result presumably has a leading dimension
# of 1 (shape (1, dim)) -- TODO confirm.
patient_embedding = embedding_model.encode([patient_summary], convert_to_tensor=True)

In [10]:
import torch.nn.functional as F

# Cosine similarity between the patient embedding and each trial-space embedding,
# reduced along dim=1 (spelled out here; it is also the function's default).
similarities = F.cosine_similarity(
    patient_embedding,
    trial_space_embeddings,
    dim=1,
)

In [11]:
# Sanity check: expect one similarity score per row of `trial_spaces`.
similarities.shape

torch.Size([38140])

In [12]:
# Pull the top-ranked trial spaces for the patient.
TOP_K = 10  # number of highest-similarity spaces kept for the classifier check

# The full descending sort is kept so `sorted_similarities` / `sorted_indices`
# remain available to any later cell that uses them.
sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)

# Move the top-k indices to NumPy once and select the matching rows once, instead
# of repeating the device->host copy and the .iloc lookup for every column.
top_indices = sorted_indices[:TOP_K]
top_rows = trial_spaces.iloc[top_indices.cpu().numpy()]

# Per-column Series; they keep the original `trial_spaces` row labels as index.
relevant_spaces = top_rows.this_space
relevant_nctid = top_rows.nct_id
relevant_title = top_rows.title
relevant_brief_summary = top_rows.brief_summary
relevant_eligibility_criteria = top_rows.eligibility_criteria

# Embedding rows for the same top-k spaces, in the same rank order.
relevant_space_embeddings = trial_space_embeddings[top_indices, :]

In [13]:
# One row per retrieved trial space; the scalar patient summary is repeated on every row.
analysis = (
    pd.DataFrame({
        'patient_summary': patient_summary,
        'this_space': relevant_spaces,
        'nct_id': relevant_nctid,
        'trial_title': relevant_title,
        'trial_brief_summary': relevant_brief_summary,
        'trial_eligibility_criteria': relevant_eligibility_criteria,
    })
    .reset_index(drop=True)
    .assign(
        # Text handed to the classifier: the trial space followed by the patient summary.
        pt_trial_pair=lambda frame: (
            frame['this_space']
            + "\nNow here is the patient summary:"
            + frame['patient_summary']
        )
    )
)
analysis.head()

Unnamed: 0,patient_summary,this_space,nct_id,trial_title,trial_brief_summary,trial_eligibility_criteria,pt_trial_pair
0,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",5. Cancer type allowed: non-small cell lung ca...,NCT06253520,A Phase Ib Clinical Trial to Evaluate the Admi...,Background:\n\nMany cancer cells produce subst...,* INCLUSION CRITERIA:\n* Participants with an ...,5. Cancer type allowed: non-small cell lung ca...
1,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: non-small cell lung ca...,NCT05853575,A Randomized Study of Two Dosing Regimens of A...,This study will evaluate the efficacy of two d...,Key Inclusion Criteria:\n\n* Are at least 18 y...,1. Cancer type allowed: non-small cell lung ca...
2,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",3. Cancer type allowed: non-small cell lung ca...,NCT06128551,"Phase 1b, Multicenter, Open-Label, Dose Escala...","This study is to evaluate the safety, tolerabi...",Inclusion Criteria:\n\n* 18 years of age\n* Hi...,3. Cancer type allowed: non-small cell lung ca...
3,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: Non-small cell lung ca...,NCT05788926,A Phase I Dose-escalation Trial of TG6050 Admi...,"This is a phase I, open-label, dose-escalation...",Inclusion Criteria:\n\n1. Signed written infor...,1. Cancer type allowed: Non-small cell lung ca...
4,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: non-small cell lung ca...,NCT05375084,A Phase 1 Study of the SHP2 Inhibitor BBP-398 ...,"This is a Phase 1 study of BBP-398, a SHP2 inh...",Key Inclusion Criteria:\n\n* Patients must hav...,1. Cancer type allowed: non-small cell lung ca...


In [14]:
# Re-create the checker pipeline. This rebinds `pipe`, so the settings given here
# are the ones used for scoring. Truncation to 512 tokens (the same limit the earlier
# `pipe` definition used) is kept so an over-length trial-space / patient text is
# cut to the limit instead of being passed to the model at full length.
pipe = pipeline(
    'text-classification',
    model='./roberta-checker',
    truncation=True,
    max_length=512,
    device='cuda',
)

In [15]:
# Run the checker on every trial-space / patient text pair.
classifier_results = pipe(analysis.pt_trial_pair.tolist())

# Split each result dict into its 'label' and 'score' entries, keeping input order.
check_labels, check_scores = [], []
for result in classifier_results:
    check_labels.append(result['label'])
    check_scores.append(result['score'])

analysis['roberta_check_result'] = check_labels
analysis['roberta_check_score'] = check_scores


In [16]:
# Display the candidate table, now including the checker's label and score columns.
analysis

Unnamed: 0,patient_summary,this_space,nct_id,trial_title,trial_brief_summary,trial_eligibility_criteria,pt_trial_pair,roberta_check_result,roberta_check_score
0,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",5. Cancer type allowed: non-small cell lung ca...,NCT06253520,A Phase Ib Clinical Trial to Evaluate the Admi...,Background:\n\nMany cancer cells produce subst...,* INCLUSION CRITERIA:\n* Participants with an ...,5. Cancer type allowed: non-small cell lung ca...,NEGATIVE,0.834101
1,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: non-small cell lung ca...,NCT05853575,A Randomized Study of Two Dosing Regimens of A...,This study will evaluate the efficacy of two d...,Key Inclusion Criteria:\n\n* Are at least 18 y...,1. Cancer type allowed: non-small cell lung ca...,POSITIVE,0.910206
2,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",3. Cancer type allowed: non-small cell lung ca...,NCT06128551,"Phase 1b, Multicenter, Open-Label, Dose Escala...","This study is to evaluate the safety, tolerabi...",Inclusion Criteria:\n\n* 18 years of age\n* Hi...,3. Cancer type allowed: non-small cell lung ca...,POSITIVE,0.915395
3,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: Non-small cell lung ca...,NCT05788926,A Phase I Dose-escalation Trial of TG6050 Admi...,"This is a phase I, open-label, dose-escalation...",Inclusion Criteria:\n\n1. Signed written infor...,1. Cancer type allowed: Non-small cell lung ca...,POSITIVE,0.914168
4,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: non-small cell lung ca...,NCT05375084,A Phase 1 Study of the SHP2 Inhibitor BBP-398 ...,"This is a Phase 1 study of BBP-398, a SHP2 inh...",Key Inclusion Criteria:\n\n* Patients must hav...,1. Cancer type allowed: non-small cell lung ca...,POSITIVE,0.87793
5,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",2. Cancer type allowed: non-small cell lung ca...,NCT06128551,"Phase 1b, Multicenter, Open-Label, Dose Escala...","This study is to evaluate the safety, tolerabi...",Inclusion Criteria:\n\n* 18 years of age\n* Hi...,2. Cancer type allowed: non-small cell lung ca...,POSITIVE,0.926033
6,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",2. Cancer type allowed: Non-Small Cell Lung Ca...,NCT06447662,A Phase 1 Open-Label Study of PF-07934040 as a...,The purpose of this study is to learn about th...,Inclusion Criteria:\n\n* Histological or cytol...,2. Cancer type allowed: Non-Small Cell Lung Ca...,POSITIVE,0.506948
7,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: non-small cell lung ca...,NCT06127940,K-SAB Trial - Sotorasib Followed by SBRT to 1-...,The goal of this interventional study is to le...,Main inclusion criteria:\n\n1. Histological or...,1. Cancer type allowed: non-small cell lung ca...,POSITIVE,0.952771
8,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: non-small cell lung ca...,NCT06343402,A Phase 1a/1b Open-Label Study of BBO-8520 As ...,"A first in human study to evaluate the safety,...",Inclusion Criteria:\n\n* Histologically docume...,1. Cancer type allowed: non-small cell lung ca...,POSITIVE,0.949954
9,"metastatic lung adenocarcinoma, PD-L1 75%, KRA...",1. Cancer type allowed: non-small cell lung ca...,NCT05815173,Phase I/II Study of Ladarixin and Sotorasib in...,"This is a phase I/II, open-label, study of twi...",Inclusion Criteria:\n\n* Written informed cons...,1. Cancer type allowed: non-small cell lung ca...,POSITIVE,0.937962


In [17]:
# Full text of the highest-similarity trial space (row 0 of the ranked table).
analysis.this_space.iloc[0]

'5. Cancer type allowed: non-small cell lung cancer. Histology allowed: solid cancer. Cancer burden allowed: metastatic disease. Prior treatment required: at least one platinum-based chemotherapy regimen and at least one FDA-approved targeted treatment. Biomarkers required: KRAS G12V or G12D mutation. Biomarkers to be assessed during screening: HLA match.'

In [18]:
# Full text of the second-ranked trial space (row 1 of the ranked table).
analysis.this_space.iloc[1]

'1. Cancer type allowed: non-small cell lung cancer. Histology allowed: adenocarcinoma, squamous cell carcinoma, large cell carcinoma, and other subtypes of non-small cell lung cancer. Cancer burden allowed: advanced, metastatic. Prior treatment required: chemotherapy that included cisplatin or carboplatin, immune checkpoint inhibitor. Prior treatment excluded: KRAS G12C targeted therapy. Biomarkers required: KRAS G12C mutation.'

In [19]:
# Full text of the third-ranked trial space (row 2 of the ranked table).
analysis.this_space.iloc[2]

'3. Cancer type allowed: non-small cell lung cancer. Histology allowed: pathologically documented, KRAS G12C-mutated. Cancer burden allowed: advanced or metastatic. Prior treatment required: immunotherapy, chemotherapy. Biomarkers required: KRAS G12C mutation.'