In [None]:
from vllm import LLM, SamplingParams
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import re
import os
#os.environ["CUDA_VISIBLE_DEVICES"]="1"


In [None]:
import pandas as pd
# Load the synthetic patient histories from CSV (path is relative to the
# notebook's working directory). The `patient_long_text` column of this frame
# is what the summarization calls further down read.
synthetic_histories = pd.read_csv('synthetic_histories_11-22-24.csv')

In [None]:
# Sanity check of the loaded frame: column names, dtypes and non-null counts.
synthetic_histories.info()

In [None]:
# Instantiate the vLLM engine for the AWQ INT4 Llama 3.1 70B Instruct checkpoint.
llama = LLM(
    model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4',
    # Tensor-parallel degree passed to vLLM (expects that many visible GPUs).
    tensor_parallel_size=2,
    # Where model weights are downloaded/cached, relative to the working directory.
    download_dir="../../",
    gpu_memory_utilization=0.90,
    # Context window. summarize_patients truncates the user message to 115000
    # tokens and generates up to 4096, which has to fit under this limit
    # together with the system prompt and chat-template tokens.
    max_model_len=120000,
)

In [None]:
def summarize_patients(patient_texts, llama_model, max_input_tokens=115000, max_new_tokens=4096):
    """Generate a cancer-history summary for each patient text.

    For every text a two-message chat (fixed system prompt + user message
    containing the excerpt and a closing instruction) is built, rendered with
    the tokenizer's chat template and sent to ``llama_model.generate`` in a
    single batch.

    Args:
        patient_texts: Iterable of ``str``, one EHR excerpt per patient.
        llama_model: Object exposing ``get_tokenizer()`` and
            ``generate(prompts, sampling_params)`` (a vLLM ``LLM`` in this
            notebook).
        max_input_tokens: Maximum number of tokens kept from each user
            message. Longer messages keep only their *last*
            ``max_input_tokens`` tokens. Must be positive.
        max_new_tokens: Generation limit passed as ``max_tokens``.

    Returns:
        Tuple ``(responses, response_texts)``: the raw objects returned by
        ``llama_model.generate`` and, for each, ``outputs[0].text``. Both are
        in the same order as ``patient_texts``. Empty input returns
        ``([], [])`` without calling the model.

    Raises:
        ValueError: If ``max_input_tokens`` is not positive.
        TypeError: If any entry of ``patient_texts`` is not a ``str`` (for
            example a NaN coming from a pandas column).
    """
    # A non-positive limit would make the `[-limit:]` slice below keep everything.
    if max_input_tokens <= 0:
        raise ValueError("max_input_tokens must be a positive integer")

    patient_texts = list(patient_texts)
    for position, text in enumerate(patient_texts):
        if not isinstance(text, str):
            raise TypeError(
                f"patient_texts[{position}] must be a str, got {type(text).__name__}"
            )

    if not patient_texts:
        return [], []

    # The prompt text is loop-invariant, so it is built once. The indentation
    # and trailing whitespace inside the literal are part of the prompt.
    system_prompt = """You are an experienced clinical oncology history summarization bot.
        Your job is to construct a summary of the cancer history for a patient based on an excerpt of the patient's electronic health record. The text in the excerpt is provided in chronological order.     
        Document the cancer type/primary site (eg breast cancer, lung cancer, etc); histology (eg adenocarcinoma, squamous carcinoma, etc); current extent (localized, advanced, metastatic, etc); biomarkers (genomic results, protein expression, etc); and treatment history (surgery, radiation, chemotherapy/targeted therapy/immunotherapy, etc, including start and stop dates and best response if known).
        Do not consider localized basal cell or squamous carcinomas of the skin, or colon polyps, to be cancers for your purposes.
        Do not include the patient's name, but do include relevant dates whenever documented, including dates of diagnosis and start/stop dates of each treatment.
        If a patient has a history of more than one cancer, document the cancers one at a time.
        """
    closing_instruction = """Now, write your summary. Do not add preceding text before the abstraction, and do not add notes or commentary afterwards. This will not be used for clinical care, so do not write any disclaimers or cautionary notes."""

    # NOTE(review): the excerpt is concatenated directly to the closing
    # instruction with no separator; the wording is left untouched here.
    user_contents = [
        "The excerpt is:\n" + text + closing_instruction for text in patient_texts
    ]

    tokenizer = llama_model.get_tokenizer()

    # Truncate from the left: keep the tail of the message, i.e. the most
    # recent part of the chronological excerpt plus the closing instruction.
    # Only over-length messages are rebuilt from token ids; shorter ones are
    # sent unchanged because decode(encode(text)) is not guaranteed to
    # reproduce the original text exactly.
    token_ids = tokenizer(user_contents, add_special_tokens=False).input_ids
    overlong = [i for i, ids in enumerate(token_ids) if len(ids) > max_input_tokens]
    if overlong:
        tails = tokenizer.batch_decode(
            [token_ids[i][-max_input_tokens:] for i in overlong]
        )
        for i, tail in zip(overlong, tails):
            user_contents[i] = tail

    chat_prompts = [
        tokenizer.apply_chat_template(
            conversation=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': content},
            ],
            add_generation_prompt=True,
            tokenize=False,
        )
        for content in user_contents
    ]

    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.2,
        max_tokens=max_new_tokens,
        repetition_penalty=1.2,
        # Stop on either the tokenizer's EOS token or the "<|eot_id|>" token.
        stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")],
    )

    responses = llama_model.generate(chat_prompts, sampling_params)
    response_texts = [response.outputs[0].text for response in responses]

    return responses, response_texts
    

In [None]:
# Same schema/non-null overview as the earlier .info() cell, repeated here
# right before the summarization calls that read `patient_long_text`.
synthetic_histories.info()

In [None]:
# Example: generate a summary for a single synthetic patient (row position 10025).
# The one-row slice is converted to a list because summarize_patients iterates
# over its first argument. The result is the tuple (responses, response_texts).
patient_summaries = summarize_patients(synthetic_histories.patient_long_text.iloc[10025:10026].tolist(), llama)

In [None]:
# Element 1 of the returned tuple is the list of generated summary texts
# (element 0 holds the raw response objects).
patient_summaries[1]

In [None]:
# Full run: summarize every row of the frame in one batched call.
# This rebinds `patient_summaries`, replacing the single-patient example result.
patient_summaries = summarize_patients(synthetic_histories.patient_long_text.tolist(), llama)

In [None]:
# Build a new frame (the source frame is left untouched) that carries the
# generated summary texts in a `patient_summary` column, then persist it.
output = synthetic_histories.assign(patient_summary=patient_summaries[1])
output.to_parquet('synthetic_pt_summaries_11-22-24.parquet')

In [None]:
import pandas as pd

In [None]:
# Reload the saved summaries from disk and rebind `output` to them.
# NOTE(review): presumably this, together with the standalone pandas import in
# the previous cell, lets the inspection cells below run in a fresh kernel
# without loading the model — confirm.
output = pd.read_parquet('synthetic_pt_summaries_11-22-24.parquet')

In [None]:
# Check the reloaded frame: columns, dtypes and non-null counts.
output.info()

In [None]:
# Spot-check one randomly chosen summary. No random_state is passed, so a
# different row may be shown on each run.
output.patient_summary.sample(n=1).iloc[0]

In [None]:
# Count summaries that do / do not contain the substring "Lung".
# The match is case-sensitive (pandas default), so lowercase "lung" is not counted.
output['patient_summary'].str.contains("Lung").value_counts()