{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3d6ff53-2176-44aa-8590-ec0aa301342d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from vllm import LLM, SamplingParams\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch.nn.functional as F\n",
    "import torch\n",
    "from transformers import AutoTokenizer\n",
    "from transformers import AutoModelForCausalLM\n",
    "import re\n",
    "import os\n",
    "#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a070d00-9a45-4360-a38f-ceed8a9360e1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f407048-0eb3-439a-8257-3cb6881ac784",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "synthetic_histories = pd.read_csv('synthetic_histories_11-22-24.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bc40636-2325-4664-afc3-833b58fe7ba0",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "synthetic_histories.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9b4cae4-d46d-4a80-841c-8c8f08915b90",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca2b0678-119e-47a7-9a72-28685e97559d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Load the Llama 3.1 70B Instruct AWQ-INT4 checkpoint through vLLM.\n",
    "llama = LLM(\n",
    "    model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4',\n",
    "    download_dir=\"../../\",\n",
    "    tensor_parallel_size=2,\n",
    "    gpu_memory_utilization=0.90,\n",
    "    max_model_len=120000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f19be1ca-334c-4285-b8b7-0c9fbc83d0d4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02b9f891-4b50-4b64-9954-8481056cba79",
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarize_patients(patient_texts, llama_model, max_prompt_tokens=115000, max_new_tokens=4096):\n",
    "    \"\"\"Summarize the cancer history in each patient excerpt with a vLLM chat model.\n",
    "\n",
    "    One system + user chat prompt is built per patient, rendered with the\n",
    "    tokenizer's chat template, and all prompts are generated in a single batch.\n",
    "\n",
    "    Args:\n",
    "        patient_texts: iterable of str, one health-record excerpt per patient.\n",
    "        llama_model: model object providing `get_tokenizer()` and `generate()`\n",
    "            (the notebook passes a `vllm.LLM`).\n",
    "        max_prompt_tokens: cap on the token length of each user message. Longer\n",
    "            messages keep only their LAST `max_prompt_tokens` tokens, i.e. the end\n",
    "            of the excerpt plus the closing instruction.\n",
    "        max_new_tokens: maximum number of tokens generated per patient.\n",
    "\n",
    "    Returns:\n",
    "        Tuple `(responses, response_texts)`: the objects returned by\n",
    "        `llama_model.generate` and the text of the first completion of each.\n",
    "\n",
    "    Raises:\n",
    "        TypeError: if an element of `patient_texts` is not a string (for example a\n",
    "            NaN coming from a missing DataFrame value).\n",
    "    \"\"\"\n",
    "    patient_texts = list(patient_texts)\n",
    "    if not patient_texts:\n",
    "        # Nothing to summarize: skip tokenization and the model call entirely.\n",
    "        return [], []\n",
    "\n",
    "    for position, text in enumerate(patient_texts):\n",
    "        if not isinstance(text, str):\n",
    "            raise TypeError(\n",
    "                f\"patient_texts[{position}] must be a str, got {type(text).__name__}\"\n",
    "            )\n",
    "\n",
    "    tokenizer = llama_model.get_tokenizer()\n",
    "\n",
    "    system_prompt = \"\"\"You are an experienced clinical oncology history summarization bot.\n",
    "        Your job is to construct a summary of the cancer history for a patient based on an excerpt of the patient's electronic health record. The text in the excerpt is provided in chronological order.     \n",
    "        Document the cancer type/primary site (eg breast cancer, lung cancer, etc); histology (eg adenocarcinoma, squamous carcinoma, etc); current extent (localized, advanced, metastatic, etc); biomarkers (genomic results, protein expression, etc); and treatment history (surgery, radiation, chemotherapy/targeted therapy/immunotherapy, etc, including start and stop dates and best response if known).\n",
    "        Do not consider localized basal cell or squamous carcinomas of the skin, or colon polyps, to be cancers for your purposes.\n",
    "        Do not include the patient's name, but do include relevant dates whenever documented, including dates of diagnosis and start/stop dates of each treatment.\n",
    "        If a patient has a history of more than one cancer, document the cancers one at a time.\n",
    "        \"\"\"\n",
    "    closing_instruction = \"\"\"Now, write your summary. Do not add preceding text before the abstraction, and do not add notes or commentary afterwards. This will not be used for clinical care, so do not write any disclaimers or cautionary notes.\"\"\"\n",
    "\n",
    "    # A blank line separates the excerpt from the instruction so the last sentence\n",
    "    # of the record does not run straight into \"Now, write your summary.\"\n",
    "    user_messages = [\n",
    "        \"The excerpt is:\\n\" + text + \"\\n\\n\" + closing_instruction\n",
    "        for text in patient_texts\n",
    "    ]\n",
    "\n",
    "    token_ids = tokenizer(user_messages, add_special_tokens=False).input_ids\n",
    "\n",
    "    newprompts = []\n",
    "    for user_message, ids in zip(user_messages, token_ids):\n",
    "        overflow = len(ids) - max_prompt_tokens\n",
    "        if overflow > 0:\n",
    "            # Drop tokens from the start of the message; messages within the limit are\n",
    "            # used verbatim instead of going through an encode/decode round trip.\n",
    "            user_message = tokenizer.decode(ids[overflow:])\n",
    "        messages = [\n",
    "            {'role': 'system', 'content': system_prompt},\n",
    "            {'role': 'user', 'content': user_message},\n",
    "        ]\n",
    "        newprompts.append(\n",
    "            tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)\n",
    "        )\n",
    "\n",
    "    responses = llama_model.generate(\n",
    "        newprompts,\n",
    "        SamplingParams(\n",
    "            temperature=0.0,\n",
    "            top_p=0.2,\n",
    "            max_tokens=max_new_tokens,\n",
    "            repetition_penalty=1.2,\n",
    "            # Stop on the tokenizer's EOS token or on the <|eot_id|> token.\n",
    "            stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")],\n",
    "        ),\n",
    "    )\n",
    "\n",
    "    response_texts = [x.outputs[0].text for x in responses]\n",
    "    return responses, response_texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69bc8576-e6d7-452f-b6b0-15df7f4c8922",
   "metadata": {},
   "outputs": [],
   "source": [
    "synthetic_histories.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd443d34-c5db-414e-9892-eec368ef7ad6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# example summary generation for one synthetic patient\n",
    "patient_summaries = summarize_patients(synthetic_histories.patient_long_text.iloc[10025:10026].tolist(), llama)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b5f0b1a-6df4-4d32-9072-efb4136df070",
   "metadata": {},
   "outputs": [],
   "source": [
    "patient_summaries[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dabd98af-947e-40c0-aea8-7805bb5b1c3c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74b2a972-9271-4ed2-9c2c-5ec5793e8650",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarize every row of synthetic_histories in one batch; patient_summaries is the\n",
    "# (responses, response_texts) tuple returned by summarize_patients.\n",
    "patient_summaries = summarize_patients(synthetic_histories.patient_long_text.tolist(), llama)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6b772c9-c4dd-45c2-8a4a-9e5c17d25e2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add the summary texts (element 1 of the summarize_patients result) as a new column\n",
    "# on a copy of the input frame, then persist the result as parquet.\n",
    "output = synthetic_histories.assign(patient_summary=patient_summaries[1])\n",
    "output.to_parquet('synthetic_pt_summaries_11-22-24.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d30bf018-e135-40be-b636-0ba17acf8e61",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f9ed498-4927-46a1-a23e-bf9f3a0cc544",
   "metadata": {},
   "outputs": [],
   "source": [
    "output = pd.read_parquet('synthetic_pt_summaries_11-22-24.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6e1e9ce-e984-458b-881c-a99e3336e6c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "output.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5baf640d-1a6d-447e-84c2-d09a2a94a65a",
   "metadata": {},
   "outputs": [],
   "source": [
    "output.patient_summary.sample(n=1).iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "633ab065-8620-4519-af61-d9e76849cbdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count summaries that contain \"Lung\". NOTE(review): str.contains is case-sensitive by\n",
    "# default, so a summary that only says \"lung\" in lowercase lands in the False bucket.\n",
    "output['patient_summary'].str.contains(\"Lung\").value_counts()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}