Commit fd0708d
Parent(s): ca6376e
Upload refactored_data_preprocessing_notebook (1).ipynb
refactored_data_preprocessing_notebook (1).ipynb ADDED
@@ -0,0 +1,433 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c743a143-9d3e-4199-a984-5ad51014c168",
   "metadata": {},
   "source": [
    "# UniProt Data Preprocessing\n",
    "\n",
    "This notebook preprocesses a UniProt TSV file with the columns Protein families, Binding site, Active site, and Sequence. Rows with a missing family annotation are filtered out. Missing binding sites are not handled, so make sure every sequence has a binding-site annotation. A missing Active site annotation is fine: the sequence is kept as-is. Rows with a missing Sequence are likewise not handled."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b595fc6e-ef53-47fa-a517-ea3bd1066a1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the dataset\n",
    "file_path = 'uniprotkb_family_AND_ft_binding_AND_pro_2023_09_19.tsv'\n",
    "data = pd.read_csv(file_path, sep='\\t')\n",
    "\n",
    "# Display the first few rows of the dataframe\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "45666b3a-45e7-41e5-834b-bf7a0ca8b3de",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7bff9929-9423-4d58-9c5f-6c1758e50da7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the dataset\n",
    "file_path = 'uniprotkb_family_AND_ft_binding_AND_pro_2023_09_19.tsv'\n",
    "data = pd.read_csv(file_path, sep='\\t')\n",
    "\n",
    "# Filter out rows with NaN values in the 'Protein families' column\n",
    "data = data[pd.notna(data['Protein families'])]\n",
    "\n",
    "# Display the first few rows of the modified dataframe\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f87e25da-12b2-4002-959b-f3be7c5b4928",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "062d44bd-2aa4-40e2-9662-9d7cfacabc80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group the data by 'Protein families' and get the size of each group\n",
    "family_sizes = data.groupby('Protein families').size()\n",
    "\n",
    "# Create a new column with the size of each family\n",
    "data['Family size'] = data['Protein families'].map(family_sizes)\n",
    "\n",
    "# Sort the data by 'Family size' in descending order and then by 'Protein families'\n",
    "data_sorted = data.sort_values(by=['Family size', 'Protein families'], ascending=[False, True])\n",
    "\n",
    "# Drop the 'Family size' column as it is no longer needed\n",
    "data_sorted.drop(columns='Family size', inplace=True)\n",
    "\n",
    "# Define a function to extract the location from the binding and active site columns\n",
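    "# Assumed input format: raw UniProt feature strings such as\n",
    "# 'BINDING 14..18; /ligand=...; BINDING 72; ...'; the parser below keeps only the\n",
    "# position token that follows each BINDING/ACT_SITE keyword.\n",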
    "def extract_location(site_info):\n",
    "    if pd.isnull(site_info):\n",
    "        return None\n",
    "    locations = []\n",
    "    for info in site_info.split(';'):\n",
    "        if 'BINDING' in info or 'ACT_SITE' in info:\n",
    "            locations.append(info.split()[1])\n",
    "    return '; '.join(locations)\n",
    "\n",
    "# Apply the function to the 'Binding site' and 'Active site' columns to extract the locations\n",
    "data_sorted['Binding site'] = data_sorted['Binding site'].apply(extract_location)\n",
    "data_sorted['Active site'] = data_sorted['Active site'].apply(extract_location)\n",
    "\n",
    "# Display the first few rows of the modified dataframe\n",
    "data_sorted.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "70e04892-19a8-4e55-8b0d-bd5d9108b8d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a new column that combines the 'Binding site' and 'Active site' columns\n",
    "data_sorted['Binding-Active site'] = data_sorted['Binding site'].astype(str) + '; ' + data_sorted['Active site'].astype(str)\n",
    "\n",
    "# Replace the fully missing case with None (dict form, because value=None with a\n",
    "# scalar to_replace makes pandas forward-fill instead of inserting None)\n",
    "data_sorted['Binding-Active site'] = data_sorted['Binding-Active site'].replace({'nan; nan': None})\n",
    "\n",
    "# Display the first few rows of the updated dataframe\n",
    "data_sorted.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c7022fff-b445-47df-afc8-f5a2e3659be7",
   "metadata": {},
   "outputs": [],
   "source": [
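    "# Note (assumption): in UniProt locations, '<' and '>' mark uncertain endpoints\n",
    "# (e.g. '<1'), so rows containing them are dropped rather than parsed.\n",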
    "# Find entries in the \"Binding-Active site\" column containing '<' or '>'\n",
    "entries_with_angle_brackets = data_sorted['Binding-Active site'].str.contains('<|>', na=False)\n",
    "\n",
    "# Get the number of such entries\n",
    "num_entries_with_angle_brackets = entries_with_angle_brackets.sum()\n",
    "\n",
    "# Display the number of entries containing '<' or '>'\n",
    "print(f\"Number of entries with angle brackets: {num_entries_with_angle_brackets}\")\n",
    "\n",
    "# Remove all rows where the \"Binding-Active site\" column contains '<' or '>'\n",
    "data_filtered = data_sorted[~entries_with_angle_brackets]\n",
    "\n",
    "# Get the number of remaining rows\n",
    "num_remaining_rows = data_filtered.shape[0]\n",
    "\n",
    "# Display the number of remaining rows\n",
    "print(f\"Number of remaining rows: {num_remaining_rows}\")\n",
    "\n",
    "# Get the number of distinct protein families\n",
    "num_distinct_families = data_filtered['Protein families'].nunique()\n",
    "\n",
    "# Display the number of distinct protein families\n",
    "print(f\"Number of distinct protein families: {num_distinct_families}\")\n",
    "\n",
    "# Define the target number of rows for the test set (approximately 20% of the data)\n",
    "target_test_rows = int(0.20 * num_remaining_rows)\n",
    "\n",
    "# Get unique protein families\n",
    "unique_families = data_filtered['Protein families'].unique()\n",
    "\n",
    "# Shuffle the unique families to randomize the selection\n",
    "np.random.shuffle(unique_families)\n",
    "\n",
    "# Initialize variables to keep track of the selected rows for the test and train sets\n",
    "test_rows = []\n",
    "current_test_rows = 0\n",
    "\n",
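    "# Splitting by whole families (rather than by row) keeps all members of a family in\n",
    "# the same split, so near-duplicate sequences cannot leak between train and test.\n",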
    "# Loop through the shuffled families and add rows to the test set until we reach the target number of rows\n",
    "for family in unique_families:\n",
    "    family_rows = data_filtered[data_filtered['Protein families'] == family].index.tolist()\n",
    "    if current_test_rows + len(family_rows) < target_test_rows:\n",
    "        test_rows.extend(family_rows)\n",
    "        current_test_rows += len(family_rows)\n",
    "    else:\n",
    "        # If adding the current family exceeds the target, we add it anyway and break the loop\n",
    "        test_rows.extend(family_rows)\n",
    "        break\n",
    "\n",
    "# Get the indices of the rows for the train set (all rows not in the test set)\n",
    "test_rows_set = set(test_rows)  # set membership keeps this linear instead of quadratic\n",
    "train_rows = [i for i in data_filtered.index if i not in test_rows_set]\n",
    "\n",
    "# Create the test and train datasets\n",
    "test_df = data_filtered.loc[test_rows]\n",
    "train_df = data_filtered.loc[train_rows]\n",
    "\n",
    "test_df.shape[0], train_df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6a5e747a-5af3-4eec-ba83-7d520d753e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the first few rows of each dataset to understand their structure\n",
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "796c592c-4e4f-4403-b43d-843b9972170f",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ae5884c1-376c-4f53-b638-f63d7e4ea5ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find rows whose \"Binding-Active site\" column contains a literal '?' (UniProt uses\n",
    "# '?' for unknown positions, which cannot be mapped onto the sequence)\n",
    "test_rows_with_question_mark = test_df[test_df['Binding-Active site'].str.contains(r'\\?', na=False, regex=True)]\n",
    "train_rows_with_question_mark = train_df[train_df['Binding-Active site'].str.contains(r'\\?', na=False, regex=True)]\n",
    "\n",
    "# Get the number of such rows in both datasets\n",
    "num_test_rows_with_question_mark = len(test_rows_with_question_mark)\n",
    "num_train_rows_with_question_mark = len(train_rows_with_question_mark)\n",
    "\n",
    "print(f\"Number of test rows with question mark: {num_test_rows_with_question_mark}\")\n",
    "print(f\"Number of train rows with question mark: {num_train_rows_with_question_mark}\")\n",
    "\n",
    "# Delete the rows containing '?' in the \"Binding-Active site\" column\n",
    "test_df = test_df.drop(test_rows_with_question_mark.index)\n",
    "train_df = train_df.drop(train_rows_with_question_mark.index)\n",
    "\n",
    "# Check the number of remaining rows in both datasets\n",
    "remaining_test_rows = test_df.shape[0]\n",
    "remaining_train_rows = train_df.shape[0]\n",
    "\n",
    "print(f\"Number of remaining test rows: {remaining_test_rows}\")\n",
    "print(f\"Number of remaining train rows: {remaining_train_rows}\")\n",
    "\n",
    "import re\n",
    "\n",
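    "# Example: expand_ranges('14..18; 72') -> '14, 15, 16, 17, 18; 72', turning UniProt\n",
    "# span notation into an explicit, comma-separated position list.\n",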
    "def expand_ranges(s):\n",
    "    \"\"\"Expand ranges in a string.\"\"\"\n",
    "    return re.sub(r'(\\d+)\\.\\.(\\d+)', lambda m: ', '.join(map(str, range(int(m.group(1)), int(m.group(2))+1))), str(s))\n",
    "\n",
    "# Apply the function to expand ranges in the \"Binding-Active site\" column in both datasets\n",
    "test_df['Binding-Active site'] = test_df['Binding-Active site'].apply(expand_ranges)\n",
    "train_df['Binding-Active site'] = train_df['Binding-Active site'].apply(expand_ranges)\n",
    "\n",
    "# Display the first few rows of each dataset to verify the changes\n",
    "# print(test_df.head())\n",
    "# print(train_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "2d76022f-b8d6-4a9c-81a4-7eae34af4732",
   "metadata": {},
   "outputs": [],
   "source": [
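    "# Example: convert_to_binary_list('2, 4', 5) -> [0, 1, 0, 1, 0] (1-based site\n",
    "# positions become a 0-based per-residue mask over the sequence).\n",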
    "def convert_to_binary_list(binding_active_str, sequence_len):\n",
    "    \"\"\"Convert a Binding-Active site string to a binary list based on the sequence length.\"\"\"\n",
    "    # Create a list of 0s with length equal to the sequence length\n",
    "    binary_list = [0] * sequence_len\n",
    "\n",
    "    # Retrieve the annotated indices and set the corresponding positions to 1\n",
    "    if pd.notna(binding_active_str):\n",
    "        # Get the indices from the binding-active site string\n",
    "        indices = [int(x) - 1 for segment in binding_active_str.split(';') for x in segment.split(',') if x.strip().isdigit()]\n",
    "        for idx in indices:\n",
    "            # Ensure the index is within the valid range\n",
    "            if 0 <= idx < sequence_len:\n",
    "                binary_list[idx] = 1\n",
    "\n",
    "    return binary_list\n",
    "\n",
    "# Apply the function to both datasets\n",
    "test_df['Binding-Active site'] = test_df.apply(lambda row: convert_to_binary_list(row['Binding-Active site'], len(row['Sequence'])), axis=1)\n",
    "train_df['Binding-Active site'] = train_df.apply(lambda row: convert_to_binary_list(row['Binding-Active site'], len(row['Sequence'])), axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "0c01fca9-a558-4919-bd97-73b41acd4fc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f686b656-313e-4ac3-a4ac-59fa87c6cbc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b4d27236-49ef-4244-81d2-4c5f120a97d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import random\n",
    "\n",
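    "# The 1000-residue chunk size is presumably chosen to keep inputs within the\n",
    "# context limit of the downstream model (the kernel name suggests ESM-2).\n",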
    "def split_into_chunks(sequences, labels):\n",
    "    \"\"\"Split sequences and labels into chunks of size 1000 or less.\"\"\"\n",
    "    chunk_size = 1000\n",
    "    new_sequences = []\n",
    "    new_labels = []\n",
    "\n",
    "    for seq, lbl in zip(sequences, labels):\n",
    "        if len(seq) > chunk_size:\n",
    "            # Split the sequence and labels into chunks of size 1000 or less\n",
    "            for i in range(0, len(seq), chunk_size):\n",
    "                new_sequences.append(seq[i:i+chunk_size])\n",
    "                new_labels.append(lbl[i:i+chunk_size])\n",
    "        else:\n",
    "            new_sequences.append(seq)\n",
    "            new_labels.append(lbl)\n",
    "\n",
    "    return new_sequences, new_labels\n",
    "\n",
    "# Extract the necessary columns to create lists of sequences and labels\n",
    "test_sequences_by_family = test_df['Sequence'].tolist()\n",
    "test_labels_by_family = test_df['Binding-Active site'].tolist()\n",
    "train_sequences_by_family = train_df['Sequence'].tolist()\n",
    "train_labels_by_family = train_df['Binding-Active site'].tolist()\n",
    "\n",
    "# Get the number of samples in each dataset\n",
    "num_test_samples = len(test_sequences_by_family)\n",
    "num_train_samples = len(train_sequences_by_family)\n",
    "\n",
    "# Generate random indices for a subsample of each dataset (1/26.66, roughly 3.75%);\n",
    "# random.sample requires an integer count, so divide and cast rather than using '//'\n",
    "random_test_indices = random.sample(range(num_test_samples), int(num_test_samples / 26.66))\n",
    "random_train_indices = random.sample(range(num_train_samples), int(num_train_samples / 26.66))\n",
    "\n",
    "# Create smaller datasets using the random indices\n",
    "test_sequences_small = [test_sequences_by_family[i] for i in random_test_indices]\n",
    "test_labels_small = [test_labels_by_family[i] for i in random_test_indices]\n",
    "train_sequences_small = [train_sequences_by_family[i] for i in random_train_indices]\n",
    "train_labels_small = [train_labels_by_family[i] for i in random_train_indices]\n",
    "\n",
    "# Apply the function to create new datasets with chunks of size 1000 or less\n",
    "test_sequences_chunked, test_labels_chunked = split_into_chunks(test_sequences_small, test_labels_small)\n",
    "train_sequences_chunked, train_labels_chunked = split_into_chunks(train_sequences_small, train_labels_small)\n",
    "\n",
    "# Paths to save the new chunked pickle files\n",
    "test_labels_chunked_path = '600K_data/test_labels_chunked_by_family.pkl'\n",
    "test_sequences_chunked_path = '600K_data/test_sequences_chunked_by_family.pkl'\n",
    "train_labels_chunked_path = '600K_data/train_labels_chunked_by_family.pkl'\n",
    "train_sequences_chunked_path = '600K_data/train_sequences_chunked_by_family.pkl'\n",
    "\n",
    "# Save the chunked datasets as new pickle files\n",
    "with open(test_labels_chunked_path, 'wb') as file:\n",
    "    pickle.dump(test_labels_chunked, file)\n",
    "with open(test_sequences_chunked_path, 'wb') as file:\n",
    "    pickle.dump(test_sequences_chunked, file)\n",
    "with open(train_labels_chunked_path, 'wb') as file:\n",
    "    pickle.dump(train_labels_chunked, file)\n",
    "with open(train_sequences_chunked_path, 'wb') as file:\n",
    "    pickle.dump(train_sequences_chunked, file)\n",
    "\n",
    "test_labels_chunked_path, test_sequences_chunked_path, train_labels_chunked_path, train_sequences_chunked_path\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "bea3056d-c72c-420f-9036-c9f5069312d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load each pickle file and get the number of entries in each\n",
    "with open(test_labels_chunked_path, 'rb') as file:\n",
    "    test_labels_chunked = pickle.load(file)\n",
    "    num_test_labels_chunked = len(test_labels_chunked)\n",
    "\n",
    "with open(test_sequences_chunked_path, 'rb') as file:\n",
    "    test_sequences_chunked = pickle.load(file)\n",
    "    num_test_sequences_chunked = len(test_sequences_chunked)\n",
    "\n",
    "with open(train_labels_chunked_path, 'rb') as file:\n",
    "    train_labels_chunked = pickle.load(file)\n",
    "    num_train_labels_chunked = len(train_labels_chunked)\n",
    "\n",
    "with open(train_sequences_chunked_path, 'rb') as file:\n",
    "    train_sequences_chunked = pickle.load(file)\n",
    "    num_train_sequences_chunked = len(train_sequences_chunked)\n",
    "\n",
    "num_test_labels_chunked, num_test_sequences_chunked, num_train_labels_chunked, num_train_sequences_chunked\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb10699a-0441-48be-bafd-c3a1a4d113af",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "esm2_binding_py38b",
   "language": "python",
   "name": "esm2_binding_py38b"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}