
Commit ebbe380 (parent: 466a8f2), committed by aksell

Add ModelType enum and Model class to hold layers and head count
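In short, ModelType names a pretrained backend and Model pairs it with the layer and head counts the UI needs. A minimal sketch, using only names that appear in the diffs below:

from protention.attention import Model, ModelType

# One entry per supported backend; the layer/head counts drive the UI bounds below
tape_bert = Model(name=ModelType.TAPE_BERT, layers=12, heads=12)
print(tape_bert.name.value)               # "bert-base"
print(tape_bert.layers, tape_bert.heads)  # 12 12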

protention/attention.py CHANGED
@@ -2,14 +2,24 @@ from enum import Enum
 from io import StringIO
 from urllib import request
 
+import streamlit as st
 import torch
 from Bio.PDB import PDBParser, Polypeptide, Structure
 from tape import ProteinBertModel, TAPETokenizer
 from transformers import T5EncoderModel, T5Tokenizer
 
 
-class Model(str, Enum):
-    tape_bert = "bert-base"
+class ModelType(str, Enum):
+    TAPE_BERT = "bert-base"
+    PROT_T5 = "prot_t5_xl_half_uniref50-enc"
+
+
+class Model:
+    def __init__(self, name, layers, heads):
+        self.name: ModelType = name
+        self.layers: int = layers
+        self.heads: int = heads
+
 
 def get_structure(pdb_code: str) -> Structure:
     """
@@ -56,9 +66,9 @@ def get_tape_bert() -> tuple[TAPETokenizer, ProteinBertModel]:
     model = ProteinBertModel.from_pretrained('bert-base', output_attentions=True)
     return tokenizer, model
 
-
+@st.cache
 def get_attention(
-    pdb_code: str, model: Model = Model.tape_bert
+    pdb_code: str, model: ModelType = ModelType.TAPE_BERT
 ):
     """
     Get attention from T5
@@ -70,8 +80,8 @@ def get_attention(
     # TODO handle multiple sequences
     sequence = sequences[0]
 
-    match model:
-        case model.tape_bert:
+    match model.name:
+        case ModelType.TAPE_BERT:
             tokenizer, model = get_tape_bert()
             token_idxs = tokenizer.encode(sequence).tolist()
             inputs = torch.tensor(token_idxs).unsqueeze(0)
@@ -80,9 +90,10 @@ def get_attention(
             # Remove attention from <CLS> (first) and <SEP> (last) token
             attns = [attn[:, :, 1:-1, 1:-1] for attn in attns]
             attns = torch.stack([attn.squeeze(0) for attn in attns])
-        case model.prot_T5:
+        case ModelType.PROT_T5:
             # Space separate sequences
            sequences = [" ".join(sequence) for sequence in sequences]
             tokenizer, model = get_protT5()
 
-    return attns
+    return attns
+
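A hedged usage sketch of the reworked get_attention: the call mirrors the test and the Streamlit app, and the [layers, heads, seq, seq] layout is inferred from the shape asserted in tests/test_attention.py.

from protention.attention import ModelType, get_attention

# Attention for every layer and head of TAPE BERT on PDB entry 1AKE;
# @st.cache memoizes repeated identical calls within a Streamlit session
attns = get_attention("1AKE", model=ModelType.TAPE_BERT)
print(attns.shape)  # torch.Size([12, 12, 456, 456]) per tests/test_attention.py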
protention/streamlit/Attention_On_Structure.py CHANGED
@@ -3,21 +3,31 @@ import stmol
 import streamlit as st
 from stmol import showmol
 
+from protention.attention import Model, ModelType, get_attention
+
 st.sidebar.title("pLM Attention Visualization")
 
 st.title("pLM Attention Visualization")
 
+# Define list of model types
+models = [
+    Model(name=ModelType.TAPE_BERT, layers=12, heads=12),
+]
+
+selected_model_name = st.selectbox("Select a model", [model.name.value for model in models], index=0)
+selected_model = next((model for model in models if model.name.value == selected_model_name), None)
+
 pdb_id = st.text_input("PDB ID", "4RW0")
-chain_id = None
 
 left, right = st.columns(2)
 with left:
-    layer = st.number_input("Layer", value=8)
+    layer = st.number_input("Layer", value=1, min_value=1, max_value=selected_model.layers)
 with right:
-    head = st.number_input("Head", value=5)
+    head = st.number_input("Head", value=1, min_value=1, max_value=selected_model.heads)
 
 min_attn = st.slider("Minimum attention", min_value=0.0, max_value=0.4, value=0.15)
 
+attention = get_attention(pdb_id, model=selected_model.name)
 
 def get_3dview(pdb):
     xyzview = py3Dmol.view(query=f"pdb:{pdb}")
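The commit stops at computing `attention`; how it gets drawn onto the structure is outside this diff. A hypothetical helper, not part of the commit, assuming the tensor is laid out [layer, head, res_i, res_j] as the test shape suggests and that the UI's 1-based layer/head values are shifted to 0-based indices:

def high_attention_pairs(attention, layer, head, min_attn):
    # layer/head come from st.number_input and are 1-based; the tensor is 0-indexed
    attn = attention[layer - 1, head - 1]
    idx = (attn >= min_attn).nonzero()
    # one (residue_i, residue_j, weight) triple per pair above the threshold
    return [(int(i), int(j), float(attn[i, j])) for i, j in idx]

pairs = high_attention_pairs(attention, layer, head, min_attn)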
tests/test_attention.py CHANGED
@@ -2,7 +2,7 @@ import torch
 from Bio.PDB.Structure import Structure
 from transformers import T5EncoderModel, T5Tokenizer
 
-from protention.attention import (Model, get_attention, get_protT5,
+from protention.attention import (ModelType, get_attention, get_protT5,
                                   get_sequences, get_structure)
 
 
@@ -38,7 +38,7 @@ def test_get_protT5():
 
 def test_get_attention_tape():
 
-    result = get_attention("1AKE", model=Model.tape_bert)
+    result = get_attention("1AKE", model=ModelType.tape_bert)
 
     assert result is not None
     assert result.shape == torch.Size([12,12,456,456])