
Commit 5b6d16d by aksell (1 parent: 4ad80db)

WIP: Start adding ProtT5

Files changed (2):
  1. hexviz/app.py +1 -0
  2. hexviz/attention.py +14 -4
hexviz/app.py CHANGED
@@ -10,6 +10,7 @@ st.title("pLM Attention Visualization")
 # Define list of model types
 models = [
     Model(name=ModelType.TAPE_BERT, layers=12, heads=12),
+    # Model(name=ModelType.PROT_T5, layers=24, heads=32),
 ]
 
 selected_model_name = st.selectbox("Select a model", [model.name.value for model in models], index=0)
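For context, a minimal sketch of the Model and ModelType definitions implied by the usage above. This is a hypothetical reconstruction, not the actual hexviz code, and the real enum values may differ; the 24 layers and 32 heads in the commented-out entry match the ProtT5-XL (T5-3B) encoder configuration.

# Hypothetical reconstruction of the types used in hexviz/app.py above;
# the real definitions live elsewhere in the hexviz package and may differ.
from dataclasses import dataclass
from enum import Enum


class ModelType(str, Enum):
    TAPE_BERT = "TAPE-BERT"  # assumed display value
    PROT_T5 = "ProtT5"       # assumed display value


@dataclass
class Model:
    name: ModelType
    layers: int
    heads: int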
hexviz/attention.py CHANGED
@@ -48,6 +48,7 @@ def get_sequences(structure: Structure) -> List[str]:
         sequences.append(list(residues_single_letter))
     return sequences
 
+@st.cache
 def get_protT5() -> Tuple[T5Tokenizer, T5EncoderModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     tokenizer = T5Tokenizer.from_pretrained(
@@ -69,7 +70,7 @@ def get_tape_bert() -> Tuple[TAPETokenizer, ProteinBertModel]:
 
 @st.cache
 def get_attention(
-    sequence: List[str], model_type: ModelType = ModelType.TAPE_BERT
+    sequence: str, model_type: ModelType = ModelType.TAPE_BERT
 ):
     if model_type == ModelType.TAPE_BERT:
         tokenizer, model = get_tape_bert()
@@ -81,9 +82,18 @@ def get_attention(
         attns = [attn[:, :, 1:-1, 1:-1] for attn in attns]
         attns = torch.stack([attn.squeeze(0) for attn in attns])
     elif model_type == ModelType.PROT_T5:
-        attns = None
-        # Space separate sequences
-        sequences = [" ".join(sequence) for sequence in sequences]
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        # Introduce white-space between all amino acids
+        sequence = " ".join(sequence)
+        # tokenize sequences and pad up to the longest sequence in the batch
+        ids = tokenizer.encode_plus(sequence, add_special_tokens=True, padding="longest")
+
+        input_ids = torch.tensor(ids['input_ids']).to(device)
+        attention_mask = torch.tensor(ids['attention_mask']).to(device)
+
+        with torch.no_grad():
+            attns = model(input_ids=input_ids, attention_mask=attention_mask)[-1]
+
         tokenizer, model = get_protT5()
     else:
         raise ValueError(f"Model {model_type} not supported")
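For reference, a minimal standalone sketch of the ProtT5 attention-extraction flow this WIP is heading toward, with the model loaded before it is used (in the committed diff the get_protT5() call still comes after the forward pass). The checkpoint name (Rostlab/prot_t5_xl_uniref50), the helper name get_prot_t5_attention, and the explicit output_attentions=True flag are assumptions; the from_pretrained call inside get_protT5 is truncated in this diff and may differ.

# Sketch only: names and checkpoint are assumptions, not the hexviz implementation.
from typing import Tuple

import torch
from transformers import T5EncoderModel, T5Tokenizer

CHECKPOINT = "Rostlab/prot_t5_xl_uniref50"  # assumed checkpoint


def get_protT5() -> Tuple[T5Tokenizer, T5EncoderModel]:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer = T5Tokenizer.from_pretrained(CHECKPOINT, do_lower_case=False)
    model = T5EncoderModel.from_pretrained(CHECKPOINT).to(device)
    model.eval()
    return tokenizer, model


def get_prot_t5_attention(sequence: str) -> torch.Tensor:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Load tokenizer and model before the forward pass
    tokenizer, model = get_protT5()

    # ProtT5 expects whitespace-separated amino acids, e.g. "M K T A ..."
    spaced = " ".join(sequence)
    ids = tokenizer.encode_plus(spaced, add_special_tokens=True, padding="longest")

    # Add a batch dimension before moving to the model's device
    input_ids = torch.tensor(ids["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.tensor(ids["attention_mask"]).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=True,
        )
    # outputs.attentions is a tuple of (batch, heads, seq, seq), one per layer
    attns = torch.stack([attn.squeeze(0) for attn in outputs.attentions])
    return attns

One difference from the TAPE branch worth noting: the ProtT5 tokenizer only appends a single end-of-sequence token rather than wrapping the sequence in start and end tokens, so the special-token trimming would not be the [:, :, 1:-1, 1:-1] slice used for the BERT-style model.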