Add get_sequences

- protention/attention.py +18 -1
- tests/test_attention.py +12 -1
protention/attention.py

@@ -2,7 +2,7 @@ from io import StringIO
 from urllib import request
 
 import torch
-from Bio.PDB import PDBParser, Structure
+from Bio.PDB import PDBParser, Polypeptide, Structure
 from transformers import T5EncoderModel, T5Tokenizer
 
 
@@ -17,6 +17,20 @@ def get_structure(pdb_code: str) -> Structure:
     structure = parser.get_structure(pdb_code, file)
     return structure
 
+def get_sequences(structure: Structure) -> list[list[str]]:
+    """
+    Get a list of sequences with residues in single-letter format.
+
+    Residues not in the standard 20 amino acids are replaced with X.
+    """
+    sequences = []
+    for chain in structure.get_chains():
+        residues = [residue.get_resname() for residue in chain.get_residues()]
+        # TODO ask if using protein_letters_3to1_extended makes sense
+        residues_single_letter = map(lambda x: Polypeptide.protein_letters_3to1.get(x, "X"), residues)
+
+        sequences.append(list(residues_single_letter))
+    return sequences
 
 def get_protT5() -> tuple[T5Tokenizer, T5EncoderModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -41,11 +55,14 @@ def get_attention(
     """
     # fetch structure
     structure = get_structure(pdb_code)
+    # get list of sequences
+    sequences = get_sequences(structure)
 
     # get model
     tokenizer, model = get_protT5()
 
     # call model
+    ## Get sequence
 
     # get attention
 
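Not part of this commit, but for context on the remaining "# call model" / "# get attention" placeholders: ProtT5-style encoders are normally fed each sequence as space-separated single-letter residues, which is exactly the list-of-letters representation that get_sequences returns. A minimal sketch of how those steps could look, assuming get_protT5() returns a standard Hugging Face (tokenizer, model) pair as in the existing code; the call pattern below is an assumption, not the author's implementation:

import torch

from protention.attention import get_protT5, get_sequences, get_structure

structure = get_structure("1AKE")  # example PDB code taken from the tests
sequences = get_sequences(structure)

# ProtT5 expects residues separated by spaces, e.g. ["M", "R", "I"] -> "M R I"
spaced = [" ".join(chain) for chain in sequences]

tokenizer, model = get_protT5()
encoded = tokenizer(spaced, add_special_tokens=True, padding="longest", return_tensors="pt")

with torch.no_grad():
    output = model(
        input_ids=encoded["input_ids"].to(model.device),
        attention_mask=encoded["attention_mask"].to(model.device),
        output_attentions=True,
    )

# One (batch, heads, seq_len, seq_len) tensor per encoder layer
attentions = output.attentions

If the TODO about protein_letters_3to1_extended is taken up, only the lookup table changes: the extended table also maps modified residues such as MSE (selenomethionine) to a standard letter instead of falling back to "X" (assuming the installed Biopython version exposes it alongside protein_letters_3to1).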
tests/test_attention.py

@@ -1,7 +1,7 @@
 from Bio.PDB.Structure import Structure
 from transformers import T5EncoderModel, T5Tokenizer
 
-from protention.attention import get_protT5, get_structure
+from protention.attention import get_protT5, get_sequences, get_structure
 
 
 def test_get_structure():
@@ -11,6 +11,17 @@ def test_get_structure():
     assert structure is not None
     assert isinstance(structure, Structure)
 
+def test_get_sequences():
+    pdb_id = "1AKE"
+    structure = get_structure(pdb_id)
+
+    sequences = get_sequences(structure)
+
+    assert sequences is not None
+    assert len(sequences) == 2
+
+    A, B = sequences
+    assert A[:3] == ["M", "R", "I"]
 
 def test_get_protT5():
     result = get_protT5()