xuyingli committed on
Commit
481cfc6
1 Parent(s): 0617c9a

Add application file

Files changed (1) hide show
  1. app.py +481 -0
app.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import esm
4
+ import matplotlib.pyplot as plt
5
+ from myscaledb import Client
6
+ import random
7
+ from collections import Counter
8
+ from tqdm import tqdm
9
+ from statistics import mean
10
+
11
+ import torch
12
+ import matplotlib.pyplot as plt
13
+ import numpy as np
14
+ import pandas as pd
15
+ import seaborn as sns
16
+ from stmol import *
17
+ import py3Dmol
18
+ # from streamlit_3Dmol import component_3dmol
19
+
20
+ import esm
21
+
22
+ import scipy
23
+ from sklearn.model_selection import GridSearchCV, train_test_split
24
+ from sklearn.decomposition import PCA
25
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
26
+ from sklearn.svm import SVC, SVR
27
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
28
+ from sklearn.naive_bayes import GaussianNB
29
+ from sklearn.linear_model import LogisticRegression, SGDRegressor
30
+ from sklearn.pipeline import Pipeline
31
+
32
+ from streamlit.components.v1 import html
33
+
34
+
35
def init_esm():
    """Load the pretrained ESM MSA Transformer together with its alphabet.

    Returns:
        tuple: ``(model, alphabet)`` as produced by
        ``esm.pretrained.esm_msa1b_t12_100M_UR50S()``, with the model
        switched to evaluation mode.
    """
    network, vocabulary = esm.pretrained.esm_msa1b_t12_100M_UR50S()
    # ``eval()`` hands back the module it was called on, so it can be
    # returned directly.
    return network.eval(), vocabulary
39
+
40
@st.experimental_singleton(show_spinner=False)
def init_db():
    """Open the database connection once and share it between reruns.

    Credentials are read from ``st.secrets`` (``DB_URL``, ``USER``,
    ``PASSWD``).

    Returns:
        tuple: ``(meta_field, client)`` where ``meta_field`` is an empty
        dict for per-session metadata and ``client`` is the connected
        ``Client`` instance.

    Raises:
        ConnectionError: If the server does not report itself alive.
    """
    client = Client(
        url=st.secrets["DB_URL"],
        user=st.secrets["USER"],
        password=st.secrets["PASSWD"],
    )
    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if not client.is_alive():
        raise ConnectionError("Database connection is not alive")
    meta_field = {}
    # Return the connected instance, not the ``Client`` class itself.
    return meta_field, client
54
+
55
+
56
def perdict_contact_visualization(seq, model, batch_converter):
    """Plot the contact map that ``model`` predicts for a single sequence.

    Args:
        seq: Protein sequence as a string.
        model: Model called as
            ``model(tokens, repr_layers=[12], return_contacts=True)``; the
            result must expose a ``"contacts"`` entry.
        batch_converter: Callable that turns ``[(label, sequence)]`` into
            ``(labels, strs, tokens)``.

    Returns:
        The matplotlib figure showing the top-left
        ``len(seq) x len(seq)`` corner of the predicted contact matrix,
        titled with the sequence.
    """
    data = [("protein1", seq)]
    _, _, batch_tokens = batch_converter(data)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[12], return_contacts=True)

    # Exactly one sequence is submitted, so its contact map is entry 0.
    attention_contacts = results["contacts"][0]

    fig, ax = plt.subplots()
    ax.matshow(attention_contacts[: len(seq), : len(seq)])
    fig.suptitle(seq)
    return fig
83
+
84
+
85
def visualize_3D_Coordinates(coords):
    """Draw a chain of 3-D points as a colour-graded poly-line.

    Args:
        coords: Sequence of points; items 0, 1 and 2 of every point are
            used as x, y and z.

    Returns:
        The matplotlib figure containing the 3-D plot.
    """
    xs = [point[0] for point in coords]
    ys = [point[1] for point in coords]
    zs = [point[2] for point in coords]

    figure = plt.figure(figsize=(10, 10))
    axes = figure.add_subplot(111, projection='3d')
    axes.set_title('3D coordinates of $C_{b}$ backbone structure')

    total = len(coords)
    # One segment per consecutive pair; its colour encodes the position of
    # the segment along the chain.
    for begin in range(total - 1):
        end = begin + 2
        axes.plot(
            xs[begin:end],
            ys[begin:end],
            zs[begin:end],
            color=plt.cm.viridis(begin / total),
            marker='o',
        )
    return figure
104
+
105
def esm_search(model, sequnce, batch_converter, top_k=5):
    """Embed a sequence and fetch its nearest neighbours from the database.

    Args:
        model: Model called as
            ``model(tokens, repr_layers=[12], return_contacts=True)``.
        sequnce: Query protein sequence (parameter name kept for
            compatibility with existing callers).
        batch_converter: Callable that turns ``[(label, sequence)]`` into
            ``(labels, strs, tokens)``.
        top_k: Accepted for interface compatibility but not used: the
            query always asks for five neighbours. An existing caller
            passes ``top_k=1`` and then indexes five results, so honouring
            the value here would break it.

    Returns:
        tuple: ``(coords, seqs)`` where ``seqs`` lists the ``seq`` value of
        every returned row and ``coords`` is the ``coords`` value of the
        last returned row (an empty list when nothing is returned).
    """
    data = [("protein1", sequnce)]
    _, _, batch_tokens = batch_converter(data)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[12], return_contacts=True)
    token_representations = results["representations"][12]

    # Query vector: the entry at index [0][0][0] of the layer-12 output.
    # Presumably (batch, alignment row, token, dim) for the MSA model --
    # TODO confirm.
    token_list = token_representations.tolist()[0][0][0]

    client = Client(
        url=st.secrets["DB_URL"],
        user=st.secrets["USER"],
        password=st.secrets["PASSWD"],
    )

    # The loop below reads ``seq`` and ``coords`` from every row, so both
    # must be part of the projection.
    # NOTE(review): assumes the table has ``seq`` and ``coords`` columns --
    # confirm against the schema.
    query = (
        "SELECT seq, coords, activity, distance('topK=5')(representations, "
        + str(token_list)
        + ") as dist FROM default.esm_protein_indexer_768"
    )
    result = client.fetch(query)

    # Pre-bind so that an empty result set yields empty values instead of
    # an UnboundLocalError.
    result_temp_coords = []
    result_temp_seq = []
    for row in result:
        # NOTE(review): only the coordinates of the last row survive the
        # loop -- confirm that this is the intended hit.
        result_temp_coords = row['coords']
        result_temp_seq.append(row['seq'])

    return result_temp_coords, result_temp_seq
129
+
130
@st.experimental_singleton(show_spinner=False)
def _load_esm2():
    """Load the ESM-2 650M model once and reuse it across reruns."""
    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
    model.eval()
    return model, alphabet


def KNN_search(sequence):
    """Predict activity as the mean activity of the nearest stored proteins.

    The sequence is embedded with ESM-2 (layer 33), the embedding of the
    first token position is used as the query vector, and the ``activity``
    values of the ten nearest rows in ``default.esm_protein_indexer`` are
    averaged.

    Args:
        sequence: Protein sequence as a string.

    Returns:
        The arithmetic mean of the neighbours' ``activity`` values.

    Raises:
        ValueError: If the database returns no neighbours.
    """
    model, alphabet = _load_esm2()
    batch_converter = alphabet.get_batch_converter()

    data = [("protein1", sequence)]
    _, _, batch_tokens = batch_converter(data)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33])
    token_representations = results["representations"][33]
    # First batch entry, first token position.
    token_list = token_representations[0, 0].tolist()

    client = Client(
        url=st.secrets["DB_URL"],
        user=st.secrets["USER"],
        password=st.secrets["PASSWD"],
    )
    result = client.fetch(
        "SELECT activity, distance('topK=10')(representations, "
        + str(token_list)
        + ") as dist FROM default.esm_protein_indexer"
    )

    activities = [row['activity'] for row in result]
    if not activities:
        raise ValueError("No neighbours were returned for the query sequence")
    return sum(activities) / len(activities)
154
+
155
+
156
+
157
def train_test_split_PCA(
    dataset,
    fasta_path='/root/xuying_experiments/esm-main/P62593.fasta',
    emb_path='/root/xuying_experiments/esm-main/P62593_reprs',
    repr_layer=34,
    train_size=0.8,
    random_state=42,
):
    """Load pre-computed embeddings and split them into train and test sets.

    Every FASTA header is expected to end with ``|<target value>``; the
    matching embedding is read from ``<emb_path>/<header>.pt``.

    Args:
        dataset: Unused; kept so existing callers keep working.
        fasta_path: FASTA file whose headers carry the target values.
        emb_path: Directory with one ``<header>.pt`` file per FASTA entry.
        repr_layer: Key into each file's ``'mean_representations'`` mapping.
        train_size: Fraction of samples assigned to the training split.
        random_state: Seed handed to ``train_test_split``.

    Returns:
        tuple: ``(Xs_train, Xs_test, ys_train, ys_test)``.
    """
    ys = []
    Xs = []
    for header, _seq in esm.data.read_fasta(fasta_path):
        scaled_effect = header.split('|')[-1]
        ys.append(float(scaled_effect))
        # NOTE: ``torch.load`` unpickles the file; only use trusted files.
        embs = torch.load(f'{emb_path}/{header}.pt')
        Xs.append(embs['mean_representations'][repr_layer])

    Xs = torch.stack(Xs, dim=0).numpy()
    return train_test_split(
        Xs, ys, train_size=train_size, random_state=random_state
    )
173
+
174
def PCA_visual(Xs_train, ys_train=None, num_pca_components=60):
    """Scatter-plot the first two principal components of the embeddings.

    Args:
        Xs_train: 2-D array of shape ``(n_samples, n_features)``.
        ys_train: Optional per-sample values used to colour the points.
            When omitted the points are drawn uncoloured and no colour
            bar is added.
        num_pca_components: Upper bound on the number of PCA components;
            it is reduced to what the data allows.

    Returns:
        The matplotlib figure containing the scatter plot.

    Raises:
        ValueError: If the data cannot provide two principal components.
    """
    n_samples, n_features = np.shape(Xs_train)
    # PCA cannot extract more components than min(n_samples, n_features).
    n_components = min(num_pca_components, n_samples, n_features)
    if n_components < 2:
        raise ValueError(
            "At least two samples and two features are required for the plot"
        )

    Xs_train_pca = PCA(n_components).fit_transform(Xs_train)

    fig, ax = plt.subplots(figsize=(4, 4))
    ax.set_title('Visualize Embeddings')
    sc = ax.scatter(
        Xs_train_pca[:, 0], Xs_train_pca[:, 1], c=ys_train, marker='.'
    )
    ax.set_xlabel('PCA first principal component')
    ax.set_ylabel('PCA second principal component')
    if ys_train is not None:
        plt.colorbar(sc, ax=ax, label='Variant Effect')

    return fig
187
+
188
def KNN_trainings(Xs_train, Xs_test, ys_train, ys_test):
    """Grid-search a PCA + k-nearest-neighbours regression pipeline.

    Args:
        Xs_train: Training features.
        Xs_test: Not used by this function.
        ys_train: Training targets.
        ys_test: Not used by this function.

    Returns:
        A DataFrame with the five best-ranked parameter combinations and
        the columns ``param_model``, ``params``,
        ``param_model__algorithm``, ``mean_test_score`` and
        ``rank_test_score``.
    """
    n_components = 60
    search_space = [
        {
            'model': [KNeighborsRegressor()],
            'model__n_neighbors': [5, 10],
            'model__weights': ['uniform', 'distance'],
            'model__algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'model__leaf_size': [15, 30],
            'model__p': [1, 2],
        }
    ]

    estimator = Pipeline(
        steps=(
            ('pca', PCA(n_components)),
            ('model', KNeighborsRegressor()),
        )
    )

    # Only one model family is searched, so the search runs exactly once.
    print(KNeighborsRegressor)
    search = GridSearchCV(
        estimator=estimator,
        param_grid=search_space,
        scoring='r2',
        verbose=1,
        n_jobs=-1,  # use all available cores
    )
    search.fit(Xs_train, ys_train)

    cv_table = pd.DataFrame.from_dict(search.cv_results_)
    best_five = pd.DataFrame(cv_table.sort_values('rank_test_score')[:5])

    wanted_columns = [
        'param_model',
        'params',
        'param_model__algorithm',
        'mean_test_score',
        'rank_test_score',
    ]
    return best_five[wanted_columns]
231
+
232
+
233
# Pull in the Roboto web font for the page.
st.markdown("""
<link
rel="stylesheet"
href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap"
/>
""", unsafe_allow_html=True)

# Banner texts shown at the top of the page. The text has no placeholders,
# so a plain string is used instead of an f-string.
messages = [
    """
Evolutionary-scale prediction of atomic level protein structure

ESM is a high-capacity Transformer trained with protein sequences \
as input. After training, the secondary and tertiary structure, \
function, homology and other information of the protein are in the feature representation output by the model. \
Check out https://esmatlas.com/ for more information.

We have 120k proteins features stored in our database.

The app uses the [MyScale](https://myscale.com/) to store and query protein sequence
using vector search.
"""
]
255
@st.experimental_singleton(show_spinner=False)
def init_random_query(dims=768):
    """Create a random query vector once and share it between reruns.

    Args:
        dims: Length of the vector. The original code referenced a
            constant ``DIMS`` that is not defined anywhere in this file.
            NOTE(review): the default of 768 is taken from the name of
            the ``esm_protein_indexer_768`` table -- confirm.

    Returns:
        tuple: ``(xq, xq_copy)`` -- the vector as a list of floats and an
        independent copy of it.
    """
    xq = np.random.rand(dims).tolist()
    return xq, xq.copy()
259
+
260
+
261
# ---------------------------------------------------------------------------
# Page body.
#
# The original script had two near-identical copies of the page (one for the
# first run, one for later runs). They are merged here: the model is stored
# in the session state as soon as it is loaded, and each application mode is
# rendered by one helper.
# ---------------------------------------------------------------------------

CAS9_EXAMPLE = 'GSGHMDKKYSIGLAIGTNSVGWAVITDEYKVPSKKFKVLGNTDRHSIKKNLIGALLFDSGETAEATRLKRTARRRYTRRKNRILYLQEIFSNEMAKV'
PETASE_EXAMPLE = 'MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ'

APP_OPTIONS = (
    'self-contact prediction',
    'search the database',
    'activity prediction',
    'PDB viewer',
)

# PDB entries used as stand-in structures for the search results.
PLACEHOLDER_PDB_IDS = [
    '1A2C', '1BML', '1D5M', '1D5X', '1D5Z', '1D6E', '1DEE', '1E9F', '1FC2',
    '1FCC', '1G4U', '1GZS', '1HE1', '1HEZ', '1HQR', '1HXY', '1IBX', '1JBU',
    '1JWM', '1JWS',
]
# Inclusive index ranges into PLACEHOLDER_PDB_IDS, one per result rank.
PLACEHOLDER_INDEX_RANGES = [(14, 18), (0, 4), (4, 8), (4, 8), (4, 8)]

CONTACT_EXPLANATION = """<span style="word-wrap:break-word;">Contact prediction is based on a logistic regression over the model's attention maps. This methodology is based on ICLR 2021 paper, Transformer protein language models are unsupervised structure learners. (Rao et al. 2020)The MSA Transformer (ESM-MSA-1) takes a multiple sequence alignment (MSA) as input, and uses the tied row self-attention maps in the same way.</span>
"""


def _sequence_input():
    """Render the sequence text box and example buttons; return the choice."""
    sequence = st.text_input('protein sequence', '')
    st.write('Try an example:')
    # Both buttons are always rendered so neither disappears on the rerun
    # triggered by the other.
    cas9_clicked = st.button('Cas9 Enzyme')
    petase_clicked = st.button('PETase')
    if cas9_clicked:
        return CAS9_EXAMPLE
    if petase_clicked:
        return PETASE_EXAMPLE
    return sequence


def _render_contact_prediction(model, batch_converter):
    """Render the 'self-contact prediction' mode."""
    sequence = _sequence_input()
    if not sequence:
        return
    st.write('you have entered: ', sequence)
    st.pyplot(perdict_contact_visualization(sequence, model, batch_converter))
    expander = st.expander("See explanation")
    expander.markdown(CONTACT_EXPLANATION, unsafe_allow_html=True)


def _render_database_search(model, batch_converter):
    """Render the 'search the database' mode."""
    sequence = _sequence_input()
    if not sequence:
        return
    st.write('you have entered: ', sequence)
    _, hits = esm_search(model, sequence, batch_converter, top_k=5)
    st.text('search result (top 5): ')
    if not hits:
        st.warning('No matching sequences were found.')
        return

    # NOTE(review): the original compared the selection with the first hit,
    # but the lost indentation hides what that condition guarded. All hits
    # are listed here whatever is selected.
    st.selectbox('top5 sequence', tuple(hits))

    for hit, (low, high) in zip(hits, PLACEHOLDER_INDEX_RANGES):
        st.write(hit)
        # NOTE(review): the structure shown is a randomly picked PDB entry,
        # not one derived from the hit -- kept from the original.
        protein = PLACEHOLDER_PDB_IDS[random.randint(low, high)]
        xyzview = py3Dmol.view(query='pdb:' + protein)
        xyzview.setStyle({'stick': {'color': 'spectrum'}})
        showmol(xyzview, height=500, width=800)


def _render_activity_prediction():
    """Render the 'activity prediction' mode."""
    st.markdown('we predict the biological activity of mutations of a protein, using fixed embeddings from ESM.')
    sequence = _sequence_input()
    if not sequence:
        return
    st.write('you have entered: ', sequence)
    res_knn = KNN_search(sequence)
    st.subheader('KNN predictor result')
    st.markdown("Activity prediction: " + str(res_knn))


def _render_pdb_viewer():
    """Render the 'PDB viewer' mode."""
    id_PDB = st.text_input('enter PDB ID', '')
    residues_marker = st.text_input('residues class', '')
    st.write('Try an example:')
    if st.button('PDB ID: 1A2C / residues class: ALA'):
        id_PDB = '1A2C'
        residues_marker = 'ALA'

    st.subheader('PDB viewer')
    # Nothing to show until an ID has been entered.
    if id_PDB:
        if residues_marker:
            showmol(render_pdb_resn(viewer=render_pdb(id=id_PDB), resn_lst=[residues_marker]))
        else:
            showmol(render_pdb(id=id_PDB))

    expander = st.expander("See explanation")
    expander.markdown("""
A PDB ID is a unique 4-character code for each entry in the Protein Data Bank. The first character must be a number between 1 and 9, and the remaining three characters can be letters or numbers.
see https://www.rcsb.org/ for more information.
""")


with st.spinner("Connecting DB..."):
    st.session_state.meta, client = init_db()

with st.spinner("Loading Models..."):
    # Load the model once per session and keep it, together with its batch
    # converter, in the session state.
    if 'xq' not in st.session_state:
        model, alphabet = init_esm()
        st.session_state['xq'] = model
        st.session_state['batch'] = alphabet.get_batch_converter()
        st.session_state.query_num = 0

if st.session_state.query_num < len(messages):
    msg = messages[0]
else:
    msg = messages[-1]

with st.container():
    st.title("Evolutionary Scale Modeling")
    st.info(msg)
    option = st.selectbox('Application options', APP_OPTIONS)

    st.session_state.db_name_ref = 'default.esm_protein'
    if option == 'self-contact prediction':
        _render_contact_prediction(st.session_state['xq'], st.session_state['batch'])
    elif option == 'search the database':
        _render_database_search(st.session_state['xq'], st.session_state['batch'])
    elif option == 'activity prediction':
        _render_activity_prediction()
    elif option == 'PDB viewer':
        _render_pdb_viewer()
468
+
469
+
470
+
471
+
472
+
473
+
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+