Dinesh1102 committed on
Commit
70ee585
1 Parent(s): 62caeae

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tqdm
2
+ from Bio import SeqIO
3
+ import numpy as np
4
+ import pandas as pd
5
+ import tensorflow as tf
6
+ import os
7
+ import json
8
+ from typing import Dict
9
+ from collections import Counter
10
+ import random
11
+ import obonet
12
+ from transformers import T5Tokenizer, T5EncoderModel
13
+ import torch
14
+ import re
15
+ import gradio as gr
16
+
17
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The tokenizer and the encoder are both loaded from this one checkpoint.
_PROT_T5_CHECKPOINT = "Rostlab/prot_t5_xl_half_uniref50-enc"

# Tokenizer: residue letters are case-sensitive, so lower-casing is disabled.
tokenizer = T5Tokenizer.from_pretrained(_PROT_T5_CHECKPOINT, do_lower_case=False)

# Encoder-only T5 model, moved onto the selected device.
model = T5EncoderModel.from_pretrained(_PROT_T5_CHECKPOINT).to(device)
24
+
25
def get_embeddings(seq):
    """Return one mean-pooled encoder embedding for a protein sequence.

    The residue codes U, Z, O and B are replaced with X, the residues are
    separated by single spaces, and the result is run through the
    module-level ``tokenizer`` and ``model``.

    Args:
        seq: Amino-acid sequence as a string of one-letter residue codes.

    Returns:
        A ``torch.Tensor`` on ``device``: the mean, over all token positions,
        of the encoder's last hidden state for this sequence.
    """
    residues = re.sub(r"[UZOB]", "X", seq)
    batch = [" ".join(residues)]

    encoded = tokenizer.batch_encode_plus(
        batch, add_special_tokens=True, padding="longest"
    )
    token_ids = torch.tensor(encoded["input_ids"]).to(device)
    mask = torch.tensor(encoded["attention_mask"]).to(device)

    # Inference only, so no autograd graph is recorded.
    with torch.no_grad():
        output = model(input_ids=token_ids, attention_mask=mask)

    # NOTE: the average runs over every position of the single batch entry;
    # nothing is sliced off, so the tokens added by add_special_tokens=True
    # contribute to the mean as well.
    return output.last_hidden_state[0].mean(dim=0)
43
+
44
def _embed_fasta(path):
    """Embed every record of the FASTA file at *path* with ``get_embeddings``.

    Args:
        path: Filesystem path of the FASTA file.

    Returns:
        A ``(num_records, 1024)`` NumPy array holding one embedding per
        record, in file order.
    """
    # Parse once and keep the records instead of reading the file a second
    # time just to count them.
    records = list(SeqIO.parse(path, "fasta"))
    embeds = np.zeros((len(records), 1024))
    for row, record in enumerate(records):
        embeds[row] = get_embeddings(str(record.seq)).detach().cpu().numpy()
    return embeds


def _build_classifier(num_of_labels=1500):
    """Recreate the dense multi-label network and load its saved weights."""
    classifier = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(input_shape=[1024]),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(units=num_of_labels, activation='sigmoid'),
    ])
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['binary_accuracy', tf.keras.metrics.AUC()],
    )
    classifier.load_weights('./my_model.weights.h5')
    return classifier


def predict(filepath):
    """Predict function labels for every protein in an uploaded FASTA file.

    Each record is embedded, scored by the dense classifier, and the scores
    are thresholded at 0.4. Two files are written to the current working
    directory: ``predictions.csv`` (0/1 per label) and ``decimal.csv`` (the
    raw scores). Neither file has an identifier column; rows follow the
    order of the records in the FASTA file.

    Args:
        filepath: Path of the FASTA file, or a file-like wrapper that exposes
            the path as ``.name``.

    Returns:
        The string ``"predictions.csv"``.

    Raises:
        ValueError: If the file contains no FASTA records.
    """
    # Some Gradio versions pass a temp-file wrapper rather than a path
    # string; unwrap it so the file can be opened by name either way.
    path = filepath
    if not isinstance(filepath, (str, os.PathLike)):
        path = getattr(filepath, "name", filepath)

    embeds = _embed_fasta(path)
    if embeds.shape[0] == 0:
        # Fail with a clear message instead of handing Keras an empty batch.
        raise ValueError("No FASTA records found in the uploaded file.")

    classifier = _build_classifier()

    # Only the column names are needed, so skip reading the data rows.
    label_columns = (
        pd.read_csv('./labels.csv', nrows=0)
        .drop(columns='Unnamed: 0')
        .columns
    )

    scores = classifier.predict(embeds)
    # Same rule as `0 if score < 0.4 else 1`, applied to the whole array.
    binary = np.where(scores < 0.4, 0, 1)

    predictions_df = pd.DataFrame(binary, columns=label_columns)
    decimals_df = pd.DataFrame(
        np.asarray(scores, dtype=np.float64), columns=label_columns
    )

    predictions_df.to_csv("predictions.csv", index=False)
    decimals_df.to_csv("decimal.csv", index=False)
    return "predictions.csv"
112
+
113
# Wire predict() into a file-in / file-out web UI and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs="file",
    outputs="file",
    title='Protein Function Prediction using fasta file,upload a fasta file',
)
demo.launch()