Dinesh1102's picture
Update app.py
de944bf verified
import tqdm
from Bio import SeqIO
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import json
from typing import Dict
from collections import Counter
import random
import obonet
from transformers import T5Tokenizer, T5EncoderModel
import torch
import re
import gradio as gr
# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# ProtT5 tokenizer. Only the encoder below is moved to `device`; the
# tokenizer is used as loaded.
tokenizer = T5Tokenizer.from_pretrained(
    'Rostlab/prot_t5_xl_half_uniref50-enc',
    do_lower_case=False,
)

# ProtT5 encoder from the same checkpoint, placed on the selected device.
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc")
model = model.to(device)
def get_embeddings(seq):
    """Return one mean-pooled ProtT5 embedding for a protein sequence.

    The residues U, Z, O and B are replaced by X, the residues are separated
    by single spaces, and the result is tokenized and run through the
    module-level encoder ``model``. The last hidden state of the (single)
    sequence in the batch is then averaged over the token axis.

    Args:
        seq: Amino-acid sequence as a plain string.

    Returns:
        A 1-D torch tensor: the mean of the per-token hidden states.
    """
    cleaned = re.sub(r"[UZOB]", "X", seq)
    spaced = " ".join(cleaned)

    encoded = tokenizer.batch_encode_plus(
        [spaced],
        add_special_tokens=True,
        padding="longest",
    )
    token_ids = torch.tensor(encoded['input_ids']).to(device)
    mask = torch.tensor(encoded['attention_mask']).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        encoder_out = model(input_ids=token_ids, attention_mask=mask)

    # NOTE(review): the mean runs over every token position returned for the
    # sequence. With add_special_tokens=True that presumably includes the
    # end-of-sequence token -- confirm this matches how the embeddings used
    # to train the downstream classifier were pooled before changing it.
    per_token = encoder_out.last_hidden_state[0]
    return per_token.mean(dim=0)
# Width of one per-protein embedding row and of the classifier's input layer.
_EMBEDDING_DIM = 1024
# Number of sigmoid outputs of the classifier.
_NUM_LABELS = 1500
# A label is written as 1 unless its predicted probability is below this.
_DECISION_THRESHOLD = 0.4


def _embed_fasta(fasta_path):
    """Embed every record of a FASTA file in a single pass.

    Returns an ``(n_records, 1024)`` float64 array whose rows follow the
    record order of the file. Raises ``ValueError`` if the file holds no
    FASTA records.
    """
    with open(fasta_path, "r") as handle:
        vectors = [
            get_embeddings(str(record.seq)).detach().cpu().numpy()
            for record in SeqIO.parse(handle, "fasta")
        ]
    if not vectors:
        raise ValueError("The uploaded file contains no FASTA records.")
    return np.asarray(vectors, dtype=np.float64)


def _build_classifier():
    """Build the 5-hidden-layer MLP and load its weights from disk."""
    layers = [tf.keras.layers.BatchNormalization(input_shape=[_EMBEDDING_DIM])]
    for _ in range(5):
        layers.append(tf.keras.layers.Dense(units=512, activation='relu'))
        layers.append(tf.keras.layers.Dropout(0.2))
    layers.append(tf.keras.layers.Dense(units=_NUM_LABELS, activation='sigmoid'))

    classifier = tf.keras.Sequential(layers)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['binary_accuracy', tf.keras.metrics.AUC()],
    )
    classifier.load_weights('./model5layer.weights.h5')
    return classifier


def predict(fasta_file):
    """Predict function labels for every protein in a FASTA file.

    Each sequence is embedded with ``get_embeddings``, the embeddings are
    scored by the MLP classifier, and two CSV files are written to the
    working directory, both with one row per sequence (in file order) and
    one column per label taken from the header of ``./labels.csv``:

    * ``predictions.csv`` -- 0/1 calls (1 unless the probability is < 0.4).
    * ``decimal.csv`` -- the raw probabilities.

    NOTE(review): neither CSV carries the sequence ids, so rows can only be
    matched to the input by position; and the fixed output file names mean
    concurrent requests would overwrite each other -- confirm whether either
    matters for this deployment.

    Args:
        fasta_file: Path to the FASTA file, or a file-like object exposing
            that path as ``.name`` (both are handed out by Gradio file
            inputs, depending on version/configuration).

    Returns:
        The string ``"predictions.csv"``.

    Raises:
        ValueError: If no file was supplied or it contains no FASTA records.
    """
    if fasta_file is None:
        raise ValueError("No FASTA file was provided.")
    # A plain str has no `.name`, so path strings pass through unchanged.
    fasta_path = getattr(fasta_file, "name", fasta_file)

    embeds = _embed_fasta(fasta_path)
    # Named `classifier` so it cannot be confused with the module-level
    # ProtT5 encoder `model` used by get_embeddings.
    classifier = _build_classifier()

    # Only the column names are used, so read just the header row.
    label_columns = (
        pd.read_csv('./labels.csv', nrows=0)
        .drop(columns='Unnamed: 0', errors='ignore')
        .columns
    )

    probabilities = classifier.predict(embeds)

    # Vectorised form of "0 if p < 0.4 else 1" applied to every label.
    binary_calls = np.where(probabilities < _DECISION_THRESHOLD, 0, 1)
    predictions_df = pd.DataFrame(binary_calls, columns=label_columns)
    # Widen to float64 before writing so the probabilities are emitted with
    # full float64 text precision.
    decimal_df = pd.DataFrame(
        np.asarray(probabilities, dtype=np.float64), columns=label_columns
    )

    predictions_df.to_csv("predictions.csv", index=False)  # output csv
    decimal_df.to_csv("decimal.csv", index=False)
    return "predictions.csv"
# Build the Gradio UI around `predict` (one file in, one file out) and start
# serving it with a share link requested and debug mode enabled.
gr.Interface(
    fn=predict,
    inputs="file",
    outputs="file",
    title='Multi-label Protein Function Prediction',
    description="Upload a fasta file containing protein sequence",
).launch(debug=True, share=True)