File size: 5,453 Bytes
ab90cdb edd898f 22619e0 ab90cdb edd898f 33998ff 7895fd5 edd898f 33998ff 907da65 33998ff edd898f 33998ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
---
library_name: peft
license: mit
language:
- en
metrics:
- f1
- precision
- recall
tags:
- esm
- esm2
- protein language model
- biology
---
## Training procedure
This model was trained with Hugging Face's Parameter-Efficient Fine-Tuning (PEFT) library. In particular,
a Low-Rank Adaptation (LoRA) was trained on top of the model
[AmelieSchreiber/esm2_t6_8M_finetuned_cafa5](https://huggingface.co/AmelieSchreiber/esm2_t6_8M_finetuned_cafa5).
```
Epoch 3/3
Training Loss: 0.0152
Validation Loss: 0.0153
Val F1 Score: 0.7361
Micro-Average Precision: 0.9977
Micro-Average Recall: 0.2264
Micro-Average ROC AUC: 0.8894
```
### Framework versions
- PEFT 0.4.0
## Using the Model
To use the model, download the data [from here](https://huggingface.co/datasets/AmelieSchreiber/cafa_5),
adjust the file paths in the code below to their locations on your machine, and run:
```python
import os
import numpy as np
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification, AdamW
from torch.nn.functional import binary_cross_entropy_with_logits
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from accelerate import Accelerator
from Bio import SeqIO
# Step 1: Data Preprocessing
fasta_file = "data/Train/train_sequences.fasta"
tsv_file = "data/Train/train_terms.tsv"
fasta_data = {}
tsv_data = {}
# Map each FASTA record id to its sequence as a plain string.
for record in SeqIO.parse(fasta_file, "fasta"):
    fasta_data[record.id] = str(record.seq)
# Map the first tab-separated column of each row to the remaining columns.
# NOTE(review): a later row with the same first column replaces the earlier
# one, and every remaining column (not only a term id) ends up in
# unique_terms -- confirm this matches the layout of the TSV file.
with open(tsv_file, 'r') as f:
    for line in f:
        parts = line.strip().split("\t")
        tsv_data[parts[0]] = parts[1:]
# NOTE(review): the order of unique_terms comes from iterating a set, so it is
# not guaranteed to be the same between interpreter runs; the index -> term
# mapping used for prediction must match the one used at training time.
unique_terms = list(set(term for terms in tsv_data.values() for term in terms))
def parse_fasta(file_path):
    """Parse a FASTA file and return its sequences as a list of strings.

    Lines starting with ">" are treated as record headers and discarded. All
    other lines are stripped of surrounding whitespace and concatenated into
    the sequence of the current record. Records that end up with an empty
    sequence are skipped.

    Args:
        file_path: Path to the FASTA file to read.

    Returns:
        A list of sequence strings, in the order they appear in the file.
    """
    sequences = []
    chunks = []  # stripped lines of the record currently being read
    with open(file_path, 'r') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            if line.startswith(">"):
                # A header closes the previous record (if it had any residues).
                sequence = "".join(chunks)
                if sequence:
                    sequences.append(sequence)
                chunks = []
            else:
                chunks.append(line.strip())
    # Flush the final record, which has no header after it.
    sequence = "".join(chunks)
    if sequence:
        sequences.append(sequence)
    return sequences
# Parse the test superset FASTA file into a list of sequence strings
# (adjust the path to where the file lives on your machine).
fasta_file_path = "data/Test/testsuperset.fasta"
protein_sequences = parse_fasta(fasta_file_path)
# Uncomment to inspect the first 3 sequences for verification:
# protein_sequences[:3]
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support
# 1. Parsing the go-basic.obo file (Assuming this is still needed)
def parse_obo_file(file_path):
    """Parse the ``[Term]`` stanzas of an OBO file into a list of dicts.

    Each returned dict may contain the keys ``"id"``, ``"name"``,
    ``"namespace"`` and ``"definition"``, depending on which tags the stanza
    provides. The definition is the text between the first pair of double
    quotes on the ``def:`` line, or the whole value when it is not quoted.

    Args:
        file_path: Path to the OBO file (for example ``go-basic.obo``).

    Returns:
        A list with one dict per ``[Term]`` stanza, in file order.
    """
    with open(file_path, 'r') as f:
        stanzas = f.read().split("[Term]")

    # OBO tag prefix -> key used in the returned dicts.
    simple_tags = (("id:", "id"), ("name:", "name"), ("namespace:", "namespace"))

    terms = []
    for stanza in stanzas[1:]:  # stanzas[0] is whatever precedes the first [Term]
        term = {}
        for line in stanza.strip().split("\n"):
            if line.startswith("["):
                # A different stanza type (e.g. [Typedef]) follows this term;
                # reading on would overwrite the term's own id/name.
                break
            if line.startswith("def:"):
                value = line[len("def:"):]
                quoted = value.split('"')
                # Fall back to the raw value when the definition is unquoted.
                term["definition"] = quoted[1] if len(quoted) > 1 else value.strip()
                continue
            for tag, key in simple_tags:
                if line.startswith(tag):
                    term[key] = line[len(tag):].strip()
                    break
        terms.append(term)
    return terms
# Path to the Gene Ontology file; replace it with the location of your copy.
obo_file_path = "data/Train/go-basic.obo"
# Pass the variable rather than repeating the literal, so that editing the
# path above is enough.
parsed_terms = parse_obo_file(obo_file_path)
# 2. Load the saved model and tokenizer
# Assuming the model path provided is correct
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig
# Load the tokenizer and model
model_id = "AmelieSchreiber/esm2_t6_8M_lora_cafa5"  # Replace with your Hugging Face hub model name
# Load the tokenizer once; it is reused under both names used in this example.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# First, we load the underlying base model.
# NOTE(review): the base weights are requested from the same repo id as the
# LoRA adapter, and the imported PeftConfig is never used -- confirm whether
# the base model should instead be loaded from
# PeftConfig.from_pretrained(model_id).base_model_name_or_path.
base_model = AutoModelForSequenceClassification.from_pretrained(model_id)
# Then, we wrap the base model with the adapter using PEFT.
model = PeftModel.from_pretrained(base_model, model_id)
loaded_model = model
loaded_tokenizer = tokenizer
# 3. The predict_protein_function function
def predict_protein_function(sequence, model, tokenizer, go_terms, threshold=0.05):
    """Predict the GO term names for a protein sequence.

    The sequence is tokenized (truncated to at most 1022 tokens), run through
    the model without gradient tracking, and the logits are passed through a
    sigmoid. Every label whose probability exceeds ``threshold`` is translated
    to a term id via the module-level ``unique_terms`` list and then to a
    human-readable name via ``go_terms``.

    Args:
        sequence: Protein sequence string passed to the tokenizer.
        model: Sequence-classification model; its output must expose ``logits``.
        tokenizer: Tokenizer matching ``model``.
        go_terms: Iterable of dicts with ``"id"`` and ``"name"`` keys, such as
            the output of ``parse_obo_file``. Dicts missing either key are
            ignored.
        threshold: Probability above which a label counts as predicted.

    Returns:
        A list of term names for the predicted labels. Labels whose id has no
        entry in ``go_terms`` are omitted.
    """
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1022)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.sigmoid(outputs.logits)
    # Second element of torch.where = indices along dim 1.
    # Assumes logits are shaped (batch, num_labels) -- TODO confirm.
    predicted_indices = torch.where(predictions > threshold)[1].tolist()

    # Build the id -> name lookup once instead of rescanning go_terms for every
    # predicted index. setdefault keeps the first occurrence of an id, matching
    # a first-match linear scan.
    names_by_id = {}
    for term in go_terms:
        if "id" in term and "name" in term:
            names_by_id.setdefault(term["id"], term["name"])

    functions = []
    for idx in predicted_indices:
        # unique_terms must be in the same order that was used during training.
        name = names_by_id.get(unique_terms[idx])
        if name is not None:
            functions.append(name)
    return functions
# 4. Predicting protein function for the sequences in the FASTA file
protein_functions = {}
for seq in protein_sequences[:20]:  # Using only the first 20 sequences for demonstration
    predicted_functions = predict_protein_function(seq, loaded_model, loaded_tokenizer, parsed_terms)
    # The key is the first 20 characters of the sequence; sequences sharing
    # that prefix map to the same key, and the later one replaces the earlier.
    protein_functions[seq[:20] + "..."] = predicted_functions
# Bare expression: displays the result when run as the last line of a notebook cell.
protein_functions
``` |