AmelieSchreiber's picture
Update README.md
907da65
|
raw
history blame
5.45 kB
---
library_name: peft
license: mit
language:
- en
metrics:
- f1
- precision
- recall
tags:
- esm
- esm2
- protein language model
- biology
---
## Training procedure
This model was trained with Hugging Face's Parameter-Efficient Fine-Tuning (PEFT) library. In particular,
a Low-Rank Adaptation (LoRA) was trained on top of the model
[AmelieSchreiber/esm2_t6_8M_finetuned_cafa5](https://huggingface.co/AmelieSchreiber/esm2_t6_8M_finetuned_cafa5).
```
Epoch 3/3
Training Loss: 0.0152
Validation Loss: 0.0153
Val F1 Score: 0.7361
Micro-Average Precision: 0.9977
Micro-Average Recall: 0.2264
Micro-Average ROC AUC: 0.8894
```
### Framework versions
- PEFT 0.4.0
## Using the Model
To use the model, download the data [from here](https://huggingface.co/datasets/AmelieSchreiber/cafa_5),
adjust the file paths in the code below to match their locations on your machine, and run:
```python
import os
import numpy as np
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification, AdamW
from torch.nn.functional import binary_cross_entropy_with_logits
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from accelerate import Accelerator
from Bio import SeqIO
# Step 1: Data Preprocessing
fasta_file = "data/Train/train_sequences.fasta"
tsv_file = "data/Train/train_terms.tsv"

# Map each FASTA record id to its sequence as a plain string.
fasta_data = {
    record.id: str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")
}

# Map the first tab-separated column of every row to the list of remaining
# columns. A later row with the same first column replaces the earlier entry.
tsv_data = {}
with open(tsv_file, 'r') as f:
    for line in f:
        parts = line.strip().split("\t")
        tsv_data[parts[0]] = parts[1:]

# Build the label vocabulary in sorted order. Iterating a set of strings
# yields a different order in each interpreter run (string hashing is
# randomised), so list(set(...)) cannot provide a stable index -> term mapping.
# NOTE(review): predict_protein_function indexes this list with model output
# positions, so the order must match the label order used at training time -
# confirm against the training script.
unique_terms = sorted({term for terms in tsv_data.values() for term in terms})
def parse_fasta(file_path):
    """
    Parse a FASTA file and return its sequences as a list of strings.

    Lines starting with ">" delimit records; the header text itself is
    discarded. The remaining lines of a record are stripped of surrounding
    whitespace and concatenated. Records without any sequence characters are
    skipped, so the result only contains non-empty strings, in file order.

    Args:
        file_path: Path to the FASTA file.

    Returns:
        A list with one sequence string per non-empty record.
    """
    sequences = []
    chunks = []  # stripped, non-empty lines of the record currently being read
    with open(file_path, 'r') as f:
        # Iterate over the file lazily instead of loading every line up front.
        for line in f:
            if line.startswith(">"):
                # A header closes the previous record, if it had any content.
                if chunks:
                    sequences.append("".join(chunks))
                    chunks = []
            else:
                piece = line.strip()
                if piece:
                    chunks.append(piece)
    # The final record has no header after it, so flush it explicitly.
    if chunks:
        sequences.append("".join(chunks))
    return sequences
# Parse the provided FASTA file
# parse_fasta keeps only the sequence strings (header lines are discarded), so
# from here on each protein is identified by its position in the list.
fasta_file_path = "data/Test/testsuperset.fasta"
protein_sequences = parse_fasta(fasta_file_path)
# protein_sequences[:3] # Displaying the first 3 sequences for verification
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support
# 1. Parsing the go-basic.obo file (Assuming this is still needed)
def parse_obo_file(file_path):
    """
    Parse the [Term] stanzas of an OBO file into a list of dictionaries.

    Each returned dictionary holds whichever of these keys were present in
    the stanza: "id", "name", "namespace" and "definition" (the first
    double-quoted string on the "def:" line).

    Args:
        file_path: Path to the OBO file.

    Returns:
        One dictionary per [Term] stanza, in file order.
    """
    with open(file_path, 'r') as f:
        stanzas = f.read().split("[Term]")

    terms = []
    # stanzas[0] is whatever precedes the first [Term] header; skip it.
    for stanza in stanzas[1:]:
        term = {}
        for line in stanza.strip().split("\n"):
            if line.startswith("["):
                # Header of a following non-Term stanza (e.g. [Typedef]).
                # Stop here so its id:/name: tags do not overwrite this term.
                break
            # Slice off the tag prefix rather than splitting on it, so a value
            # that happens to contain the tag text is kept whole.
            if line.startswith("id:"):
                term["id"] = line[len("id:"):].strip()
            elif line.startswith("name:"):
                term["name"] = line[len("name:"):].strip()
            elif line.startswith("namespace:"):
                term["namespace"] = line[len("namespace:"):].strip()
            elif line.startswith("def:"):
                quoted = line[len("def:"):].split('"')
                # A def: line without a quoted string carries no definition.
                if len(quoted) > 1:
                    term["definition"] = quoted[1]
        terms.append(term)
    return terms
# Let's assume the path to go-basic.obo is as follows (please modify if different)
obo_file_path = "data/Train/go-basic.obo"
# Pass the variable rather than repeating the literal, so editing the path
# above actually takes effect.
parsed_terms = parse_obo_file(obo_file_path)

# 2. Load the saved model and tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig

# Load the tokenizer and model
model_id = "AmelieSchreiber/esm2_t6_8M_lora_cafa5"  # Replace with your Hugging Face hub model name
tokenizer = AutoTokenizer.from_pretrained(model_id)

# First, we load the underlying base model
# NOTE(review): the base model is loaded from the same id as the LoRA adapter,
# while the README names AmelieSchreiber/esm2_t6_8M_finetuned_cafa5 as the
# model the adapter was trained on. Confirm that model_id also provides the
# base weights; otherwise load the base from that id (for example via
# PeftConfig.from_pretrained(model_id).base_model_name_or_path).
base_model = AutoModelForSequenceClassification.from_pretrained(model_id)

# Then, we load the model with PEFT
model = PeftModel.from_pretrained(base_model, model_id)

loaded_model = model
# Reuse the tokenizer loaded above instead of loading it a second time.
loaded_tokenizer = tokenizer
# 3. The predict_protein_function function
def predict_protein_function(sequence, model, tokenizer, go_terms, threshold=0.05, term_ids=None):
    """
    Predict the names of the GO terms the model assigns to a protein sequence.

    The sequence is tokenized, passed through the model in eval mode without
    gradient tracking, and the logits are turned into probabilities with a
    sigmoid. Every label whose probability exceeds ``threshold`` is mapped to
    a term id through ``term_ids`` and then to a name through ``go_terms``.

    Args:
        sequence: Protein sequence passed to the tokenizer.
        model: Model called as ``model(**inputs)``; its output must expose
            ``logits``.
        tokenizer: Callable producing the keyword inputs for the model.
        go_terms: Iterable of dictionaries with "id" and "name" keys, such as
            the output of parse_obo_file.
        threshold: Probability a label must exceed to be reported.
        term_ids: Sequence mapping a label index to a term id. Defaults to the
            module-level ``unique_terms``.

    Returns:
        A list of term names, ordered by label index. Predicted ids that do
        not occur in ``go_terms`` are skipped.

    Raises:
        IndexError: If a predicted label index is outside ``term_ids``.
    """
    if term_ids is None:
        # Backward-compatible default: the vocabulary built at module level.
        term_ids = unique_terms

    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1022)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.sigmoid(outputs.logits)
    # Element [1] of torch.where holds the positions along dimension 1 of the
    # logits, i.e. the label indices (assumes logits are (batch, num_labels)
    # with a single sequence per call - TODO confirm).
    predicted_indices = torch.where(predictions > threshold)[1].tolist()

    # Index the GO terms by id once, so each predicted label is a dictionary
    # lookup instead of a scan over every term. setdefault keeps the first
    # entry for a duplicated id, matching a first-match linear search.
    term_by_id = {}
    for term in go_terms:
        term_by_id.setdefault(term.get("id"), term)

    functions = []
    for idx in predicted_indices:
        match = term_by_id.get(term_ids[idx])
        if match is not None:
            functions.append(match["name"])
    return functions
# 4. Predicting protein function for the sequences in the FASTA file
protein_functions = {}
for seq in protein_sequences[:20]:  # Using only the first 20 sequences for demonstration
    predicted_functions = predict_protein_function(seq, loaded_model, loaded_tokenizer, parsed_terms)
    # Key each result by the first 20 characters of the sequence; sequences
    # sharing that prefix overwrite one another.
    protein_functions[seq[:20] + "..."] = predicted_functions
# Bare expression: displays the dictionary when run as a notebook cell.
protein_functions
```