|
--- |
|
library_name: peft |
|
license: mit |
|
language: |
|
- en |
|
metrics: |
|
- f1 |
|
- precision |
|
- recall |
|
tags: |
|
- esm
|
- esm2 |
|
- protein language model |
|
- biology |
|
--- |
|
## Training procedure |
|
|
|
This model was trained with Hugging Face's Parameter-Efficient Fine-Tuning (PEFT) library; in particular,

a Low-Rank Adaptation (LoRA) adapter was trained on top of the model
|
[AmelieSchreiber/esm2_t6_8M_finetuned_cafa5](https://huggingface.co./AmelieSchreiber/esm2_t6_8M_finetuned_cafa5). |
|
|
|
``` |
|
Epoch 3/3 |
|
Training Loss: 0.0152 |
|
Validation Loss: 0.0153 |
|
Val F1 Score: 0.7361 |
|
Micro-Average Precision: 0.9977 |
|
Micro-Average Recall: 0.2264 |
|
Micro-Average ROC AUC: 0.8894 |
|
``` |
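
For reference, an adapter of this kind is typically attached with PEFT roughly as sketched below. This is only an illustration: the rank, alpha, dropout, and target modules shown are assumptions, not necessarily the settings used for this adapter.

```python
# Sketch of attaching a LoRA adapter with PEFT (hyperparameters are
# illustrative assumptions, not the exact settings used for this model).
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model

base_model = AutoModelForSequenceClassification.from_pretrained(
    "AmelieSchreiber/esm2_t6_8M_finetuned_cafa5"
)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,                               # assumed LoRA rank
    lora_alpha=32,                      # assumed scaling factor
    lora_dropout=0.05,                  # assumed dropout
    target_modules=["query", "value"],  # assumed attention projections
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # only the adapter weights are trainable
```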
|
|
|
### Framework versions |
|
|
|
- PEFT 0.4.0 |
|
|
|
## Using the Model |
|
|
|
To use the model, download the data [from here](https://huggingface.co./datasets/AmelieSchreiber/cafa_5),

adjust the file paths in the code to match their locations on your machine, and run the inference script below (a sketch for downloading the data comes first):
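
One way to fetch the dataset is with the `huggingface_hub` client. This is a minimal sketch; it assumes the downloaded layout matches the `data/Train/...` and `data/Test/...` paths used in the script, so adjust as needed.

```python
# Sketch: download the CAFA-5 dataset from the Hugging Face Hub.
# Assumes `huggingface_hub` is installed (pip install huggingface_hub).
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="AmelieSchreiber/cafa_5",
    repo_type="dataset",
    local_dir="data",  # the script below expects files under data/
)
```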
|
|
|
```python |
|
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from peft import PeftModel

from Bio import SeqIO
|
|
|
# Step 1: Data Preprocessing |
|
fasta_file = "data/Train/train_sequences.fasta" |
|
tsv_file = "data/Train/train_terms.tsv" |
|
|
|
fasta_data = {} |
|
tsv_data = {} |
|
|
|
for record in SeqIO.parse(fasta_file, "fasta"): |
|
fasta_data[record.id] = str(record.seq) |
|
|
|
with open(tsv_file, 'r') as f: |
|
for line in f: |
|
parts = line.strip().split("\t") |
|
tsv_data[parts[0]] = parts[1:] |
|
|
|
unique_terms = list(set(term for terms in tsv_data.values() for term in terms)) |
|
|
|
def parse_fasta(file_path): |
|
""" |
|
Parses a FASTA file and returns a list of sequences. |
|
""" |
|
with open(file_path, 'r') as f: |
|
content = f.readlines() |
|
|
|
sequences = [] |
|
current_sequence = "" |
|
|
|
for line in content: |
|
if line.startswith(">"): |
|
if current_sequence: |
|
sequences.append(current_sequence) |
|
current_sequence = "" |
|
else: |
|
current_sequence += line.strip() |
|
|
|
if current_sequence: |
|
sequences.append(current_sequence) |
|
|
|
return sequences |
|
|
|
# Parse the provided FASTA file |
|
fasta_file_path = "data/Test/testsuperset.fasta" |
|
protein_sequences = parse_fasta(fasta_file_path) |
|
# protein_sequences[:3] # Displaying the first 3 sequences for verification |
|
|
|
# 1. Parse the go-basic.obo file into a list of GO term records
|
def parse_obo_file(file_path): |
|
with open(file_path, 'r') as f: |
|
data = f.read().split("[Term]") |
|
|
|
terms = [] |
|
for entry in data[1:]: |
|
lines = entry.strip().split("\n") |
|
term = {} |
|
for line in lines: |
|
if line.startswith("id:"): |
|
term["id"] = line.split("id:")[1].strip() |
|
elif line.startswith("name:"): |
|
term["name"] = line.split("name:")[1].strip() |
|
elif line.startswith("namespace:"): |
|
term["namespace"] = line.split("namespace:")[1].strip() |
|
elif line.startswith("def:"): |
|
term["definition"] = line.split("def:")[1].split('"')[1] |
|
terms.append(term) |
|
return terms |
|
|
|
# Let's assume the path to go-basic.obo is as follows (please modify if different) |
|
obo_file_path = "data/Train/go-basic.obo" |
|
parsed_terms = parse_obo_file(obo_file_path)
|
|
|
# 2. Load the saved model and tokenizer |
|
# Assuming the model path provided is correct |
|
|
|
# Load the tokenizer and model |
|
model_id = "AmelieSchreiber/esm2_t6_8M_lora_cafa5" # Replace with your Hugging Face hub model name |
|
tokenizer = AutoTokenizer.from_pretrained(model_id) |
|
|
|
# First, we load the underlying base model |
|
base_model = AutoModelForSequenceClassification.from_pretrained(model_id) |
|
|
|
# Then, we load the model with PEFT |
|
model = PeftModel.from_pretrained(base_model, model_id) |
|
loaded_model = model |
|
loaded_tokenizer = tokenizer  # reuse the tokenizer loaded above
|
|
|
# 3. The predict_protein_function function |
|
def predict_protein_function(sequence, model, tokenizer, go_terms): |
|
inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1022) |
|
model.eval() |
|
with torch.no_grad(): |
|
outputs = model(**inputs) |
|
predictions = torch.sigmoid(outputs.logits) |
|
predicted_indices = torch.where(predictions > 0.05)[1].tolist() |
|
|
|
functions = [] |
|
for idx in predicted_indices: |
|
term_id = unique_terms[idx] # Use the unique_terms list from your training script |
|
for term in go_terms: |
|
if term["id"] == term_id: |
|
functions.append(term["name"]) |
|
break |
|
|
|
return functions |
|
|
|
# 4. Predicting protein function for the sequences in the FASTA file |
|
protein_functions = {} |
|
for seq in protein_sequences[:20]:  # Using only the first 20 sequences for demonstration
|
predicted_functions = predict_protein_function(seq, loaded_model, loaded_tokenizer, parsed_terms) |
|
protein_functions[seq[:20] + "..."] = predicted_functions # Using first 20 characters as key |
|
|
|
protein_functions |
|
``` |
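
The resulting `protein_functions` dictionary maps a truncated sequence prefix to the list of predicted GO term names. A short loop like the following prints the predictions in a readable form:

```python
# Print each (truncated) sequence together with its predicted GO term names.
for seq_key, functions in protein_functions.items():
    print(seq_key)
    if functions:
        for name in functions:
            print(f"  - {name}")
    else:
        print("  (no terms above the 0.05 threshold)")
```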