AmelieSchreiber committed
Commit 33998ff
1 Parent(s): 7895fd5

Update README.md

Files changed (1): README.md (+136, −1)
README.md CHANGED
---
## Training procedure

This model was trained with Hugging Face's Parameter-Efficient Fine-Tuning (PEFT) library; in particular,
a Low-Rank Adaptation (LoRA) adapter was trained on top of the model
[AmelieSchreiber/esm2_t6_8M_finetuned_cafa5](https://huggingface.co/AmelieSchreiber/esm2_t6_8M_finetuned_cafa5).

```
Epoch 3/3
Training Loss: 0.0152
...
Micro-Average ROC AUC: 0.8894
```
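
The exact training configuration for this checkpoint is not reproduced here, but a minimal sketch of how a LoRA adapter of this kind is typically attached with PEFT looks roughly like the following. The rank, alpha, dropout, and target modules are illustrative assumptions, not the values used for this model:

```python
# Minimal LoRA setup sketch (hypothetical hyperparameters, not this checkpoint's values)
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model

base = AutoModelForSequenceClassification.from_pretrained(
    "AmelieSchreiber/esm2_t6_8M_finetuned_cafa5"
)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,         # keeps the classification head trainable
    r=8,                                # assumed adapter rank
    lora_alpha=16,                      # assumed scaling factor
    lora_dropout=0.1,                   # assumed dropout
    target_modules=["query", "value"],  # assumed: ESM-2 self-attention projections
)

lora_model = get_peft_model(base, lora_config)
lora_model.print_trainable_parameters()  # only a small fraction of weights train
```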
 
### Framework versions

- PEFT 0.4.0

## Using the Model

The script below loads the LoRA adapter on top of its base model, parses the CAFA-5 training and test files, and predicts GO term names for a few test sequences. It assumes the CAFA-5 competition data layout under `data/` and requires `torch`, `transformers`, `peft`, and `biopython`.
```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig
from Bio import SeqIO

# Step 1: Data preprocessing
fasta_file = "data/Train/train_sequences.fasta"
tsv_file = "data/Train/train_terms.tsv"

fasta_data = {}
tsv_data = {}

for record in SeqIO.parse(fasta_file, "fasta"):
    fasta_data[record.id] = str(record.seq)

# train_terms.tsv has one "EntryID<TAB>term<TAB>aspect" row per annotation,
# so accumulate all GO terms for each protein instead of overwriting.
with open(tsv_file, "r") as f:
    next(f)  # skip the header row
    for line in f:
        parts = line.strip().split("\t")
        tsv_data.setdefault(parts[0], []).append(parts[1])

# Caution: this index -> GO-term mapping must reproduce the label ordering
# used at training time; set iteration order is not stable across runs.
unique_terms = list(set(term for terms in tsv_data.values() for term in terms))

def parse_fasta(file_path):
    """Parse a FASTA file and return a list of sequences."""
    sequences = []
    current_sequence = ""
    with open(file_path, "r") as f:
        for line in f:
            if line.startswith(">"):
                if current_sequence:
                    sequences.append(current_sequence)
                current_sequence = ""
            else:
                current_sequence += line.strip()
    if current_sequence:
        sequences.append(current_sequence)
    return sequences

# Parse the test sequences
fasta_file_path = "data/Test/testsuperset.fasta"
protein_sequences = parse_fasta(fasta_file_path)

# Step 2: Parse the go-basic.obo file to map GO term IDs to names
def parse_obo_file(file_path):
    with open(file_path, "r") as f:
        data = f.read().split("[Term]")

    terms = []
    for entry in data[1:]:
        term = {}
        for line in entry.strip().split("\n"):
            if line.startswith("id:"):
                term["id"] = line.split("id:")[1].strip()
            elif line.startswith("name:"):
                term["name"] = line.split("name:")[1].strip()
            elif line.startswith("namespace:"):
                term["namespace"] = line.split("namespace:")[1].strip()
            elif line.startswith("def:"):
                term["definition"] = line.split("def:")[1].split('"')[1]
        terms.append(term)
    return terms

obo_file_path = "data/Train/go-basic.obo"  # adjust if your path differs
parsed_terms = parse_obo_file(obo_file_path)

# Step 3: Load the tokenizer, the fine-tuned base model, and the LoRA adapter
model_id = "AmelieSchreiber/esm2_t6_8M_lora_cafa5"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The adapter config records which base checkpoint the LoRA was trained on;
# the fine-tuned base already stores the multi-label classification head.
peft_config = PeftConfig.from_pretrained(model_id)
base_model = AutoModelForSequenceClassification.from_pretrained(peft_config.base_model_name_or_path)

# Apply the LoRA weights on top of the base model
model = PeftModel.from_pretrained(base_model, model_id)
model.eval()

# Step 4: Predict GO term names for a single sequence
def predict_protein_function(sequence, model, tokenizer, go_terms):
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1022)
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.sigmoid(outputs.logits)
        # 0.05 is a deliberately low cutoff; raise it for higher-precision calls
        predicted_indices = torch.where(predictions > 0.05)[1].tolist()

    functions = []
    for idx in predicted_indices:
        term_id = unique_terms[idx]  # must match the label list from training
        for term in go_terms:
            if term["id"] == term_id:
                functions.append(term["name"])
                break
    return functions

# Step 5: Predict functions for the first 20 test sequences
protein_functions = {}
for seq in protein_sequences[:20]:
    predicted_functions = predict_protein_function(seq, model, tokenizer, parsed_terms)
    protein_functions[seq[:20] + "..."] = predicted_functions  # first 20 residues as key

print(protein_functions)
```
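
`protein_functions` maps each truncated sequence (its first 20 residues) to a list of predicted GO term names. A quick way to eyeball the output, using only the variables defined above:

```python
# Print each truncated sequence key with its predicted GO term names
for seq_key, functions in protein_functions.items():
    print(seq_key, "->", ", ".join(functions) if functions else "(no terms above threshold)")
```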