Spaces:

as-cle-bert
/

saccharomyces-pythia

Sleeping

AstraBert

first commit

01554a7 11 months ago

9.34 kB

	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import VotingClassifier
	from sklearn.ensemble import VotingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
	from sklearn.tree import DecisionTreeClassifier
	from Bio.SeqUtils.ProtParam import ProteinAnalysis
	from Bio.SeqUtils.CheckSum import crc32
	from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
	from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex
	from Bio.SeqUtils import six_frame_translations
	from Bio.Seq import Seq
	from Bio import SeqIO
	import gzip
	from math import floor
	from sklearn.metrics import accuracy_score
	from orfipy_core import orfs
	import sys
	import matplotlib.pyplot as plt


	def load_data(infile):
	"""Load data from infile if it is in fasta format (after having unzipped it, if it is zipped)"""
	if infile.endswith(".gz"): # If file is gzipped, unzip it
	y = gzip.open(infile, "rt", encoding="latin-1")
	# Read file as fasta if it is fasta
	if (
	infile.endswith(".fasta.gz")
	or infile.endswith(".fna.gz")
	or infile.endswith(".fas.gz")
	or infile.endswith(".fa.gz")
	):
	records = SeqIO.parse(y, "fasta")
	sequences = {}
	for record in records:
	sequences.update({str(record.id): str(record.seq)})
	y.close()
	return sequences
	else:
	y.close()
	raise ValueError("File is the wrong format")
	# Read file directly as fasta if it is a not zipped fasta: handle also more uncommon extensions :-)
	elif (
	infile.endswith(".fasta")
	or infile.endswith(".fna")
	or infile.endswith(".fas")
	or infile.endswith(".fa")
	):
	with open(infile, "r") as y:
	records = SeqIO.parse(y, "fasta")
	sequences = {}
	for record in records:
	sequences.update({str(record.id): str(record.seq)})
	y.close()
	return sequences
	else:
	raise ValueError("File is the wrong format")


	def calculate_cai(dna, index=SharpEcoliIndex):
	cai = CodonAdaptationIndex()
	cai.set_cai_index(index)
	if len(dna) % 3 == 0:
	a = cai.cai_for_gene(dna)
	else:
	six_translated = six_frame_translations(dna)
	n = six_translated.split("\n")
	frames = {
	"0;F": n[5],
	"1;F": n[6],
	"2;F": n[7],
	"0;R": n[12],
	"1;R": n[11],
	"2;R": n[10],
	}
	ind = 0
	for i in list(frames.keys()):
	k = frames[i].replace(" ", "")
	if "M" in k and "*" in k:
	if i.split(";")[0] == "F" and k.index("M") < k.index("*"):
	if len(k) <= len(dna) / 3:
	ind = int(i.split("")[0])
	break
	elif i.split(";")[0] == "R" and k.index("M") > k.index("*"):
	if len(k) <= len(dna) / 3:
	ind = len(dna) - int(i.split("")[0])
	break
	if ind == 0:
	cods = 3 * floor(len(dna) / 3)
	dna = dna[:cods]
	a = cai.cai_for_gene(dna)
	elif 1 <= ind <= 2:
	if len(dna[ind:]) % 3 == 0:
	dna = dna[ind:]
	else:
	cods = 3 * floor((len(dna) - ind) / 3)
	dna = dna[ind : cods + ind]
	a = cai.cai_for_gene(dna)
	else:
	if len(dna[:ind]) % 3 == 0:
	dna = dna[ind:]
	else:
	cods = 3 * floor((len(dna) - ind) / 3)
	dna = dna[:cods]
	a = cai.cai_for_gene(dna)
	return a


	def checksum(dna):
	return crc32(dna)


	def hidrophobicity(dna):
	protein_sequence = str(Seq(dna).translate())
	protein_sequence = protein_sequence.replace("*", "")
	hydrophobicity_score = ProteinAnalysis(protein_sequence).gravy()
	return hydrophobicity_score


	def isoelectric_pt(dna):
	protein_sequence = str(Seq(dna).translate())
	protein_sequence = protein_sequence.replace("*", "")
	isoelectric = ProteinAnalysis(protein_sequence).isoelectric_point()
	return isoelectric


	def aromatic(dna):
	protein_sequence = str(Seq(dna).translate())
	protein_sequence = protein_sequence.replace("*", "")
	arom = ProteinAnalysis(protein_sequence).aromaticity()
	return arom


	def instable(dna):
	protein_sequence = str(Seq(dna).translate())
	protein_sequence = protein_sequence.replace("*", "")
	inst = ProteinAnalysis(protein_sequence).instability_index()
	return inst


	def weight(dna):
	protein_sequence = str(Seq(dna).translate())
	protein_sequence = protein_sequence.replace("*", "")
	wgt = ProteinAnalysis(protein_sequence).molecular_weight()
	return wgt


	def sec_struct(dna):
	protein_sequence = str(Seq(dna).translate())
	protein_sequence = protein_sequence.replace("*", "")
	second_struct = ProteinAnalysis(protein_sequence).secondary_structure_fraction()
	return ",".join([str(s) for s in second_struct])


	def mol_ext(dna):
	protein_sequence = str(Seq(dna).translate())
	protein_sequence = protein_sequence.replace("*", "")
	molar_ext = ProteinAnalysis(protein_sequence).molar_extinction_coefficient()
	return ",".join([str(s) for s in molar_ext])


	def longest_orf(coding):
	keys_M_starting = [
	key
	for key in list(coding.keys())
	if str(Seq(coding[key]).translate()).startswith("M")
	]
	M_starting = [
	seq
	for seq in list(coding.values())
	if str(Seq(seq).translate()).startswith("M")
	]
	lengths = [len(seq) for seq in M_starting]
	max_ind = lengths.index(max(lengths))
	return {keys_M_starting[max_ind]: M_starting[max_ind]}


	def predict_orf(seq, minlen=45, maxlen=18000, longest_M_starting_orf_only=True):
	ls = orfs(seq, minlen=minlen, maxlen=maxlen)
	coding = {}
	count = 0
	for start, stop, strand, description in ls:
	count += 1
	coding.update({f"ORF.{count}": seq[int(start) : int(stop)]})
	if longest_M_starting_orf_only:
	print(
	"\n---------------------------\nWarning: option longest_M_starting_orf_only is set to True and thus you will get only the longest M-starting ORF; to get all the ORFs, set it to False\n---------------------------\n",
	file=sys.stderr,
	)
	return longest_orf(coding)
	return coding


	def process_dna(fasta_file):
	fas = load_data(fasta_file)
	seqs = [seq for seq in list(fas.values())]
	heads = [seq for seq in list(fas.keys())]
	data = {}
	proteins = {}
	for i in range(len(seqs)):
	coding = predict_orf(seqs[i])
	open_reading_frames = list(coding.keys())
	for key in open_reading_frames:
	head = f"{heads[i]}.{key}"
	proteins.update({head: str(Seq(coding[key]).translate())})
	cai = calculate_cai(coding[key])
	cksm = checksum(coding[key])
	hydr = hidrophobicity(coding[key])
	isl = isoelectric_pt(coding[key])
	arm = aromatic(coding[key])
	inst = instable(coding[key])
	mw = weight(coding[key])
	se_st = sec_struct(coding[key]).split(",")
	se_st1 = se_st[0]
	se_st2 = se_st[1]
	se_st3 = se_st[2]
	me = mol_ext(coding[key]).split(",")
	me1 = me[0]
	me2 = me[1]
	n = pd.DataFrame(
	{
	"CAI": [cai],
	"CHECKSUM": [cksm],
	"HIDROPHOBICITY": [hydr],
	"ISOELECTRIC": [isl],
	"AROMATIC": [arm],
	"INSTABLE": [inst],
	"MW": [mw],
	"HELIX": [se_st1],
	"TURN": [se_st2],
	"SHEET": [se_st3],
	"MOL_EXT_RED": [me1],
	"MOL_EXT_OX": [me2],
	}
	)
	data.update({head: n})
	return data, proteins

	if __name__ == "__main__":
	print("Loading data...")
	# Load the data from the CSV file
	data = pd.read_csv("../../data/scerevisiae.csv")
	print("Loaded data")

	print("Generating training and test data...")
	# Features
	X = data.iloc[:, 1:]

	# Labels
	y = data["ORF_TYPE"]


	# Split the data into training and testing sets
	X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.2, random_state=42
	)
	print("Generated training and test data")

	print("Building and training the model...")
	# Create and train the Random Forest classifier
	clf4 = DecisionTreeClassifier()
	clf7 = HistGradientBoostingClassifier()
	clf8 = ExtraTreesClassifier()
	classifier = VotingClassifier([('dt', clf4), ('hgb', clf7), ('etc', clf8)], voting='hard')

	model = classifier.fit(X, y) # Uncomment this line if clf needs training


	# Make predictions on the test set
	y_pred = model.predict(X)

	# Evaluate the accuracy of the model
	accuracy = accuracy_score(y, y_pred)
	print(f"Accuracy: {accuracy}")

	from joblib import dump

	print("Saving model...")
	dump(model, "SacCerML.joblib")
	print("Saved")

	print("All done")