Spaces:
Sleeping
Sleeping
File size: 2,813 Bytes
d9a04ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
"""
Helper functions for example sequences and sequence type/length classification.
"""
def get_sequence_example(example):
    """Return the FASTA-formatted demo text matching an example label.

    The label is echoed to stdout, then matched by prefix against the four
    known example names, in a fixed order.

    Args:
        example: Label string; only its leading text is compared.

    Returns:
        The FASTA text for the first matching label, or None when the label
        matches none of the known prefixes.
    """
    print(example)

    # (label prefix, FASTA records) pairs, checked in this order.
    examples_by_prefix = (
        (
            "Amino Acid Long Sequence",
            "\n".join((
                ">CCP45025.1|FEATURES|hmdarg|isoniazid|kasA",
                "GARAGVMTPVSACSSGSEAIAHAWRQIVMGDADVAVCGGVEGPIEALPIAAFSMMRAMST",
                "RNDEPERASRPFDKDRDGFVFGEAGALMLIETEEHAKARGAKPLARLLGAGITSDAFHMV",
                "APAADGVRAGRAMTRSLELAGLSPADIDHVNAHGTATPIGDAAEANAIRVAGCDQAAVYA",
                "PKSALGHSIGAVGALESVLTVLTLRDGVIPPTLNYETPDPEIDLDVVAGEPRYGDYRYAV",
                "NNSFGFGGHN",
                ">gi:505065763:ref:WP_015252865.1:|FEATURES|deeparg|tunicamycin|tmrB",
                "GSFGSGKTQTAFELHRRLNPSYVYDPEKMGFALRSMVPQEIAKDDFQSYPLWRAFNYSLL",
            )),
        ),
        (
            "Amino Acid Short Sequence",
            "\n".join((
                ">gi:505065763:ref:WP_015252865.1:|FEATURES|deeparg|tunicamycin|tmrB_0",
                "RGIIIVPMTIVYPEYFNEIIGRLRQEGRIV",
                ">AGQ48857.1|FEATURES|hmdarg|pleuromutilin|eatAv_0",
                "GNFSIYEEQKKLRDEFEMAQNEKLKKEVSR",
            )),
        ),
        (
            "Nucleotide Long Sequence",
            "\n".join((
                ">AJ635405|FEATURES|resfinder|beta-lactam|blaLEN9",
                "GGATGGTGGAAATGGATCTGGCCAGCGGCCGCACGCTGGCCGCCTGGCGCGCCGATGAACGCTTTCCCATGGTGAGCACCTTTAAAGTGCTGCTGTGCGGCGCGGTGCTGGCGCGGGTGGATGCCGGGCTCGAACAACTGGATCGGCGGATCCACTACCGCCAGCAGGATCTGGTGGACTACTCCCCGGTCAGCGAAAAACACCTTGTCGACGGGATGACGATCGGCGAACTCTGTGCCGCCGCCATCACCCTGAGCGATAACAGCGCTGGCAATCTGCTGCTGGCCACCGTCGGCGGCCCCGCGGGATTAACTGCCTTTCTGCGCCAGATCGGTGACAACGTCACCCGTCTTGACCGCTGGGAAACGGCACTGAATGAGGCGCTTCCCGGCGACGCGCGCGACACCACCACCCCGGCCAGCATGGCCGCCACGCTGCGCAAACTACTGACCGCGCAGCATCTGAGCGCCCGT",
            )),
        ),
        (
            "Nucleotide Short Sequence",
            "\n".join((
                ">S60108|FEATURES|resfinder|aminoglycoside|kgmB_0",
                "CCGCACCCGGCTCCCGGACCCGGCGATCCCGAGGACCCGAGGCTGGCGGAGGTCGTCGACGCGGTCCGGTCCAGCAGGCGCTACCAGAGCGTCGCGCCCG",
                ">APOK01000044|FEATURES|resfinder|beta-lactam|blaOXA-290_0",
                "ACATATGATGGGCAAACATTTCAAGAATATGGCAATGCGTTGAGTCGATCGAATACGGCTTATATTCCAGCCTCAACCTTCAAGATGTTAAATGCTCTGA",
            )),
        ),
    )

    for prefix, fasta_text in examples_by_prefix:
        if example.startswith(prefix):
            return fasta_text
    # Unknown label: no example text.
    return None
def classify_sequence(sequence):
    """Classify a sequence as nucleotide or amino acid.

    The comparison ignores letter case, so lower-case DNA such as "atgc" is
    recognised as nucleotide rather than falling through to amino acid.

    Args:
        sequence: Iterable of single-character strings, typically a str.

    Returns:
        "nt" when every character is A, T, G, C or "-" (in either case),
        otherwise "aa". An empty sequence yields "nt" because there is no
        character to disqualify it.
    """
    nucleotide_chars = set("ATGC-")
    # Upper-case each character so the check does not depend on input case.
    if all(char.upper() in nucleotide_chars for char in sequence):
        return "nt"  # nucleotide
    return "aa"  # amino acid
def count_length_sequences(sequence):
    """Return the length of the longest item in a collection of sequences.

    Args:
        sequence: Iterable of sized items (e.g. a list of sequence strings).

    Returns:
        The largest ``len()`` among the items, or 0 when the iterable is
        empty.
    """
    # default=0 keeps an empty input from raising ValueError inside max().
    return max((len(item) for item in sequence), default=0)
def classify_sequence_type_length(sequence):
    """Tag a collection of sequences with a type code and a length code.

    The type comes from ``classify_sequence`` applied to the first item only;
    the length comes from ``count_length_sequences`` applied to the whole
    collection.

    Args:
        sequence: Indexable collection of sequences; must have a first item.

    Returns:
        A ``(sequence_type, sequence_length_type)`` tuple. The first element
        is whatever ``classify_sequence`` returns. The second is "l" (long)
        when the measured length exceeds the cutoff, else "s" (short). The
        cutoff is 150 when the type is "nt" and 50 otherwise.
    """
    kind = classify_sequence(sequence[0])
    longest = count_length_sequences(sequence)

    # Nucleotide input uses the 150 cutoff; anything else uses 50.
    cutoff = 150 if kind == "nt" else 50
    size_tag = "l" if longest > cutoff else "s"
    return (kind, size_tag)
|