File size: 2,813 Bytes
d9a04ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
Helper function

"""

def get_sequence_example(example):
    """Return the bundled FASTA example whose label prefixes *example*.

    The received label is echoed to stdout first. The label is then compared,
    in order, against the known example names using ``str.startswith``; the
    FASTA text of the first match is returned.

    Args:
        example: Label of the requested example, e.g.
            ``"Amino Acid Long Sequence"``.

    Returns:
        The FASTA-formatted example text, or ``None`` when the label matches
        none of the known examples.
    """
    print(example)

    # Ordered (label prefix, FASTA text) pairs; the first matching prefix wins.
    catalogue = (
        ("Amino Acid Long Sequence", """\
>CCP45025.1|FEATURES|hmdarg|isoniazid|kasA
GARAGVMTPVSACSSGSEAIAHAWRQIVMGDADVAVCGGVEGPIEALPIAAFSMMRAMST
RNDEPERASRPFDKDRDGFVFGEAGALMLIETEEHAKARGAKPLARLLGAGITSDAFHMV
APAADGVRAGRAMTRSLELAGLSPADIDHVNAHGTATPIGDAAEANAIRVAGCDQAAVYA
PKSALGHSIGAVGALESVLTVLTLRDGVIPPTLNYETPDPEIDLDVVAGEPRYGDYRYAV
NNSFGFGGHN
>gi:505065763:ref:WP_015252865.1:|FEATURES|deeparg|tunicamycin|tmrB
GSFGSGKTQTAFELHRRLNPSYVYDPEKMGFALRSMVPQEIAKDDFQSYPLWRAFNYSLL"""),
        ("Amino Acid Short Sequence", """\
>gi:505065763:ref:WP_015252865.1:|FEATURES|deeparg|tunicamycin|tmrB_0
RGIIIVPMTIVYPEYFNEIIGRLRQEGRIV
>AGQ48857.1|FEATURES|hmdarg|pleuromutilin|eatAv_0
GNFSIYEEQKKLRDEFEMAQNEKLKKEVSR"""),
        ("Nucleotide Long Sequence", """\
>AJ635405|FEATURES|resfinder|beta-lactam|blaLEN9
GGATGGTGGAAATGGATCTGGCCAGCGGCCGCACGCTGGCCGCCTGGCGCGCCGATGAACGCTTTCCCATGGTGAGCACCTTTAAAGTGCTGCTGTGCGGCGCGGTGCTGGCGCGGGTGGATGCCGGGCTCGAACAACTGGATCGGCGGATCCACTACCGCCAGCAGGATCTGGTGGACTACTCCCCGGTCAGCGAAAAACACCTTGTCGACGGGATGACGATCGGCGAACTCTGTGCCGCCGCCATCACCCTGAGCGATAACAGCGCTGGCAATCTGCTGCTGGCCACCGTCGGCGGCCCCGCGGGATTAACTGCCTTTCTGCGCCAGATCGGTGACAACGTCACCCGTCTTGACCGCTGGGAAACGGCACTGAATGAGGCGCTTCCCGGCGACGCGCGCGACACCACCACCCCGGCCAGCATGGCCGCCACGCTGCGCAAACTACTGACCGCGCAGCATCTGAGCGCCCGT"""),
        ("Nucleotide Short Sequence", """\
>S60108|FEATURES|resfinder|aminoglycoside|kgmB_0
CCGCACCCGGCTCCCGGACCCGGCGATCCCGAGGACCCGAGGCTGGCGGAGGTCGTCGACGCGGTCCGGTCCAGCAGGCGCTACCAGAGCGTCGCGCCCG
>APOK01000044|FEATURES|resfinder|beta-lactam|blaOXA-290_0
ACATATGATGGGCAAACATTTCAAGAATATGGCAATGCGTTGAGTCGATCGAATACGGCTTATATTCCAGCCTCAACCTTCAAGATGTTAAATGCTCTGA"""),
    )

    for label_prefix, fasta_text in catalogue:
        if example.startswith(label_prefix):
            return fasta_text
    return None

def classify_sequence(sequence):
    """Classify a single sequence as nucleotide or amino acid.

    A sequence counts as nucleotide when every character is one of
    ``A``, ``T``, ``G``, ``C`` (in either case) or the gap symbol ``-``.
    Anything else is treated as an amino-acid sequence.

    Args:
        sequence: The residues of one sequence (without its FASTA header).

    Returns:
        ``"nt"`` for a nucleotide sequence, ``"aa"`` for an amino-acid one.
        An empty sequence is reported as ``"nt"`` because no character
        falls outside the nucleotide alphabet.
    """
    # Lowercase letters are included so that soft-masked / lowercase
    # nucleotide input is not mistaken for protein.
    nucleotide_chars = frozenset("ATGCatgc-")
    if all(char in nucleotide_chars for char in sequence):
        return "nt"  # nucleotide
    return "aa"  # amino acid

def count_length_sequences(sequence):
    """Return the length of the longest item in *sequence*.

    Args:
        sequence: An iterable of sized items (e.g. a list of sequence
            strings).

    Returns:
        The largest ``len()`` among the items, or ``0`` when *sequence*
        is empty.
    """
    # ``default=0`` keeps an empty collection from raising ValueError.
    return max(map(len, sequence), default=0)

def classify_sequence_type_length(sequence):
    """Classify a batch of sequences by type and by length category.

    The type is decided from the first item only (via ``classify_sequence``);
    the length category is decided from the longest item (via
    ``count_length_sequences``).

    Args:
        sequence: An indexable collection of sequences.

    Returns:
        A tuple ``(sequence_type, sequence_length_type)`` where
        ``sequence_type`` is ``"nt"`` or ``"aa"`` and
        ``sequence_length_type`` is ``"l"`` (long) or ``"s"`` (short).
        Nucleotide input is long above 150 characters, amino-acid input
        above 50.
    """
    kind = classify_sequence(sequence[0])
    longest = count_length_sequences(sequence)

    # The long/short cutoff depends on the alphabet.
    cutoff = 150 if kind == "nt" else 50
    length_tag = "l" if longest > cutoff else "s"
    return (kind, length_tag)