annotate validate_fasta.py @ 0:7ae9d78b06f5 draft

"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
author galaxy-australia
date Fri, 28 Jan 2022 04:56:29 +0000
parents
children 6c92e000d684
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
1
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
2
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
3 import argparse
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
4 from typing import List, TextIO
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
5
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
6
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
7 class Fasta:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
8 def __init__(self, header_str: str, seq_str: str):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
9 self.header = header_str
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
10 self.aa_seq = seq_str
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
11
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
12
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
13 class FastaLoader:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
14 def __init__(self):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
15 """creates a Fasta() from a file"""
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
16 self.fastas: List[Fasta] = []
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
17
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
18 def load(self, fasta_path: str):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
19 """
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
20 load function has to be very flexible.
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
21 file may be normal fasta format (header, seq) or can just be a bare sequence.
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
22 """
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
23 with open(fasta_path, 'r') as fp:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
24 header, sequence = self.interpret_first_line(fp)
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
25 line = fp.readline().rstrip('\n')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
26
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
27 while line:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
28 if line.startswith('>'):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
29 self.update_fastas(header, sequence)
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
30 header = line
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
31 sequence = ''
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
32 else:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
33 sequence += line
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
34 line = fp.readline().rstrip('\n')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
35
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
36 # after reading whole file, header & sequence buffers might be full
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
37 self.update_fastas(header, sequence)
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
38 return self.fastas
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
39
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
40 def interpret_first_line(self, fp: TextIO):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
41 header = ''
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
42 sequence = ''
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
43 line = fp.readline().rstrip('\n')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
44 if line.startswith('>'):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
45 header = line
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
46 else:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
47 sequence += line
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
48 return header, sequence
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
49
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
50 def update_fastas(self, header: str, sequence: str):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
51 # if we have a sequence
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
52 if not sequence == '':
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
53 # create generic header if not exists
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
54 if header == '':
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
55 fasta_count = len(self.fastas)
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
56 header = f'>sequence_{fasta_count}'
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
57
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
58 # create new Fasta
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
59 self.fastas.append(Fasta(header, sequence))
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
60
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
61
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
62 class FastaValidator:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
63 def __init__(self, fasta_list: List[Fasta]):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
64 self.fasta_list = fasta_list
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
65 self.min_length = 30
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
66 self.max_length = 2000
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
67 self.iupac_characters = {
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
68 'A', 'B', 'C', 'D', 'E', 'F', 'G',
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
69 'H', 'I', 'K', 'L', 'M', 'N', 'P',
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
70 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
71 'Y', 'Z', '-'
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
72 }
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
73
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
74 def validate(self):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
75 """performs fasta validation"""
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
76 self.validate_num_seqs()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
77 self.validate_length()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
78 self.validate_alphabet()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
79 # not checking for 'X' nucleotides at the moment.
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
80 # alphafold can throw an error if it doesn't like it.
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
81 #self.validate_x()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
82
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
83 def validate_num_seqs(self) -> None:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
84 if len(self.fasta_list) > 1:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
85 raise Exception(f'Error encountered validating fasta: More than 1 sequence detected ({len(self.fasta_list)}). Please use single fasta sequence as input')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
86 elif len(self.fasta_list) == 0:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
87 raise Exception(f'Error encountered validating fasta: input file has no fasta sequences')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
88
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
89 def validate_length(self):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
90 """Confirms whether sequence length is valid. """
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
91 fasta = self.fasta_list[0]
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
92 if len(fasta.aa_seq) < self.min_length:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
93 raise Exception(f'Error encountered validating fasta: Sequence too short ({len(fasta.aa_seq)}aa). Must be > 30aa')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
94 if len(fasta.aa_seq) > self.max_length:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
95 raise Exception(f'Error encountered validating fasta: Sequence too long ({len(fasta.aa_seq)}aa). Must be < 2000aa')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
96
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
97 def validate_alphabet(self):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
98 """
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
99 Confirms whether the sequence conforms to IUPAC codes.
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
100 If not, reports the offending character and its position.
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
101 """
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
102 fasta = self.fasta_list[0]
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
103 for i, char in enumerate(fasta.aa_seq.upper()):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
104 if char not in self.iupac_characters:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
105 raise Exception(f'Error encountered validating fasta: Invalid amino acid found at pos {i}: {char}')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
106
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
107 def validate_x(self):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
108 """checks if any bases are X. TODO check whether alphafold accepts X bases. """
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
109 fasta = self.fasta_list[0]
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
110 for i, char in enumerate(fasta.aa_seq.upper()):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
111 if char == 'X':
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
112 raise Exception(f'Error encountered validating fasta: Unsupported aa code "X" found at pos {i}')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
113
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
114
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
115 class FastaWriter:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
116 def __init__(self) -> None:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
117 self.outfile = 'alphafold.fasta'
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
118 self.formatted_line_len = 60
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
119
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
120 def write(self, fasta: Fasta):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
121 with open(self.outfile, 'w') as fp:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
122 header = fasta.header
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
123 seq = self.format_sequence(fasta.aa_seq)
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
124 fp.write(header + '\n')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
125 fp.write(seq + '\n')
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
126
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
127 def format_sequence(self, aa_seq: str):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
128 formatted_seq = ''
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
129 for i in range(0, len(aa_seq), self.formatted_line_len):
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
130 formatted_seq += aa_seq[i: i + self.formatted_line_len] + '\n'
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
131 return formatted_seq
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
132
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
133
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
134 def main():
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
135 # load fasta file
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
136 args = parse_args()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
137 fl = FastaLoader()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
138 fastas = fl.load(args.input_fasta)
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
139
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
140 # validate
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
141 fv = FastaValidator(fastas)
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
142 fv.validate()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
143
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
144 # write cleaned version
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
145 fw = FastaWriter()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
146 fw.write(fastas[0])
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
147
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
148
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
149 def parse_args() -> argparse.Namespace:
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
150 parser = argparse.ArgumentParser()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
151 parser.add_argument(
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
152 "input_fasta",
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
153 help="input fasta file",
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
154 type=str
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
155 )
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
156 return parser.parse_args()
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
157
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
158
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
159
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
160 if __name__ == '__main__':
7ae9d78b06f5 "planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff changeset
161 main()