Mercurial > repos > galaxy-australia > alphafold2
comparison validate_fasta.py @ 9:3bd420ec162d draft
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 7726c3cba165bdc8fc6366ec0ce6596e55657468
author | galaxy-australia |
---|---|
date | Tue, 13 Sep 2022 22:04:12 +0000 |
parents | ca90d17ff51b |
children | d00e15139065 |
comparison
legend (line status): equal | deleted | inserted | replaced
8:ca90d17ff51b | 9:3bd420ec162d |
---|---|
2 | 2 |
3 import re | 3 import re |
4 import sys | 4 import sys |
5 import argparse | 5 import argparse |
6 from typing import List | 6 from typing import List |
7 | |
8 MULTIMER_MAX_SEQUENCE_COUNT = 10 | |
7 | 9 |
8 | 10 |
9 class Fasta: | 11 class Fasta: |
10 def __init__(self, header_str: str, seq_str: str): | 12 def __init__(self, header_str: str, seq_str: str): |
11 self.header = header_str | 13 self.header = header_str |
70 | 72 |
71 | 73 |
72 class FastaValidator: | 74 class FastaValidator: |
73 def __init__( | 75 def __init__( |
74 self, | 76 self, |
75 fasta_list: List[Fasta], | |
76 min_length=None, | 77 min_length=None, |
77 max_length=None): | 78 max_length=None, |
79 multiple=False): | |
80 self.multiple = multiple | |
78 self.min_length = min_length | 81 self.min_length = min_length |
79 self.max_length = max_length | 82 self.max_length = max_length |
80 self.fasta_list = fasta_list | |
81 self.iupac_characters = { | 83 self.iupac_characters = { |
82 'A', 'B', 'C', 'D', 'E', 'F', 'G', | 84 'A', 'B', 'C', 'D', 'E', 'F', 'G', |
83 'H', 'I', 'K', 'L', 'M', 'N', 'P', | 85 'H', 'I', 'K', 'L', 'M', 'N', 'P', |
84 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', | 86 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
85 'Y', 'Z', '-' | 87 'Y', 'Z', '-' |
86 } | 88 } |
87 | 89 |
88 def validate(self): | 90 def validate(self, fasta_list: List[Fasta]): |
89 """Perform FASTA validation.""" | 91 """Perform FASTA validation.""" |
92 self.fasta_list = fasta_list | |
90 self.validate_num_seqs() | 93 self.validate_num_seqs() |
91 self.validate_length() | 94 self.validate_length() |
92 self.validate_alphabet() | 95 self.validate_alphabet() |
93 | |
94 # not checking for 'X' nucleotides at the moment. | 96 # not checking for 'X' nucleotides at the moment. |
95 # alphafold can throw an error if it doesn't like it. | 97 # alphafold can throw an error if it doesn't like it. |
96 # self.validate_x() | 98 # self.validate_x() |
99 return self.fasta_list | |
97 | 100 |
98 def validate_num_seqs(self) -> None: | 101 def validate_num_seqs(self) -> None: |
99 """Assert that only one sequence has been provided.""" | 102 """Assert that only one sequence has been provided.""" |
100 if len(self.fasta_list) > 1: | 103 fasta_count = len(self.fasta_list) |
101 sys.stderr.write( | 104 |
102 'WARNING: More than 1 sequence detected.' | 105 if self.multiple: |
103 ' Using first FASTA sequence as input.\n') | 106 if fasta_count < 2: |
104 self.fasta_list = self.fasta_list[:1] | 107 raise ValueError( |
105 elif len(self.fasta_list) == 0: | 108 'Error encountered validating FASTA:\n' |
106 raise ValueError( | 109 'Multimer mode requires multiple input sequence.' |
107 'Error encountered validating FASTA:\n' | 110 f' Only {fasta_count} sequences were detected in' |
108 ' input file has no FASTA sequences') | 111 ' the provided file.') |
112 self.fasta_list = self.fasta_list | |
113 | |
114 elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT: | |
115 sys.stderr.write( | |
116 f'WARNING: detected {fasta_count} sequences but the' | |
117 f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}' | |
118 ' sequences. The last' | |
119 f' {fasta_count - MULTIMER_MAX_SEQUENCE_COUNT} sequence(s)' | |
120 ' have been discarded.\n') | |
121 self.fasta_list = self.fasta_list[:MULTIMER_MAX_SEQUENCE_COUNT] | |
122 else: | |
123 if fasta_count > 1: | |
124 sys.stderr.write( | |
125 'WARNING: More than 1 sequence detected.' | |
126 ' Using first FASTA sequence as input.\n') | |
127 self.fasta_list = self.fasta_list[:1] | |
128 | |
129 elif len(self.fasta_list) == 0: | |
130 raise ValueError( | |
131 'Error encountered validating FASTA:\n' | |
132 ' no FASTA sequences detected in input file.') | |
109 | 133 |
110 def validate_length(self): | 134 def validate_length(self): |
111 """Confirm whether sequence length is valid.""" | 135 """Confirm whether sequence length is valid.""" |
112 fasta = self.fasta_list[0] | 136 fasta = self.fasta_list[0] |
113 if self.min_length: | 137 if self.min_length: |
168 args = parse_args() | 192 args = parse_args() |
169 fas = FastaLoader(args.input) | 193 fas = FastaLoader(args.input) |
170 | 194 |
171 # validate | 195 # validate |
172 fv = FastaValidator( | 196 fv = FastaValidator( |
173 fas.fastas, | |
174 min_length=args.min_length, | 197 min_length=args.min_length, |
175 max_length=args.max_length, | 198 max_length=args.max_length, |
199 multiple=args.multimer, | |
176 ) | 200 ) |
177 fv.validate() | 201 clean_fastas = fv.validate(fas.fastas) |
178 | 202 |
179 # write cleaned version | 203 # write clean data |
180 fw = FastaWriter() | 204 fw = FastaWriter() |
181 fw.write(fas.fastas[0]) | 205 for fas in clean_fastas: |
206 fw.write(fas) | |
182 | 207 |
183 except ValueError as exc: | 208 except ValueError as exc: |
184 sys.stderr.write(f"{exc}\n\n") | 209 sys.stderr.write(f"{exc}\n\n") |
185 raise exc | 210 raise exc |
186 | 211 |
210 dest='max_length', | 235 dest='max_length', |
211 help="Maximum length of input protein sequence (AA)", | 236 help="Maximum length of input protein sequence (AA)", |
212 default=None, | 237 default=None, |
213 type=int, | 238 type=int, |
214 ) | 239 ) |
240 parser.add_argument( | |
241 "--multimer", | |
242 action='store_true', | |
243 help="Require multiple input sequences", | |
244 ) | |
215 return parser.parse_args() | 245 return parser.parse_args() |
216 | 246 |
217 | 247 |
218 if __name__ == '__main__': | 248 if __name__ == '__main__': |
219 main() | 249 main() |