comparison validate_fasta.py @ 9:3bd420ec162d draft

planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 7726c3cba165bdc8fc6366ec0ce6596e55657468
author galaxy-australia
date Tue, 13 Sep 2022 22:04:12 +0000
parents ca90d17ff51b
children d00e15139065
comparison
equal deleted inserted replaced
8:ca90d17ff51b 9:3bd420ec162d
2 2
3 import re 3 import re
4 import sys 4 import sys
5 import argparse 5 import argparse
6 from typing import List 6 from typing import List
7
8 MULTIMER_MAX_SEQUENCE_COUNT = 10
7 9
8 10
9 class Fasta: 11 class Fasta:
10 def __init__(self, header_str: str, seq_str: str): 12 def __init__(self, header_str: str, seq_str: str):
11 self.header = header_str 13 self.header = header_str
70 72
71 73
72 class FastaValidator: 74 class FastaValidator:
73 def __init__( 75 def __init__(
74 self, 76 self,
75 fasta_list: List[Fasta],
76 min_length=None, 77 min_length=None,
77 max_length=None): 78 max_length=None,
79 multiple=False):
80 self.multiple = multiple
78 self.min_length = min_length 81 self.min_length = min_length
79 self.max_length = max_length 82 self.max_length = max_length
80 self.fasta_list = fasta_list
81 self.iupac_characters = { 83 self.iupac_characters = {
82 'A', 'B', 'C', 'D', 'E', 'F', 'G', 84 'A', 'B', 'C', 'D', 'E', 'F', 'G',
83 'H', 'I', 'K', 'L', 'M', 'N', 'P', 85 'H', 'I', 'K', 'L', 'M', 'N', 'P',
84 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 86 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
85 'Y', 'Z', '-' 87 'Y', 'Z', '-'
86 } 88 }
87 89
88 def validate(self): 90 def validate(self, fasta_list: List[Fasta]):
89 """Perform FASTA validation.""" 91 """Perform FASTA validation."""
92 self.fasta_list = fasta_list
90 self.validate_num_seqs() 93 self.validate_num_seqs()
91 self.validate_length() 94 self.validate_length()
92 self.validate_alphabet() 95 self.validate_alphabet()
93
94 # not checking for 'X' nucleotides at the moment. 96 # not checking for 'X' nucleotides at the moment.
95 # alphafold can throw an error if it doesn't like it. 97 # alphafold can throw an error if it doesn't like it.
96 # self.validate_x() 98 # self.validate_x()
99 return self.fasta_list
97 100
98 def validate_num_seqs(self) -> None: 101 def validate_num_seqs(self) -> None:
99 """Assert that only one sequence has been provided.""" 102 """Assert that only one sequence has been provided."""
100 if len(self.fasta_list) > 1: 103 fasta_count = len(self.fasta_list)
101 sys.stderr.write( 104
102 'WARNING: More than 1 sequence detected.' 105 if self.multiple:
103 ' Using first FASTA sequence as input.\n') 106 if fasta_count < 2:
104 self.fasta_list = self.fasta_list[:1] 107 raise ValueError(
105 elif len(self.fasta_list) == 0: 108 'Error encountered validating FASTA:\n'
106 raise ValueError( 109 'Multimer mode requires multiple input sequence.'
107 'Error encountered validating FASTA:\n' 110 f' Only {fasta_count} sequences were detected in'
108 ' input file has no FASTA sequences') 111 ' the provided file.')
112 self.fasta_list = self.fasta_list
113
114 elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT:
115 sys.stderr.write(
116 f'WARNING: detected {fasta_count} sequences but the'
117 f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}'
118 ' sequences. The last'
119 f' {fasta_count - MULTIMER_MAX_SEQUENCE_COUNT} sequence(s)'
120 ' have been discarded.\n')
121 self.fasta_list = self.fasta_list[:MULTIMER_MAX_SEQUENCE_COUNT]
122 else:
123 if fasta_count > 1:
124 sys.stderr.write(
125 'WARNING: More than 1 sequence detected.'
126 ' Using first FASTA sequence as input.\n')
127 self.fasta_list = self.fasta_list[:1]
128
129 elif len(self.fasta_list) == 0:
130 raise ValueError(
131 'Error encountered validating FASTA:\n'
132 ' no FASTA sequences detected in input file.')
109 133
110 def validate_length(self): 134 def validate_length(self):
111 """Confirm whether sequence length is valid.""" 135 """Confirm whether sequence length is valid."""
112 fasta = self.fasta_list[0] 136 fasta = self.fasta_list[0]
113 if self.min_length: 137 if self.min_length:
168 args = parse_args() 192 args = parse_args()
169 fas = FastaLoader(args.input) 193 fas = FastaLoader(args.input)
170 194
171 # validate 195 # validate
172 fv = FastaValidator( 196 fv = FastaValidator(
173 fas.fastas,
174 min_length=args.min_length, 197 min_length=args.min_length,
175 max_length=args.max_length, 198 max_length=args.max_length,
199 multiple=args.multimer,
176 ) 200 )
177 fv.validate() 201 clean_fastas = fv.validate(fas.fastas)
178 202
179 # write cleaned version 203 # write clean data
180 fw = FastaWriter() 204 fw = FastaWriter()
181 fw.write(fas.fastas[0]) 205 for fas in clean_fastas:
206 fw.write(fas)
182 207
183 except ValueError as exc: 208 except ValueError as exc:
184 sys.stderr.write(f"{exc}\n\n") 209 sys.stderr.write(f"{exc}\n\n")
185 raise exc 210 raise exc
186 211
210 dest='max_length', 235 dest='max_length',
211 help="Maximum length of input protein sequence (AA)", 236 help="Maximum length of input protein sequence (AA)",
212 default=None, 237 default=None,
213 type=int, 238 type=int,
214 ) 239 )
240 parser.add_argument(
241 "--multimer",
242 action='store_true',
243 help="Require multiple input sequences",
244 )
215 return parser.parse_args() 245 return parser.parse_args()
216 246
217 247
218 if __name__ == '__main__': 248 if __name__ == '__main__':
219 main() 249 main()