comparison scripts/validate_fasta.py @ 19:2f7702fd0a4c draft default tip

planemo upload for repository https://github.com/usegalaxy-au/tools-au commit cd0379c8ecc24977dffa462c1897d402c85fa4e6
author galaxy-australia
date Wed, 08 May 2024 06:26:55 +0000
parents e4a053d67e24
children
comparison
equal deleted inserted replaced
18:e4a053d67e24 19:2f7702fd0a4c
3 import argparse 3 import argparse
4 import re 4 import re
5 import sys 5 import sys
6 from typing import List 6 from typing import List
7 7
8 MULTIMER_MAX_SEQUENCE_COUNT = 10 8 DEFAULT_MAX_SEQUENCE_COUNT = 10
9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' '] 9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' ']
10 10
11 11
12 class Fasta: 12 class Fasta:
13 def __init__(self, header_str: str, seq_str: str): 13 def __init__(self, header_str: str, seq_str: str):
75 self.fastas.append(Fasta(header, sequence)) 75 self.fastas.append(Fasta(header, sequence))
76 76
77 77
78 class FastaValidator: 78 class FastaValidator:
79 def __init__( 79 def __init__(
80 self, 80 self,
81 min_length=None, 81 min_length=None,
82 max_length=None, 82 max_length=None,
83 multiple=False): 83 multiple=False,
84 max_sequence_count=None,
85 ):
84 self.multiple = multiple 86 self.multiple = multiple
85 self.min_length = min_length 87 self.min_length = min_length
86 self.max_length = max_length 88 self.max_length = max_length
87 self.iupac_characters = { 89 self.iupac_characters = {
88 'A', 'B', 'C', 'D', 'E', 'F', 'G', 90 'A', 'B', 'C', 'D', 'E', 'F', 'G',
89 'H', 'I', 'K', 'L', 'M', 'N', 'P', 91 'H', 'I', 'K', 'L', 'M', 'N', 'P',
90 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 92 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
91 'Y', 'Z', '-' 93 'Y', 'Z', '-'
92 } 94 }
95 self.max_sequence_count = (
96 max_sequence_count
97 or DEFAULT_MAX_SEQUENCE_COUNT)
93 98
94 def validate(self, fasta_list: List[Fasta]): 99 def validate(self, fasta_list: List[Fasta]):
95 """Perform FASTA validation.""" 100 """Perform FASTA validation."""
96 self.fasta_list = fasta_list 101 self.fasta_list = fasta_list
97 self.validate_num_seqs() 102 self.validate_num_seqs()
112 'Error encountered validating FASTA:\n' 117 'Error encountered validating FASTA:\n'
113 'Multimer mode requires multiple input sequence.' 118 'Multimer mode requires multiple input sequence.'
114 f' Only {fasta_count} sequences were detected in' 119 f' Only {fasta_count} sequences were detected in'
115 ' the provided file.') 120 ' the provided file.')
116 121
117 elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT: 122 elif fasta_count > self.max_sequence_count:
118 sys.stderr.write( 123 raise ValueError(
119 f'WARNING: detected {fasta_count} sequences but the' 124 f'WARNING: detected {fasta_count} sequences but the'
120 f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}' 125 f' maximum allowed is {self.max_sequence_count}'
121 ' sequences. The last' 126 ' sequences.')
122 f' {fasta_count - MULTIMER_MAX_SEQUENCE_COUNT} sequence(s)'
123 ' have been discarded.\n')
124 self.fasta_list = self.fasta_list[:MULTIMER_MAX_SEQUENCE_COUNT]
125 else: 127 else:
126 if fasta_count > 1: 128 if fasta_count > 1:
127 sys.stderr.write( 129 sys.stderr.write(
128 'WARNING: More than 1 sequence detected.' 130 'WARNING: More than 1 sequence detected.'
129 ' Using first FASTA sequence as input.\n') 131 ' Using first FASTA sequence as input.\n')
198 # validate 200 # validate
199 fv = FastaValidator( 201 fv = FastaValidator(
200 min_length=args.min_length, 202 min_length=args.min_length,
201 max_length=args.max_length, 203 max_length=args.max_length,
202 multiple=args.multimer, 204 multiple=args.multimer,
205 max_sequence_count=args.max_sequence_count,
203 ) 206 )
204 clean_fastas = fv.validate(fas.fastas) 207 clean_fastas = fv.validate(fas.fastas)
205 208
206 # write clean data 209 # write clean data
207 fw = FastaWriter() 210 fw = FastaWriter()
244 help="Maximum length of input protein sequence (AA)", 247 help="Maximum length of input protein sequence (AA)",
245 default=None, 248 default=None,
246 type=int, 249 type=int,
247 ) 250 )
248 parser.add_argument( 251 parser.add_argument(
252 "--max-sequences",
253 dest='max_sequence_count',
254 help="Maximum number of input sequences",
255 default=None,
256 type=int,
257 )
258 parser.add_argument(
249 "--multimer", 259 "--multimer",
250 action='store_true', 260 action='store_true',
251 help="Require multiple input sequences", 261 help="Require multiple input sequences",
252 ) 262 )
253 return parser.parse_args() 263 return parser.parse_args()