Mercurial > repos > galaxy-australia > alphafold2
comparison scripts/validate_fasta.py @ 19:2f7702fd0a4c draft
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit cd0379c8ecc24977dffa462c1897d402c85fa4e6
author | galaxy-australia |
---|---|
date | Wed, 08 May 2024 06:26:55 +0000 |
parents | e4a053d67e24 |
children |
Comparison legend (row types used in the table below):
equal
deleted
inserted
replaced
18:e4a053d67e24 | 19:2f7702fd0a4c |
---|---|
3 import argparse | 3 import argparse |
4 import re | 4 import re |
5 import sys | 5 import sys |
6 from typing import List | 6 from typing import List |
7 | 7 |
8 MULTIMER_MAX_SEQUENCE_COUNT = 10 | 8 DEFAULT_MAX_SEQUENCE_COUNT = 10 |
9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' '] | 9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' '] |
10 | 10 |
11 | 11 |
12 class Fasta: | 12 class Fasta: |
13 def __init__(self, header_str: str, seq_str: str): | 13 def __init__(self, header_str: str, seq_str: str): |
75 self.fastas.append(Fasta(header, sequence)) | 75 self.fastas.append(Fasta(header, sequence)) |
76 | 76 |
77 | 77 |
78 class FastaValidator: | 78 class FastaValidator: |
79 def __init__( | 79 def __init__( |
80 self, | 80 self, |
81 min_length=None, | 81 min_length=None, |
82 max_length=None, | 82 max_length=None, |
83 multiple=False): | 83 multiple=False, |
| | 84 max_sequence_count=None, |
| | 85 ): |
84 self.multiple = multiple | 86 self.multiple = multiple |
85 self.min_length = min_length | 87 self.min_length = min_length |
86 self.max_length = max_length | 88 self.max_length = max_length |
87 self.iupac_characters = { | 89 self.iupac_characters = { |
88 'A', 'B', 'C', 'D', 'E', 'F', 'G', | 90 'A', 'B', 'C', 'D', 'E', 'F', 'G', |
89 'H', 'I', 'K', 'L', 'M', 'N', 'P', | 91 'H', 'I', 'K', 'L', 'M', 'N', 'P', |
90 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', | 92 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
91 'Y', 'Z', '-' | 93 'Y', 'Z', '-' |
92 } | 94 } |
| | 95 self.max_sequence_count = ( |
| | 96 max_sequence_count |
| | 97 or DEFAULT_MAX_SEQUENCE_COUNT) |
93 | 98 |
94 def validate(self, fasta_list: List[Fasta]): | 99 def validate(self, fasta_list: List[Fasta]): |
95 """Perform FASTA validation.""" | 100 """Perform FASTA validation.""" |
96 self.fasta_list = fasta_list | 101 self.fasta_list = fasta_list |
97 self.validate_num_seqs() | 102 self.validate_num_seqs() |
112 'Error encountered validating FASTA:\n' | 117 'Error encountered validating FASTA:\n' |
113 'Multimer mode requires multiple input sequence.' | 118 'Multimer mode requires multiple input sequence.' |
114 f' Only {fasta_count} sequences were detected in' | 119 f' Only {fasta_count} sequences were detected in' |
115 ' the provided file.') | 120 ' the provided file.') |
116 | 121 |
117 elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT: | 122 elif fasta_count > self.max_sequence_count: |
118 sys.stderr.write( | 123 raise ValueError( |
119 f'WARNING: detected {fasta_count} sequences but the' | 124 f'WARNING: detected {fasta_count} sequences but the' |
120 f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}' | 125 f' maximum allowed is {self.max_sequence_count}' |
121 ' sequences. The last' | 126 ' sequences.') |
122 f' {fasta_count - MULTIMER_MAX_SEQUENCE_COUNT} sequence(s)' | |
123 ' have been discarded.\n') | |
124 self.fasta_list = self.fasta_list[:MULTIMER_MAX_SEQUENCE_COUNT] | |
125 else: | 127 else: |
126 if fasta_count > 1: | 128 if fasta_count > 1: |
127 sys.stderr.write( | 129 sys.stderr.write( |
128 'WARNING: More than 1 sequence detected.' | 130 'WARNING: More than 1 sequence detected.' |
129 ' Using first FASTA sequence as input.\n') | 131 ' Using first FASTA sequence as input.\n') |
198 # validate | 200 # validate |
199 fv = FastaValidator( | 201 fv = FastaValidator( |
200 min_length=args.min_length, | 202 min_length=args.min_length, |
201 max_length=args.max_length, | 203 max_length=args.max_length, |
202 multiple=args.multimer, | 204 multiple=args.multimer, |
| | 205 max_sequence_count=args.max_sequence_count, |
203 ) | 206 ) |
204 clean_fastas = fv.validate(fas.fastas) | 207 clean_fastas = fv.validate(fas.fastas) |
205 | 208 |
206 # write clean data | 209 # write clean data |
207 fw = FastaWriter() | 210 fw = FastaWriter() |
244 help="Maximum length of input protein sequence (AA)", | 247 help="Maximum length of input protein sequence (AA)", |
245 default=None, | 248 default=None, |
246 type=int, | 249 type=int, |
247 ) | 250 ) |
248 parser.add_argument( | 251 parser.add_argument( |
| | 252 "--max-sequences", |
| | 253 dest='max_sequence_count', |
| | 254 help="Maximum number of input sequences", |
| | 255 default=None, |
| | 256 type=int, |
| | 257 ) |
| | 258 parser.add_argument( |
249 "--multimer", | 259 "--multimer", |
250 action='store_true', | 260 action='store_true', |
251 help="Require multiple input sequences", | 261 help="Require multiple input sequences", |
252 ) | 262 ) |
253 return parser.parse_args() | 263 return parser.parse_args() |