Mercurial > repos > galaxy-australia > alphafold2
comparison scripts/validate_fasta.py @ 19:2f7702fd0a4c draft
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit cd0379c8ecc24977dffa462c1897d402c85fa4e6
| author | galaxy-australia |
|---|---|
| date | Wed, 08 May 2024 06:26:55 +0000 |
| parents | e4a053d67e24 |
| children | 2891385d6ace |
Comparison legend (row types):
- equal
- deleted
- inserted
- replaced
| 18:e4a053d67e24 | 19:2f7702fd0a4c |
|---|---|
| 3 import argparse | 3 import argparse |
| 4 import re | 4 import re |
| 5 import sys | 5 import sys |
| 6 from typing import List | 6 from typing import List |
| 7 | 7 |
| 8 MULTIMER_MAX_SEQUENCE_COUNT = 10 | 8 DEFAULT_MAX_SEQUENCE_COUNT = 10 |
| 9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' '] | 9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' '] |
| 10 | 10 |
| 11 | 11 |
| 12 class Fasta: | 12 class Fasta: |
| 13 def __init__(self, header_str: str, seq_str: str): | 13 def __init__(self, header_str: str, seq_str: str): |
| 75 self.fastas.append(Fasta(header, sequence)) | 75 self.fastas.append(Fasta(header, sequence)) |
| 76 | 76 |
| 77 | 77 |
| 78 class FastaValidator: | 78 class FastaValidator: |
| 79 def __init__( | 79 def __init__( |
| 80 self, | 80 self, |
| 81 min_length=None, | 81 min_length=None, |
| 82 max_length=None, | 82 max_length=None, |
| 83 multiple=False): | 83 multiple=False, |
| 84 max_sequence_count=None, | |
| 85 ): | |
| 84 self.multiple = multiple | 86 self.multiple = multiple |
| 85 self.min_length = min_length | 87 self.min_length = min_length |
| 86 self.max_length = max_length | 88 self.max_length = max_length |
| 87 self.iupac_characters = { | 89 self.iupac_characters = { |
| 88 'A', 'B', 'C', 'D', 'E', 'F', 'G', | 90 'A', 'B', 'C', 'D', 'E', 'F', 'G', |
| 89 'H', 'I', 'K', 'L', 'M', 'N', 'P', | 91 'H', 'I', 'K', 'L', 'M', 'N', 'P', |
| 90 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', | 92 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
| 91 'Y', 'Z', '-' | 93 'Y', 'Z', '-' |
| 92 } | 94 } |
| 95 self.max_sequence_count = ( | |
| 96 max_sequence_count | |
| 97 or DEFAULT_MAX_SEQUENCE_COUNT) | |
| 93 | 98 |
| 94 def validate(self, fasta_list: List[Fasta]): | 99 def validate(self, fasta_list: List[Fasta]): |
| 95 """Perform FASTA validation.""" | 100 """Perform FASTA validation.""" |
| 96 self.fasta_list = fasta_list | 101 self.fasta_list = fasta_list |
| 97 self.validate_num_seqs() | 102 self.validate_num_seqs() |
| 112 'Error encountered validating FASTA:\n' | 117 'Error encountered validating FASTA:\n' |
| 113 'Multimer mode requires multiple input sequence.' | 118 'Multimer mode requires multiple input sequence.' |
| 114 f' Only {fasta_count} sequences were detected in' | 119 f' Only {fasta_count} sequences were detected in' |
| 115 ' the provided file.') | 120 ' the provided file.') |
| 116 | 121 |
| 117 elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT: | 122 elif fasta_count > self.max_sequence_count: |
| 118 sys.stderr.write( | 123 raise ValueError( |
| 119 f'WARNING: detected {fasta_count} sequences but the' | 124 f'WARNING: detected {fasta_count} sequences but the' |
| 120 f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}' | 125 f' maximum allowed is {self.max_sequence_count}' |
| 121 ' sequences. The last' | 126 ' sequences.') |
| 122 f' {fasta_count - MULTIMER_MAX_SEQUENCE_COUNT} sequence(s)' | |
| 123 ' have been discarded.\n') | |
| 124 self.fasta_list = self.fasta_list[:MULTIMER_MAX_SEQUENCE_COUNT] | |
| 125 else: | 127 else: |
| 126 if fasta_count > 1: | 128 if fasta_count > 1: |
| 127 sys.stderr.write( | 129 sys.stderr.write( |
| 128 'WARNING: More than 1 sequence detected.' | 130 'WARNING: More than 1 sequence detected.' |
| 129 ' Using first FASTA sequence as input.\n') | 131 ' Using first FASTA sequence as input.\n') |
| 198 # validate | 200 # validate |
| 199 fv = FastaValidator( | 201 fv = FastaValidator( |
| 200 min_length=args.min_length, | 202 min_length=args.min_length, |
| 201 max_length=args.max_length, | 203 max_length=args.max_length, |
| 202 multiple=args.multimer, | 204 multiple=args.multimer, |
| 205 max_sequence_count=args.max_sequence_count, | |
| 203 ) | 206 ) |
| 204 clean_fastas = fv.validate(fas.fastas) | 207 clean_fastas = fv.validate(fas.fastas) |
| 205 | 208 |
| 206 # write clean data | 209 # write clean data |
| 207 fw = FastaWriter() | 210 fw = FastaWriter() |
| 244 help="Maximum length of input protein sequence (AA)", | 247 help="Maximum length of input protein sequence (AA)", |
| 245 default=None, | 248 default=None, |
| 246 type=int, | 249 type=int, |
| 247 ) | 250 ) |
| 248 parser.add_argument( | 251 parser.add_argument( |
| 252 "--max-sequences", | |
| 253 dest='max_sequence_count', | |
| 254 help="Maximum number of input sequences", | |
| 255 default=None, | |
| 256 type=int, | |
| 257 ) | |
| 258 parser.add_argument( | |
| 249 "--multimer", | 259 "--multimer", |
| 250 action='store_true', | 260 action='store_true', |
| 251 help="Require multiple input sequences", | 261 help="Require multiple input sequences", |
| 252 ) | 262 ) |
| 253 return parser.parse_args() | 263 return parser.parse_args() |
