Mercurial > repos > galaxy-australia > alphafold2
comparison scripts/validate_fasta.py @ 19:2f7702fd0a4c draft
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit cd0379c8ecc24977dffa462c1897d402c85fa4e6
| author | galaxy-australia | 
|---|---|
| date | Wed, 08 May 2024 06:26:55 +0000 | 
| parents | e4a053d67e24 | 
| children | 2891385d6ace | 
   comparison
  equal
  deleted
  inserted
  replaced
| 18:e4a053d67e24 | 19:2f7702fd0a4c | 
|---|---|
| 3 import argparse | 3 import argparse | 
| 4 import re | 4 import re | 
| 5 import sys | 5 import sys | 
| 6 from typing import List | 6 from typing import List | 
| 7 | 7 | 
| 8 MULTIMER_MAX_SEQUENCE_COUNT = 10 | 8 DEFAULT_MAX_SEQUENCE_COUNT = 10 | 
| 9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' '] | 9 STRIP_SEQUENCE_CHARS = ['\n', '\r', '\t', ' '] | 
| 10 | 10 | 
| 11 | 11 | 
| 12 class Fasta: | 12 class Fasta: | 
| 13 def __init__(self, header_str: str, seq_str: str): | 13 def __init__(self, header_str: str, seq_str: str): | 
| 75 self.fastas.append(Fasta(header, sequence)) | 75 self.fastas.append(Fasta(header, sequence)) | 
| 76 | 76 | 
| 77 | 77 | 
| 78 class FastaValidator: | 78 class FastaValidator: | 
| 79 def __init__( | 79 def __init__( | 
| 80 self, | 80 self, | 
| 81 min_length=None, | 81 min_length=None, | 
| 82 max_length=None, | 82 max_length=None, | 
| 83 multiple=False): | 83 multiple=False, | 
| | 84 max_sequence_count=None, |
| | 85 ): |
| 84 self.multiple = multiple | 86 self.multiple = multiple | 
| 85 self.min_length = min_length | 87 self.min_length = min_length | 
| 86 self.max_length = max_length | 88 self.max_length = max_length | 
| 87 self.iupac_characters = { | 89 self.iupac_characters = { | 
| 88 'A', 'B', 'C', 'D', 'E', 'F', 'G', | 90 'A', 'B', 'C', 'D', 'E', 'F', 'G', | 
| 89 'H', 'I', 'K', 'L', 'M', 'N', 'P', | 91 'H', 'I', 'K', 'L', 'M', 'N', 'P', | 
| 90 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', | 92 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', | 
| 91 'Y', 'Z', '-' | 93 'Y', 'Z', '-' | 
| 92 } | 94 } | 
| | 95 self.max_sequence_count = ( |
| | 96 max_sequence_count |
| | 97 or DEFAULT_MAX_SEQUENCE_COUNT) |
| 93 | 98 | 
| 94 def validate(self, fasta_list: List[Fasta]): | 99 def validate(self, fasta_list: List[Fasta]): | 
| 95 """Perform FASTA validation.""" | 100 """Perform FASTA validation.""" | 
| 96 self.fasta_list = fasta_list | 101 self.fasta_list = fasta_list | 
| 97 self.validate_num_seqs() | 102 self.validate_num_seqs() | 
| 112 'Error encountered validating FASTA:\n' | 117 'Error encountered validating FASTA:\n' | 
| 113 'Multimer mode requires multiple input sequence.' | 118 'Multimer mode requires multiple input sequence.' | 
| 114 f' Only {fasta_count} sequences were detected in' | 119 f' Only {fasta_count} sequences were detected in' | 
| 115 ' the provided file.') | 120 ' the provided file.') | 
| 116 | 121 | 
| 117 elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT: | 122 elif fasta_count > self.max_sequence_count: | 
| 118 sys.stderr.write( | 123 raise ValueError( | 
| 119 f'WARNING: detected {fasta_count} sequences but the' | 124 f'WARNING: detected {fasta_count} sequences but the' | 
| 120 f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}' | 125 f' maximum allowed is {self.max_sequence_count}' | 
| 121 ' sequences. The last' | 126 ' sequences.') | 
| 122 f' {fasta_count - MULTIMER_MAX_SEQUENCE_COUNT} sequence(s)' | |
| 123 ' have been discarded.\n') | |
| 124 self.fasta_list = self.fasta_list[:MULTIMER_MAX_SEQUENCE_COUNT] | |
| 125 else: | 127 else: | 
| 126 if fasta_count > 1: | 128 if fasta_count > 1: | 
| 127 sys.stderr.write( | 129 sys.stderr.write( | 
| 128 'WARNING: More than 1 sequence detected.' | 130 'WARNING: More than 1 sequence detected.' | 
| 129 ' Using first FASTA sequence as input.\n') | 131 ' Using first FASTA sequence as input.\n') | 
| 198 # validate | 200 # validate | 
| 199 fv = FastaValidator( | 201 fv = FastaValidator( | 
| 200 min_length=args.min_length, | 202 min_length=args.min_length, | 
| 201 max_length=args.max_length, | 203 max_length=args.max_length, | 
| 202 multiple=args.multimer, | 204 multiple=args.multimer, | 
| | 205 max_sequence_count=args.max_sequence_count, |
| 203 ) | 206 ) | 
| 204 clean_fastas = fv.validate(fas.fastas) | 207 clean_fastas = fv.validate(fas.fastas) | 
| 205 | 208 | 
| 206 # write clean data | 209 # write clean data | 
| 207 fw = FastaWriter() | 210 fw = FastaWriter() | 
| 244 help="Maximum length of input protein sequence (AA)", | 247 help="Maximum length of input protein sequence (AA)", | 
| 245 default=None, | 248 default=None, | 
| 246 type=int, | 249 type=int, | 
| 247 ) | 250 ) | 
| 248 parser.add_argument( | 251 parser.add_argument( | 
| | 252 "--max-sequences", |
| | 253 dest='max_sequence_count', |
| | 254 help="Maximum number of input sequences", |
| | 255 default=None, |
| | 256 type=int, |
| | 257 ) |
| | 258 parser.add_argument( |
| 249 "--multimer", | 259 "--multimer", | 
| 250 action='store_true', | 260 action='store_true', | 
| 251 help="Require multiple input sequences", | 261 help="Require multiple input sequences", | 
| 252 ) | 262 ) | 
| 253 return parser.parse_args() | 263 return parser.parse_args() | 
