Mercurial > repos > galaxy-australia > alphafold2
diff validate_fasta.py @ 9:3bd420ec162d draft
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 7726c3cba165bdc8fc6366ec0ce6596e55657468
author | galaxy-australia |
---|---|
date | Tue, 13 Sep 2022 22:04:12 +0000 |
parents | ca90d17ff51b |
children | d00e15139065 |
line wrap: on
line diff
--- a/validate_fasta.py Fri Aug 19 00:29:16 2022 +0000
+++ b/validate_fasta.py Tue Sep 13 22:04:12 2022 +0000
@@ -5,6 +5,8 @@
 import argparse
 from typing import List
 
+MULTIMER_MAX_SEQUENCE_COUNT = 10
+
 
 class Fasta:
     def __init__(self, header_str: str, seq_str: str):
@@ -72,12 +74,12 @@
 class FastaValidator:
     def __init__(
             self,
-            fasta_list: List[Fasta],
             min_length=None,
-            max_length=None):
+            max_length=None,
+            multiple=False):
+        self.multiple = multiple
         self.min_length = min_length
         self.max_length = max_length
-        self.fasta_list = fasta_list
         self.iupac_characters = {
             'A', 'B', 'C', 'D', 'E', 'F', 'G',
             'H', 'I', 'K', 'L', 'M', 'N', 'P',
@@ -85,27 +87,49 @@
             'Y', 'Z', '-'
         }
 
-    def validate(self):
+    def validate(self, fasta_list: List[Fasta]):
         """Perform FASTA validation."""
+        self.fasta_list = fasta_list
         self.validate_num_seqs()
         self.validate_length()
         self.validate_alphabet()
-
         # not checking for 'X' nucleotides at the moment.
         # alphafold can throw an error if it doesn't like it.
         # self.validate_x()
+        return self.fasta_list
 
     def validate_num_seqs(self) -> None:
         """Assert that only one sequence has been provided."""
-        if len(self.fasta_list) > 1:
-            sys.stderr.write(
-                'WARNING: More than 1 sequence detected.'
-                ' Using first FASTA sequence as input.\n')
-            self.fasta_list = self.fasta_list[:1]
-        elif len(self.fasta_list) == 0:
-            raise ValueError(
-                'Error encountered validating FASTA:\n'
-                ' input file has no FASTA sequences')
+        fasta_count = len(self.fasta_list)
+
+        if self.multiple:
+            if fasta_count < 2:
+                raise ValueError(
+                    'Error encountered validating FASTA:\n'
+                    'Multimer mode requires multiple input sequence.'
+                    f' Only {fasta_count} sequences were detected in'
+                    ' the provided file.')
+                self.fasta_list = self.fasta_list
+
+            elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT:
+                sys.stderr.write(
+                    f'WARNING: detected {fasta_count} sequences but the'
+                    f' maximum allowed is {MULTIMER_MAX_SEQUENCE_COUNT}'
+                    ' sequences. The last'
+                    f' {fasta_count - MULTIMER_MAX_SEQUENCE_COUNT} sequence(s)'
+                    ' have been discarded.\n')
+                self.fasta_list = self.fasta_list[:MULTIMER_MAX_SEQUENCE_COUNT]
+        else:
+            if fasta_count > 1:
+                sys.stderr.write(
+                    'WARNING: More than 1 sequence detected.'
+                    ' Using first FASTA sequence as input.\n')
+                self.fasta_list = self.fasta_list[:1]
+
+            elif len(self.fasta_list) == 0:
+                raise ValueError(
+                    'Error encountered validating FASTA:\n'
+                    ' no FASTA sequences detected in input file.')
 
     def validate_length(self):
         """Confirm whether sequence length is valid."""
@@ -170,15 +194,16 @@
 
         # validate
         fv = FastaValidator(
-            fas.fastas,
             min_length=args.min_length,
             max_length=args.max_length,
+            multiple=args.multimer,
         )
-        fv.validate()
+        clean_fastas = fv.validate(fas.fastas)
 
-        # write cleaned version
+        # write clean data
         fw = FastaWriter()
-        fw.write(fas.fastas[0])
+        for fas in clean_fastas:
+            fw.write(fas)
 
     except ValueError as exc:
         sys.stderr.write(f"{exc}\n\n")
@@ -212,6 +237,11 @@
         default=None,
         type=int,
     )
+    parser.add_argument(
+        "--multimer",
+        action='store_true',
+        help="Require multiple input sequences",
+    )
     return parser.parse_args()
 
 