Previous changeset 5:138feebde7d3 (2022-03-10) Next changeset 7:eb085b3dbaf8 (2022-04-19) |
Commit message:
"planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 724a7a389c878dded1c0332f3b6e507e0c4cd52a-dirty" |
modified:
alphafold.xml validate_fasta.py |
b |
diff -r 138feebde7d3 -r 04e95886cf24 alphafold.xml --- a/alphafold.xml Thu Mar 10 21:53:42 2022 +0000 +++ b/alphafold.xml Mon Apr 04 01:46:22 2022 +0000 |
b |
@@ -29,7 +29,10 @@ echo '$fasta_or_text.fasta_text' > input.fasta && #end if -python3 '$__tool_directory__/validate_fasta.py' input.fasta && +python3 '$__tool_directory__/validate_fasta.py' input.fasta +--min_length \${ALPHAFOLD_AA_LENGTH_MIN:-30} +--max_length \${ALPHAFOLD_AA_LENGTH_MAX:-2000} +> alphafold.fasta && ## env vars ------------------------------- export TF_FORCE_UNIFIED_MEMORY=1 && @@ -49,7 +52,7 @@ --max_template_date=\$DATE --bfd_database_path \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt --uniclust30_database_path \${ALPHAFOLD_DB:-/data}/uniclust30/uniclust30_2018_08/uniclust30_2018_08 -## use_gpu_relax introduced in AlphaFold v2.1.2 +## Param introduced in AlphaFold v2.1.2: --use_gpu_relax=True && |
b |
diff -r 138feebde7d3 -r 04e95886cf24 validate_fasta.py --- a/validate_fasta.py Thu Mar 10 21:53:42 2022 +0000 +++ b/validate_fasta.py Mon Apr 04 01:46:22 2022 +0000 |
[ |
b'@@ -1,6 +1,7 @@\n """Validate input FASTA sequence."""\r\n \r\n import re\r\n+import sys\r\n import argparse\r\n from typing import List, TextIO\r\n \r\n@@ -16,10 +17,6 @@\n """Initialize from FASTA file."""\r\n self.fastas = []\r\n self.load(fasta_path)\r\n- print("Loaded FASTA sequences:")\r\n- for f in self.fastas:\r\n- print(f.header)\r\n- print(f.aa_seq)\r\n \r\n def load(self, fasta_path: str):\r\n """Load bare or FASTA formatted sequence."""\r\n@@ -29,36 +26,32 @@\n if "__cn__" in self.content:\r\n # Pasted content with escaped characters\r\n self.newline = \'__cn__\'\r\n- self.caret = \'__gt__\'\r\n+ self.read_caret = \'__gt__\'\r\n else:\r\n # Uploaded file with normal content\r\n self.newline = \'\\n\'\r\n- self.caret = \'>\'\r\n+ self.read_caret = \'>\'\r\n \r\n self.lines = self.content.split(self.newline)\r\n- header, sequence = self.interpret_first_line()\r\n+\r\n+ if not self.lines[0].startswith(self.read_caret):\r\n+ # Fasta is headless, load as single sequence\r\n+ self.update_fastas(\r\n+ \'\', \'\'.join(self.lines)\r\n+ )\r\n \r\n- i = 0\r\n- while i < len(self.lines):\r\n- line = self.lines[i]\r\n- if line.startswith(self.caret):\r\n- self.update_fastas(header, sequence)\r\n- header = \'>\' + self.strip_header(line)\r\n- sequence = \'\'\r\n- else:\r\n- sequence += line.strip(\'\\n \')\r\n- i += 1\r\n-\r\n- # after reading whole file, header & sequence buffers might be full\r\n- self.update_fastas(header, sequence)\r\n-\r\n- def interpret_first_line(self):\r\n- line = self.lines[0]\r\n- if line.startswith(self.caret):\r\n- header = \'>\' + self.strip_header(line)\r\n- return header, \'\'\r\n else:\r\n- return \'\', line\r\n+ header = None\r\n+ sequence = None\r\n+ for line in self.lines:\r\n+ if line.startswith(self.read_caret):\r\n+ if header:\r\n+ self.update_fastas(header, sequence)\r\n+ header = \'>\' + self.strip_header(line)\r\n+ sequence = \'\'\r\n+ else:\r\n+ sequence += line.strip(\'\\n \')\r\n+ self.update_fastas(header, sequence)\r\n \r\n def strip_header(self, line):\r\n """Strip characters escaped with underscores from pasted text."""\r\n@@ -77,10 +70,14 @@\n \r\n \r\n class FastaValidator:\r\n- def __init__(self, fasta_list: List[Fasta]):\r\n+ def __init__(\r\n+ self,\r\n+ fasta_list: List[Fasta],\r\n+ min_length=None,\r\n+ max_length=None):\r\n+ self.min_length = min_length\r\n+ self.max_length = max_length\r\n self.fasta_list = fasta_list\r\n- self.min_length = 30\r\n- self.max_length = 2000\r\n self.iupac_characters = {\r\n \'A\', \'B\', \'C\', \'D\', \'E\', \'F\', \'G\',\r\n \'H\', \'I\', \'K\', \'L\', \'M\', \'N\', \'P\',\r\n@@ -93,68 +90,89 @@\n self.validate_num_seqs()\r\n self.validate_length()\r\n self.validate_alphabet()\r\n+\r\n # not checking for \'X\' nucleotides at the moment.\r\n # alphafold can throw an error if it doesn\'t like it.\r\n- #self.validate_x()\r\n+ # self.validate_x()\r\n \r\n def validate_num_seqs(self) -> None:\r\n+ """Assert that only one sequence has been provided."""\r\n if len(self.fasta_list) > 1:\r\n- raise Exception(f\'Error encountered validating fasta: More than 1 sequence detected ({len(self.fasta_list)}). Please use single fasta sequence as input\')\r\n+ raise Exception(\r\n+ \'Error encountered validating fasta:\'\r\n+ f\' More than 1 sequence detected ({len(self.fasta_list)}).\'\r\n+ \' Please use single fasta sequence as input.\')\r\n elif len(self.fasta_list) =='..b'ngth:\r\n+ if len(fasta.aa_seq) < self.min_length:\r\n+ raise Exception(\r\n+ \'Error encountered validating fasta: Sequence too short\'\r\n+ f\' ({len(fasta.aa_seq)}AA).\'\r\n+ f\' Minimum length is {self.min_length}AA.\')\r\n+ if self.max_length:\r\n+ if len(fasta.aa_seq) > self.max_length:\r\n+ raise Exception(\r\n+ \'Error encountered validating fasta:\'\r\n+ f\' Sequence too long ({len(fasta.aa_seq)}AA).\'\r\n+ f\' Maximum length is {self.max_length}AA.\')\r\n \r\n def validate_alphabet(self):\r\n """\r\n- Confirms whether the sequence conforms to IUPAC codes.\r\n- If not, reports the offending character and its position.\r\n+ Confirm whether the sequence conforms to IUPAC codes.\r\n+ If not, report the offending character and its position.\r\n """\r\n fasta = self.fasta_list[0]\r\n for i, char in enumerate(fasta.aa_seq.upper()):\r\n if char not in self.iupac_characters:\r\n- raise Exception(f\'Error encountered validating fasta: Invalid amino acid found at pos {i}: "{char}"\')\r\n+ raise Exception(\r\n+ \'Error encountered validating fasta: Invalid amino acid\'\r\n+ f\' found at pos {i}: "{char}"\')\r\n \r\n def validate_x(self):\r\n- """checks if any bases are X. TODO check whether alphafold accepts X bases. """\r\n+ """Check for X bases."""\r\n fasta = self.fasta_list[0]\r\n for i, char in enumerate(fasta.aa_seq.upper()):\r\n if char == \'X\':\r\n- raise Exception(f\'Error encountered validating fasta: Unsupported aa code "X" found at pos {i}\')\r\n+ raise Exception(\r\n+ \'Error encountered validating fasta: Unsupported AA code\'\r\n+ f\' "X" found at pos {i}\')\r\n \r\n \r\n class FastaWriter:\r\n def __init__(self) -> None:\r\n- self.outfile = \'alphafold.fasta\'\r\n- self.formatted_line_len = 60\r\n+ self.line_wrap = 60\r\n \r\n def write(self, fasta: Fasta):\r\n- with open(self.outfile, \'w\') as fp:\r\n- header = fasta.header\r\n- seq = self.format_sequence(fasta.aa_seq)\r\n- fp.write(header + \'\\n\')\r\n- fp.write(seq + \'\\n\')\r\n+ header = fasta.header\r\n+ seq = self.format_sequence(fasta.aa_seq)\r\n+ sys.stdout.write(header + \'\\n\')\r\n+ sys.stdout.write(seq)\r\n \r\n def format_sequence(self, aa_seq: str):\r\n formatted_seq = \'\'\r\n- for i in range(0, len(aa_seq), self.formatted_line_len):\r\n- formatted_seq += aa_seq[i: i + self.formatted_line_len] + \'\\n\'\r\n+ for i in range(0, len(aa_seq), self.line_wrap):\r\n+ formatted_seq += aa_seq[i: i + self.line_wrap] + \'\\n\'\r\n return formatted_seq\r\n \r\n \r\n def main():\r\n # load fasta file\r\n args = parse_args()\r\n- fas = FastaLoader(args.input_fasta)\r\n+ fas = FastaLoader(args.input)\r\n \r\n # validate\r\n- fv = FastaValidator(fas.fastas)\r\n+ fv = FastaValidator(\r\n+ fas.fastas,\r\n+ min_length=args.min_length,\r\n+ max_length=args.max_length,\r\n+ )\r\n fv.validate()\r\n \r\n # write cleaned version\r\n@@ -165,13 +183,26 @@\n def parse_args() -> argparse.Namespace:\r\n parser = argparse.ArgumentParser()\r\n parser.add_argument(\r\n- "input_fasta",\r\n+ "input",\r\n help="input fasta file",\r\n type=str\r\n )\r\n+ parser.add_argument(\r\n+ "--min_length",\r\n+ dest=\'min_length\',\r\n+ help="Minimum length of input protein sequence (AA)",\r\n+ default=None,\r\n+ type=int,\r\n+ )\r\n+ parser.add_argument(\r\n+ "--max_length",\r\n+ dest=\'max_length\',\r\n+ help="Maximum length of input protein sequence (AA)",\r\n+ default=None,\r\n+ type=int,\r\n+ )\r\n return parser.parse_args()\r\n \r\n \r\n-\r\n if __name__ == \'__main__\':\r\n main()\r\n' |