Repository 'alphafold2'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxy-australia/alphafold2

Changeset 6:04e95886cf24 (2022-04-04)
Previous changeset 5:138feebde7d3 (2022-03-10) Next changeset 7:eb085b3dbaf8 (2022-04-19)
Commit message:
"planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 724a7a389c878dded1c0332f3b6e507e0c4cd52a-dirty"
modified:
alphafold.xml
validate_fasta.py
b
diff -r 138feebde7d3 -r 04e95886cf24 alphafold.xml
--- a/alphafold.xml Thu Mar 10 21:53:42 2022 +0000
+++ b/alphafold.xml Mon Apr 04 01:46:22 2022 +0000
b
@@ -29,7 +29,10 @@
     echo '$fasta_or_text.fasta_text' > input.fasta &&
 #end if
 
-python3 '$__tool_directory__/validate_fasta.py' input.fasta &&
+python3 '$__tool_directory__/validate_fasta.py' input.fasta
+--min_length \${ALPHAFOLD_AA_LENGTH_MIN:-30}
+--max_length \${ALPHAFOLD_AA_LENGTH_MAX:-2000}
+> alphafold.fasta &&
 
 ## env vars -------------------------------
 export TF_FORCE_UNIFIED_MEMORY=1 &&
@@ -49,7 +52,7 @@
 --max_template_date=\$DATE
 --bfd_database_path \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
 --uniclust30_database_path \${ALPHAFOLD_DB:-/data}/uniclust30/uniclust30_2018_08/uniclust30_2018_08
-## use_gpu_relax introduced in AlphaFold v2.1.2
+## Param introduced in AlphaFold v2.1.2:
 --use_gpu_relax=True
 &&
 
b
diff -r 138feebde7d3 -r 04e95886cf24 validate_fasta.py
--- a/validate_fasta.py Thu Mar 10 21:53:42 2022 +0000
+++ b/validate_fasta.py Mon Apr 04 01:46:22 2022 +0000
[
b'@@ -1,6 +1,7 @@\n """Validate input FASTA sequence."""\r\n \r\n import re\r\n+import sys\r\n import argparse\r\n from typing import List, TextIO\r\n \r\n@@ -16,10 +17,6 @@\n         """Initialize from FASTA file."""\r\n         self.fastas = []\r\n         self.load(fasta_path)\r\n-        print("Loaded FASTA sequences:")\r\n-        for f in self.fastas:\r\n-            print(f.header)\r\n-            print(f.aa_seq)\r\n \r\n     def load(self, fasta_path: str):\r\n         """Load bare or FASTA formatted sequence."""\r\n@@ -29,36 +26,32 @@\n         if "__cn__" in self.content:\r\n             # Pasted content with escaped characters\r\n             self.newline = \'__cn__\'\r\n-            self.caret = \'__gt__\'\r\n+            self.read_caret = \'__gt__\'\r\n         else:\r\n             # Uploaded file with normal content\r\n             self.newline = \'\\n\'\r\n-            self.caret = \'>\'\r\n+            self.read_caret = \'>\'\r\n \r\n         self.lines = self.content.split(self.newline)\r\n-        header, sequence = self.interpret_first_line()\r\n+\r\n+        if not self.lines[0].startswith(self.read_caret):\r\n+            # Fasta is headless, load as single sequence\r\n+            self.update_fastas(\r\n+                \'\', \'\'.join(self.lines)\r\n+            )\r\n \r\n-        i = 0\r\n-        while i < len(self.lines):\r\n-            line = self.lines[i]\r\n-            if line.startswith(self.caret):\r\n-                self.update_fastas(header, sequence)\r\n-                header = \'>\' + self.strip_header(line)\r\n-                sequence = \'\'\r\n-            else:\r\n-                sequence += line.strip(\'\\n \')\r\n-            i += 1\r\n-\r\n-        # after reading whole file, header & sequence buffers might be full\r\n-        self.update_fastas(header, sequence)\r\n-\r\n-    def interpret_first_line(self):\r\n-        line = self.lines[0]\r\n-        if line.startswith(self.caret):\r\n-            header = \'>\' + self.strip_header(line)\r\n-            return header, \'\'\r\n         else:\r\n-            return \'\', line\r\n+            header = None\r\n+            sequence = None\r\n+            for line in self.lines:\r\n+                if line.startswith(self.read_caret):\r\n+                    if header:\r\n+                        self.update_fastas(header, sequence)\r\n+                    header = \'>\' + self.strip_header(line)\r\n+                    sequence = \'\'\r\n+                else:\r\n+                    sequence += line.strip(\'\\n \')\r\n+            self.update_fastas(header, sequence)\r\n \r\n     def strip_header(self, line):\r\n         """Strip characters escaped with underscores from pasted text."""\r\n@@ -77,10 +70,14 @@\n \r\n \r\n class FastaValidator:\r\n-    def __init__(self, fasta_list: List[Fasta]):\r\n+    def __init__(\r\n+            self,\r\n+            fasta_list: List[Fasta],\r\n+            min_length=None,\r\n+            max_length=None):\r\n+        self.min_length = min_length\r\n+        self.max_length = max_length\r\n         self.fasta_list = fasta_list\r\n-        self.min_length = 30\r\n-        self.max_length = 2000\r\n         self.iupac_characters = {\r\n             \'A\', \'B\', \'C\', \'D\', \'E\', \'F\', \'G\',\r\n             \'H\', \'I\', \'K\', \'L\', \'M\', \'N\', \'P\',\r\n@@ -93,68 +90,89 @@\n         self.validate_num_seqs()\r\n         self.validate_length()\r\n         self.validate_alphabet()\r\n+\r\n         # not checking for \'X\' nucleotides at the moment.\r\n         # alphafold can throw an error if it doesn\'t like it.\r\n-        #self.validate_x()\r\n+        # self.validate_x()\r\n \r\n     def validate_num_seqs(self) -> None:\r\n+        """Assert that only one sequence has been provided."""\r\n         if len(self.fasta_list) > 1:\r\n-            raise Exception(f\'Error encountered validating fasta: More than 1 sequence detected ({len(self.fasta_list)}). Please use single fasta sequence as input\')\r\n+            raise Exception(\r\n+                \'Error encountered validating fasta:\'\r\n+                f\' More than 1 sequence detected ({len(self.fasta_list)}).\'\r\n+                \' Please use single fasta sequence as input.\')\r\n         elif len(self.fasta_list) =='..b'ngth:\r\n+            if len(fasta.aa_seq) < self.min_length:\r\n+                raise Exception(\r\n+                    \'Error encountered validating fasta: Sequence too short\'\r\n+                    f\' ({len(fasta.aa_seq)}AA).\'\r\n+                    f\' Minimum length is {self.min_length}AA.\')\r\n+        if self.max_length:\r\n+            if len(fasta.aa_seq) > self.max_length:\r\n+                raise Exception(\r\n+                    \'Error encountered validating fasta:\'\r\n+                    f\' Sequence too long ({len(fasta.aa_seq)}AA).\'\r\n+                    f\' Maximum length is {self.max_length}AA.\')\r\n \r\n     def validate_alphabet(self):\r\n         """\r\n-        Confirms whether the sequence conforms to IUPAC codes.\r\n-        If not, reports the offending character and its position.\r\n+        Confirm whether the sequence conforms to IUPAC codes.\r\n+        If not, report the offending character and its position.\r\n         """\r\n         fasta = self.fasta_list[0]\r\n         for i, char in enumerate(fasta.aa_seq.upper()):\r\n             if char not in self.iupac_characters:\r\n-                raise Exception(f\'Error encountered validating fasta: Invalid amino acid found at pos {i}: "{char}"\')\r\n+                raise Exception(\r\n+                    \'Error encountered validating fasta: Invalid amino acid\'\r\n+                    f\' found at pos {i}: "{char}"\')\r\n \r\n     def validate_x(self):\r\n-        """checks if any bases are X. TODO check whether alphafold accepts X bases. """\r\n+        """Check for X bases."""\r\n         fasta = self.fasta_list[0]\r\n         for i, char in enumerate(fasta.aa_seq.upper()):\r\n             if char == \'X\':\r\n-                raise Exception(f\'Error encountered validating fasta: Unsupported aa code "X" found at pos {i}\')\r\n+                raise Exception(\r\n+                    \'Error encountered validating fasta: Unsupported AA code\'\r\n+                    f\' "X" found at pos {i}\')\r\n \r\n \r\n class FastaWriter:\r\n     def __init__(self) -> None:\r\n-        self.outfile = \'alphafold.fasta\'\r\n-        self.formatted_line_len = 60\r\n+        self.line_wrap = 60\r\n \r\n     def write(self, fasta: Fasta):\r\n-        with open(self.outfile, \'w\') as fp:\r\n-            header = fasta.header\r\n-            seq = self.format_sequence(fasta.aa_seq)\r\n-            fp.write(header + \'\\n\')\r\n-            fp.write(seq + \'\\n\')\r\n+        header = fasta.header\r\n+        seq = self.format_sequence(fasta.aa_seq)\r\n+        sys.stdout.write(header + \'\\n\')\r\n+        sys.stdout.write(seq)\r\n \r\n     def format_sequence(self, aa_seq: str):\r\n         formatted_seq = \'\'\r\n-        for i in range(0, len(aa_seq), self.formatted_line_len):\r\n-            formatted_seq += aa_seq[i: i + self.formatted_line_len] + \'\\n\'\r\n+        for i in range(0, len(aa_seq), self.line_wrap):\r\n+            formatted_seq += aa_seq[i: i + self.line_wrap] + \'\\n\'\r\n         return formatted_seq\r\n \r\n \r\n def main():\r\n     # load fasta file\r\n     args = parse_args()\r\n-    fas = FastaLoader(args.input_fasta)\r\n+    fas = FastaLoader(args.input)\r\n \r\n     # validate\r\n-    fv = FastaValidator(fas.fastas)\r\n+    fv = FastaValidator(\r\n+        fas.fastas,\r\n+        min_length=args.min_length,\r\n+        max_length=args.max_length,\r\n+    )\r\n     fv.validate()\r\n \r\n     # write cleaned version\r\n@@ -165,13 +183,26 @@\n def parse_args() -> argparse.Namespace:\r\n     parser = argparse.ArgumentParser()\r\n     parser.add_argument(\r\n-        "input_fasta",\r\n+        "input",\r\n         help="input fasta file",\r\n         type=str\r\n     )\r\n+    parser.add_argument(\r\n+        "--min_length",\r\n+        dest=\'min_length\',\r\n+        help="Minimum length of input protein sequence (AA)",\r\n+        default=None,\r\n+        type=int,\r\n+    )\r\n+    parser.add_argument(\r\n+        "--max_length",\r\n+        dest=\'max_length\',\r\n+        help="Maximum length of input protein sequence (AA)",\r\n+        default=None,\r\n+        type=int,\r\n+    )\r\n     return parser.parse_args()\r\n \r\n \r\n-\r\n if __name__ == \'__main__\':\r\n     main()\r\n'