Repository 'split_file_to_collection'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/split_file_to_collection

Changeset 5:e77b954f0da5 (2019-10-11)
Previous changeset 4:0850f2dfba13 (2019-10-09) Next changeset 6:d57735dd27b0 (2020-06-30)
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
modified:
split_file_to_collection.py
split_file_to_collection.xml
test-data/rand_0.fasta
test-data/rand_1.fasta
b
diff -r 0850f2dfba13 -r e77b954f0da5 split_file_to_collection.py
--- a/split_file_to_collection.py Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.py Fri Oct 11 18:24:43 2019 -0400
[
b'@@ -1,23 +1,24 @@\n #!/usr/bin/env python\n \n import argparse\n+import math\n import os\n import re\n import random\n-import math\n \n-\n-"""\n-regexes that indicate the *beginning* of a record\n-new file types can be added by appending to this dict,\n-updating the parser, and adding a new type option in the Galaxy wrapper\n-"""\n-FILETYPES = {\'fasta\': \'^>\',\n-             \'fastq\': \'^@\',\n-             \'tabular\': \'^.*\',\n-             \'txt\': \'^.*\',\n-             \'mgf\': \'^BEGIN IONS\',\n-             \'sdf\': \'\\$\\$\\$\\$\',\n+# configuration of the splitting for specific file types\n+# - regular expression matching the record separator (\'\' if not splitting by regex but by number of lines)\n+# - number of lines to split after (0 if not splitting by number of lines but regex)\n+# - a boolean indicating if the record separator is at the end of the record\n+#\n+# new file types can be added by appending to this dict,\n+# updating the parser, and adding a new type option in the Galaxy wrapper\n+FILETYPES = {\'fasta\': (\'^>\', 0,  False),\n+             \'fastq\': (\'\', 4, False),\n+             \'tabular\': (\'\', 1, False),\n+             \'txt\': (\'\', 1, False),\n+             \'mgf\': (\'^BEGIN IONS\', 0, False),\n+             \'sdf\': (\'\\$\\$\\$\\$\', 0, True),\n              }\n \n \n@@ -46,8 +47,8 @@\n         args["match"] = replace_mapped_chars(args["match"])\n         args["sub"] = replace_mapped_chars(args["sub"])\n         split_by_column(args, in_file, out_dir, top)\n-\n     else:\n+        args["generic_re"] = replace_mapped_chars(args["generic_re"])\n         split_by_record(args, in_file, out_dir, top, ftype)\n \n \n@@ -62,23 +63,26 @@\n                                                  " the extension of the new files (without a period)")\n     parser.add_argument(\'--ftype\', \'-f\', help="The type of the file to split", required = True,\n         choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])\n-  
  parser.add_argument(\'--generic_re\', \'-g\', help="Regular expression indicating the start of a new record (only for generic)", required = False)\n     parser.add_argument(\'--by\', \'-b\', help="Split by line or by column (tabular only)",\n         default = "row", choices = ["col", "row"])\n-    parser.add_argument(\'--top\', \'-t\', type=int, default=0, help="Number of header lines to carry over to new files. " +\n-                                                                 "(tabular only).")\n+    parser.add_argument(\'--top\', \'-t\', type=int, default=0, help="Number of header lines to carry over to new files.")\n     parser.add_argument(\'--rand\', \'-r\', help="Divide records randomly into new files", action=\'store_true\')\n     parser.add_argument(\'--seed\', \'-x\', help="Provide a seed for the random number generator. " +\n                                              "If not provided and args[\\"rand\\"]==True, then date is used", type=int)\n-    parser.add_argument(\'--numnew\', \'-n\', type=int, default = 1,\n+    group = parser.add_mutually_exclusive_group()\n+    group.add_argument(\'--numnew\', \'-n\', type=int, default = 1,\n                         help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")\n-    parser.add_argument(\'--chunksize\', \'-k\', type=int, default = 0,\n+    group.add_argument(\'--chunksize\', \'-k\', type=int, default = 0,\n                         help="Number of records by file. Not valid for splitting on a column")\n     parser.add_argument(\'--batch\', action=\'store_true\',\n                         help="Distribute files to collection while maintaining order. Ignored if splitting on column.")\n-    parser.add_argument(\'--split_after\', \'-p\', action=\'store_true\',\n-                        help="Split between records after separator (default is before)." 
+ \n-                         "Only for generic - specific ftypes are always split in the default way")\n+    generic = parser.add_argument_group(\'Arguments controling generic splitting\')\n+    group = generic.add_mutually_exclusive_group()\n+    group.add_'..b'i + 1\n-        if top:\n-            n_records -= top  # don\'t count the top lines\n-        \n+                if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):\n+                    n_records += 1\n+                    last_line_matched = True\n+                else:\n+                    last_line_matched = False\n+            if sep_at_end and not last_line_matched:\n+                n_records += 1\n+\n+        # if there are fewer records than desired files\n+        numnew = min(numnew, n_records)\n+        # approx. number of records per file\n         if chunksize == 0: # i.e. no chunking\n-            # approx. number of lines per file\n             n_per_file = n_records // numnew\n         else:\n-            # approx. 
number of lines per file\n             numnew = n_records // chunksize\n             n_per_file = chunksize\n \n-\n-\n-\n     # make new files\n     # strip extension of old file and add number\n     custom_new_file_name = args["file_names"]\n@@ -161,34 +169,32 @@\n         open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")\n         for count in range(0, numnew)\n     ]\n-\n     # bunch o\' counters\n     # index to list of new files\n-    new_file_counter = 0\n-\n-    # used for top\n-    # number of lines read so far\n-    n_read = 0\n+    if rand:\n+        new_file_counter = int(math.floor(random.random() * numnew))\n+    else:\n+        new_file_counter = 0\n     # to contain header specified by top\n     header = ""\n     # keep track of the files that have been opened so far\n-    fresh_files = {i for i in range(0, numnew)}\n+    fresh_files = set(range(numnew))\n \n     # keep track in loop of number of records in each file\n     # only used in batch\n     records_in_file = 0\n \n     # open file\n-    with open(in_file, "r") as file:\n+    with open(in_file, "r") as f:\n+        # read header\n+        for i in range(top):\n+            header += f.readline()\n+\n         record = ""\n-        for line in file:\n-            n_read += 1\n-            if n_read <= top:\n-                header += line\n-                continue\n+        for line_no, line in enumerate(f):\n             # check if beginning of line is record sep\n             # if beginning of line is record sep, either start record or finish one\n-            if re.match(sep, line) is not None:\n+            if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):\n                 # this only happens first time through\n                 if record == "":\n                     record += line\n@@ -198,18 +204,15 @@\n                         newfiles[new_file_counter].write(header)\n                         
fresh_files.remove(new_file_counter)\n                     \n-                    if ftype != "sdf" and args["split_after"] == False:\n-                        # write record to file\n-                        newfiles[new_file_counter].write(record)\n-\n-                        # if not the first time through, we assign the new record\n+                    if sep_at_end:\n+                        record += line\n+                    # write record to file\n+                    newfiles[new_file_counter].write(record)\n+                    if not sep_at_end:\n                         record = line\n-                                                \n-                    else:  # for sdf we want to write the line to the record before starting a new one\n-                        record += line\n-                        newfiles[new_file_counter].write(record)\n+                    else:\n                         record = ""\n-                        \n+\n                     # change destination file\n                     if rand:\n                         new_file_counter = int(math.floor(random.random() * numnew))\n@@ -229,6 +232,7 @@\n                 record += line\n         # after loop, write final record to file\n         newfiles[new_file_counter].write(record)\n+\n     # close new files\n     close_files(newfiles)\n \n'
b
diff -r 0850f2dfba13 -r e77b954f0da5 split_file_to_collection.xml
--- a/split_file_to_collection.xml Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.xml Fri Oct 11 18:24:43 2019 -0400
[
b'@@ -1,4 +1,4 @@\n-<tool id="split_file_to_collection" name="Split file" version="0.3.0">\n+<tool id="split_file_to_collection" name="Split file" version="0.4.0">\n     <description>to dataset collection</description>\n     <macros>\n         <xml name="regex_sanitizer">\n@@ -77,9 +77,13 @@\n                 #end if\n             #else\n                 #if $split_parms.select_ftype == "generic"\n-                    --generic_re \'$split_parms.generic_regex\'\n-                    #if $split_parms.split_after == \'true\':\n-                        --split_after\n+                    #if $split_parms.split_method.select_split_method == "regex"\n+                        --generic_re \'$split_parms.split_method.generic_regex\'\n+                        #if $split_parms.split_method.split_after == \'true\':\n+                            --split_after\n+                        #end if\n+                    #else\n+                        --generic_num $split_parms.split_method.record_length\n                     #end if\n                 #end if\n                 #if $split_parms.select_mode.mode == "numnew":\n@@ -163,14 +167,25 @@\n             </when>\n             <when value="generic">\n                 <param name="input" type="data" format="txt" label="File to split"/>\n-                <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">\n-                    <expand macro="regex_sanitizer"/>\n-                </param>\n+                <conditional name="split_method">\n+                    <param name="select_split_method" type="select" label="Method to split files">\n+                        <option value="regex">Specify record separator as regular expression</option>\n+                        <option value="number">Specify number of lines after which a record ends</option>\n+                    </param>\n+                    <when value="regex">\n+                        <param name="generic_regex" type="text" 
label="Regex to match record separator" value="^.*">\n+                            <expand macro="regex_sanitizer"/>\n+                        </param>\n+                        <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">\n+                            <option value="false" selected="true">Before</option>\n+                            <option value="true">After</option>\n+                        </param>\n+                    </when>\n+                    <when value="number">\n+                        <param name="record_length" type="integer" value="1" label="Record length" help="The number of lines after which each record ends"/>\n+                    </when>\n+                </conditional>\n                 <expand macro="numnew_fname"/>\n-                <param name="split_after" type="select" value="false" label="Split records before or after the separator?" 
help="If before, the separator will appear at the start of each record; if after, at the end">\n-                    <option value="false" selected="true">Before</option>\n-                    <option value="true">After</option>\n-                </param>\n             </when>\n         </conditional>\n     </inputs>\n@@ -205,6 +220,7 @@\n         </collection>\n     </outputs>\n     <tests>\n+        <!-- 1 -->\n         <test>\n             <param name="input" value="test.tabular" ftype="tabular"/>\n             <param name="select_ftype" value="tabular"/>\n@@ -219,6 +235,7 @@\n                 <element name="foo3.tab" file="foo3.tab" ftype="tabular"/>\n             </output_collection>\n         </test>\n+        <!-- 2 -->\n         <test>\n             <param name="input" value="test.tabular" ftype="tabular"/>\n             <param name="select_ftype" value="tabular"/>\n@@ -232,6 +249,7 @@\n                 <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/>\n             </output_collection>\n         </test>\n+  '..b'           <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/>\n             </output_collection>\n         </test>\n+        <!-- generic-regex splitting (of a fasta) w random assignment -->\n         <test>\n             <param name="input" value="test.fasta" ftype="fasta"/>\n             <param name="select_ftype" value="generic"/>\n+            <param name="select_split_method" value="regex"/>\n             <param name="generic_regex" value="^>.*"/>\n             <param name="mode" value="numnew"/>\n             <param name="numnew" value="2"/>\n@@ -417,6 +465,7 @@\n                 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>\n             </output_collection>\n         </test>\n+        <!-- sdf + specify desired number of files -->\n         <test>\n             <param name="input" value="3_molecules.sdf" ftype="sdf"/>\n             <param name="select_ftype" 
value="sdf"/>\n@@ -430,6 +479,7 @@\n                 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>\n             </output_collection>\n         </test>\n+        <!-- sdf + specify desired number of records per file (chunksize) -->\n         <test>\n             <param name="input" value="3_molecules.sdf" ftype="sdf"/>\n             <param name="select_ftype" value="sdf"/>\n@@ -443,10 +493,12 @@\n                 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>\n             </output_collection>\n         </test>\n+        <!-- test split_after (by splitting fasta files after non-header lines) -->\n         <test>\n             <param name="input" value="test.fasta" ftype="fasta"/>\n             <param name="select_ftype" value="generic"/>\n-            <param name="generic_regex" value="^>.*"/>\n+            <param name="select_split_method" value="regex"/>\n+            <param name="generic_regex" value="^[^>].*"/>\n             <param name="split_after" value="true"/>\n             <param name="mode" value="numnew"/>\n             <param name="numnew" value="2"/>\n@@ -454,7 +506,8 @@\n             <param name="allocate" value="random"/>\n             <param name="seed" value="1010"/>\n             <output_collection name="list_output_generic" type="list">\n-                <element name="rand_000001" file="split_after.fasta" ftype="fasta"/>\n+                <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/>\n+                <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>\n             </output_collection>\n         </test>\n     </tests>\n@@ -463,10 +516,11 @@\n \n This tool splits a data set consisting of records into multiple data sets within a collection.\n A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence\n-(headers + sequence + qualities), etc. 
The important property is that the beginning of a new record\n-can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.\n-The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF.\n-For other data types the text delimiting records can be specified manually using the generic splitter. \n+(headers + sequence + qualities), etc. The important property is that the records either have a \n+specific length (e.g. 4 lines for FASTQ) or that the beginning/end of a new record\n+can be specified by a regular expression, e.g. ".*" for lines or ">.*" for FASTA.\n+The tool has presets for text, tabular data sets (which are split after each line), FASTA (new records start with ">.*"), FASTQ (records consist of 4 lines), SDF (records end with "^$$$$") and MGF (records start with "^BEGIN IONS").\n+For other data types the text delimiting records or the number of lines making up a record can be specified manually using the generic splitter. \n If the generic splitter is used, an option is also available to split records either before or after the\n separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all\n others).\n'
b
diff -r 0850f2dfba13 -r e77b954f0da5 test-data/rand_0.fasta
--- a/test-data/rand_0.fasta Wed Oct 09 07:34:49 2019 -0400
+++ b/test-data/rand_0.fasta Fri Oct 11 18:24:43 2019 -0400
b
@@ -1,5 +1,5 @@
->seq1
-PROTEIN0
+>seq3
+ANOTHERPROTEIN
 >seq4
 ASFWEFOIN
 >seq5
b
diff -r 0850f2dfba13 -r e77b954f0da5 test-data/rand_1.fasta
--- a/test-data/rand_1.fasta Wed Oct 09 07:34:49 2019 -0400
+++ b/test-data/rand_1.fasta Fri Oct 11 18:24:43 2019 -0400
b
@@ -1,4 +1,4 @@
+>seq1
+PROTEIN0
 >seq2
 PROTEIN
->seq3
-ANOTHERPROTEIN