Repository 'split_file_to_collection'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/split_file_to_collection

Changeset 8:6cbe2f30c2d7 (2020-07-12)
Previous changeset: 7:0046692724f9 (2020-07-10)
Next changeset: 9:baabc30154cd (2023-11-23)
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
modified:
split_file_to_collection.py
split_file_to_collection.xml
b
diff -r 0046692724f9 -r 6cbe2f30c2d7 split_file_to_collection.py
--- a/split_file_to_collection.py Fri Jul 10 13:41:00 2020 -0400
+++ b/split_file_to_collection.py Sun Jul 12 10:27:06 2020 -0400
[
b'@@ -3,8 +3,8 @@\n import argparse\n import math\n import os\n+import random\n import re\n-import random\n \n # configuration of the splitting for specific file types\n # - regular expression matching the record separator (\'\' if not splitting by regex but by number of lines)\n@@ -13,12 +13,12 @@\n #\n # new file types can be added by appending to this dict,\n # updating the parser, and adding a new type option in the Galaxy wrapper\n-FILETYPES = {\'fasta\': (\'^>\', 0,  False),\n-             \'fastq\': (\'\', 4, False),\n-             \'tabular\': (\'\', 1, False),\n-             \'txt\': (\'\', 1, False),\n-             \'mgf\': (\'^BEGIN IONS\', 0, False),\n-             \'sdf\': (\'\\$\\$\\$\\$\', 0, True),\n+FILETYPES = {\'fasta\': (r\'^>\', 0,  False),\n+             \'fastq\': (r\'\', 4, False),\n+             \'tabular\': (r\'\', 1, False),\n+             \'txt\': (r\'\', 1, False),\n+             \'mgf\': (r\'^BEGIN IONS\', 0, False),\n+             \'sdf\': (r\'\\$\\$\\$\\$\', 0, True),\n              }\n \n \n@@ -41,7 +41,7 @@\n \n     ftype = args["ftype"]\n \n-    assert ftype != "generic" or args["generic_re"] != None, "--generic_re needs to be given for generic input"\n+    assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input"\n \n     if args["ftype"] == "tabular" and args["by"] == "col":\n         args["match"] = replace_mapped_chars(args["match"])\n@@ -61,43 +61,37 @@\n     parser.add_argument(\'--file_names\', \'-a\', help="If not splitting by column, the base name of the new files")\n     parser.add_argument(\'--file_ext\', \'-e\', help="If not splitting by column," +\n                                                  " the extension of the new files (without a period)")\n-    parser.add_argument(\'--ftype\', \'-f\', help="The type of the file to split", required = True,\n-        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])\n+    
parser.add_argument(\'--ftype\', \'-f\', help="The type of the file to split", required=True,\n+                        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])\n     parser.add_argument(\'--by\', \'-b\', help="Split by line or by column (tabular only)",\n-        default = "row", choices = ["col", "row"])\n+                        default="row", choices=["col", "row"])\n     parser.add_argument(\'--top\', \'-t\', type=int, default=0, help="Number of header lines to carry over to new files.")\n     parser.add_argument(\'--rand\', \'-r\', help="Divide records randomly into new files", action=\'store_true\')\n     parser.add_argument(\'--seed\', \'-x\', help="Provide a seed for the random number generator. " +\n                                              "If not provided and args[\\"rand\\"]==True, then date is used", type=int)\n     group = parser.add_mutually_exclusive_group()\n-    group.add_argument(\'--numnew\', \'-n\', type=int, default = 1,\n-                        help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")\n-    group.add_argument(\'--chunksize\', \'-k\', type=int, default = 0,\n-                        help="Number of records by file. Not valid for splitting on a column")\n+    group.add_argument(\'--numnew\', \'-n\', type=int, default=1,\n+                       help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")\n+    group.add_argument(\'--chunksize\', \'-k\', type=int, default=0,\n+                       help="Number of records by file. Not valid for splitting on a column")\n     parser.add_argument(\'--batch\', action=\'store_true\',\n                         help="Distribute files to collection while maintaining order. 
Ignored if splitting on column.")\n     generic = parser.add_argument_group(\'Arguments controling generic splitting\')\n     group = generic.add_mutually_exclusive_group()\n-    group.add_argument(\'--generic_re\', \'-g\', default="", help="Regular expression indicating the start of a new record (only for generic)", required = F'..b'se:\n         new_file_base = [custom_new_file_name, custom_new_file_ext]\n \n-    newfiles = [\n-        open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")\n-        for count in range(0, numnew)\n-    ]\n+    newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)]\n     # bunch o\' counters\n     # index to list of new files\n     if rand:\n         new_file_counter = int(math.floor(random.random() * numnew))\n     else:\n         new_file_counter = 0\n+    new_file = open(newfile_names[new_file_counter], "a")\n     # to contain header specified by top\n     header = ""\n     # keep track of the files that have been opened so far\n@@ -201,13 +193,13 @@\n                 else:\n                     # if is in fresh_files, write header and drop from freshFiles\n                     if new_file_counter in fresh_files:\n-                        newfiles[new_file_counter].write(header)\n+                        new_file.write(header)\n                         fresh_files.remove(new_file_counter)\n-                    \n+\n                     if sep_at_end:\n                         record += line\n                     # write record to file\n-                    newfiles[new_file_counter].write(record)\n+                    new_file.write(record)\n                     if not sep_at_end:\n                         record = line\n                     else:\n@@ -216,6 +208,8 @@\n                     # change destination file\n                     if rand:\n                         new_file_counter = 
int(math.floor(random.random() * numnew))\n+                        new_file.close()\n+                        new_file = open(newfile_names[new_file_counter], "a")\n                     elif batch:\n                         # number of records read per file\n                         records_in_file += 1\n@@ -224,17 +218,19 @@\n                         if records_in_file >= n_per_file:\n                             new_file_counter = (new_file_counter + 1) % numnew\n                             records_in_file = 0  # reset to 0\n+                            new_file.close()\n+                            new_file = open(newfile_names[new_file_counter], "a")\n                     else:\n                         new_file_counter = (new_file_counter + 1) % numnew\n+                        new_file.close()\n+                        new_file = open(newfile_names[new_file_counter], "a")\n             # if beginning of line is not record sep, we must be inside a record\n             # so just append\n             else:\n                 record += line\n         # after loop, write final record to file\n-        newfiles[new_file_counter].write(record)\n-\n-    # close new files\n-    close_files(newfiles)\n+        new_file.write(record)\n+        new_file.close()\n \n \n def split_by_column(args, in_file, out_dir, top):\n@@ -251,7 +247,7 @@\n     sub = args["sub"]\n \n     # set of file names\n-    new_files = dict()\n+    files = set()\n \n     # keep track of how many lines have been read\n     n_read = 0\n@@ -274,19 +270,11 @@\n             out_file_path = os.path.join(out_dir, out_file_name)\n \n             # write\n-            if out_file_name not in new_files.keys():\n-                # open file (new, so not already open)\n-                current_new_file = open(out_file_path, "w")\n-                current_new_file.write(header)\n+            with open(out_file_path, "a") as current_new_file:\n+                if out_file_name not in files:\n+                   
 current_new_file.write(header)\n+                    files.add(out_file_name)\n                 current_new_file.write(line)\n-                # add to dict\n-                new_files[out_file_name] = current_new_file\n-            else:\n-                # file is already open, so just write to it\n-                new_files[out_file_name].write(line)\n-\n-    # finally, close all files\n-    close_files(new_files.values())\n \n \n if __name__ == "__main__":\n'
b
diff -r 0046692724f9 -r 6cbe2f30c2d7 split_file_to_collection.xml
--- a/split_file_to_collection.xml Fri Jul 10 13:41:00 2020 -0400
+++ b/split_file_to_collection.xml Sun Jul 12 10:27:06 2020 -0400
b
@@ -1,4 +1,4 @@
-<tool id="split_file_to_collection" name="Split file" version="0.4.0">
+<tool id="split_file_to_collection" name="Split file" version="0.5.0">
     <description>to dataset collection</description>
     <macros>
         <xml name="regex_sanitizer">