Previous changeset 7:0046692724f9 (2020-07-10) Next changeset 9:baabc30154cd (2023-11-23) |
Commit message:
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty" |
modified:
split_file_to_collection.py split_file_to_collection.xml |
b |
diff -r 0046692724f9 -r 6cbe2f30c2d7 split_file_to_collection.py --- a/split_file_to_collection.py Fri Jul 10 13:41:00 2020 -0400 +++ b/split_file_to_collection.py Sun Jul 12 10:27:06 2020 -0400 |
[ |
b'@@ -3,8 +3,8 @@\n import argparse\n import math\n import os\n+import random\n import re\n-import random\n \n # configuration of the splitting for specific file types\n # - regular expression matching the record separator (\'\' if not splitting by regex but by number of lines)\n@@ -13,12 +13,12 @@\n #\n # new file types can be added by appending to this dict,\n # updating the parser, and adding a new type option in the Galaxy wrapper\n-FILETYPES = {\'fasta\': (\'^>\', 0, False),\n- \'fastq\': (\'\', 4, False),\n- \'tabular\': (\'\', 1, False),\n- \'txt\': (\'\', 1, False),\n- \'mgf\': (\'^BEGIN IONS\', 0, False),\n- \'sdf\': (\'\\$\\$\\$\\$\', 0, True),\n+FILETYPES = {\'fasta\': (r\'^>\', 0, False),\n+ \'fastq\': (r\'\', 4, False),\n+ \'tabular\': (r\'\', 1, False),\n+ \'txt\': (r\'\', 1, False),\n+ \'mgf\': (r\'^BEGIN IONS\', 0, False),\n+ \'sdf\': (r\'\\$\\$\\$\\$\', 0, True),\n }\n \n \n@@ -41,7 +41,7 @@\n \n ftype = args["ftype"]\n \n- assert ftype != "generic" or args["generic_re"] != None, "--generic_re needs to be given for generic input"\n+ assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input"\n \n if args["ftype"] == "tabular" and args["by"] == "col":\n args["match"] = replace_mapped_chars(args["match"])\n@@ -61,43 +61,37 @@\n parser.add_argument(\'--file_names\', \'-a\', help="If not splitting by column, the base name of the new files")\n parser.add_argument(\'--file_ext\', \'-e\', help="If not splitting by column," +\n " the extension of the new files (without a period)")\n- parser.add_argument(\'--ftype\', \'-f\', help="The type of the file to split", required = True,\n- choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])\n+ parser.add_argument(\'--ftype\', \'-f\', help="The type of the file to split", required=True,\n+ choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])\n parser.add_argument(\'--by\', \'-b\', help="Split by line or by column (tabular only)",\n- default = "row", choices = ["col", "row"])\n+ default="row", choices=["col", "row"])\n parser.add_argument(\'--top\', \'-t\', type=int, default=0, help="Number of header lines to carry over to new files.")\n parser.add_argument(\'--rand\', \'-r\', help="Divide records randomly into new files", action=\'store_true\')\n parser.add_argument(\'--seed\', \'-x\', help="Provide a seed for the random number generator. " +\n "If not provided and args[\\"rand\\"]==True, then date is used", type=int)\n group = parser.add_mutually_exclusive_group()\n- group.add_argument(\'--numnew\', \'-n\', type=int, default = 1,\n- help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")\n- group.add_argument(\'--chunksize\', \'-k\', type=int, default = 0,\n- help="Number of records by file. Not valid for splitting on a column")\n+ group.add_argument(\'--numnew\', \'-n\', type=int, default=1,\n+ help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")\n+ group.add_argument(\'--chunksize\', \'-k\', type=int, default=0,\n+ help="Number of records by file. Not valid for splitting on a column")\n parser.add_argument(\'--batch\', action=\'store_true\',\n help="Distribute files to collection while maintaining order. Ignored if splitting on column.")\n generic = parser.add_argument_group(\'Arguments controling generic splitting\')\n group = generic.add_mutually_exclusive_group()\n- group.add_argument(\'--generic_re\', \'-g\', default="", help="Regular expression indicating the start of a new record (only for generic)", required = F'..b'se:\n new_file_base = [custom_new_file_name, custom_new_file_ext]\n \n- newfiles = [\n- open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")\n- for count in range(0, numnew)\n- ]\n+ newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)]\n # bunch o\' counters\n # index to list of new files\n if rand:\n new_file_counter = int(math.floor(random.random() * numnew))\n else:\n new_file_counter = 0\n+ new_file = open(newfile_names[new_file_counter], "a")\n # to contain header specified by top\n header = ""\n # keep track of the files that have been opened so far\n@@ -201,13 +193,13 @@\n else:\n # if is in fresh_files, write header and drop from freshFiles\n if new_file_counter in fresh_files:\n- newfiles[new_file_counter].write(header)\n+ new_file.write(header)\n fresh_files.remove(new_file_counter)\n- \n+\n if sep_at_end:\n record += line\n # write record to file\n- newfiles[new_file_counter].write(record)\n+ new_file.write(record)\n if not sep_at_end:\n record = line\n else:\n@@ -216,6 +208,8 @@\n # change destination file\n if rand:\n new_file_counter = int(math.floor(random.random() * numnew))\n+ new_file.close()\n+ new_file = open(newfile_names[new_file_counter], "a")\n elif batch:\n # number of records read per file\n records_in_file += 1\n@@ -224,17 +218,19 @@\n if records_in_file >= n_per_file:\n new_file_counter = (new_file_counter + 1) % numnew\n records_in_file = 0 # reset to 0\n+ new_file.close()\n+ new_file = open(newfile_names[new_file_counter], "a")\n else:\n new_file_counter = (new_file_counter + 1) % numnew\n+ new_file.close()\n+ new_file = open(newfile_names[new_file_counter], "a")\n # if beginning of line is not record sep, we must be inside a record\n # so just append\n else:\n record += line\n # after loop, write final record to file\n- newfiles[new_file_counter].write(record)\n-\n- # close new files\n- close_files(newfiles)\n+ new_file.write(record)\n+ new_file.close()\n \n \n def split_by_column(args, in_file, out_dir, top):\n@@ -251,7 +247,7 @@\n sub = args["sub"]\n \n # set of file names\n- new_files = dict()\n+ files = set()\n \n # keep track of how many lines have been read\n n_read = 0\n@@ -274,19 +270,11 @@\n out_file_path = os.path.join(out_dir, out_file_name)\n \n # write\n- if out_file_name not in new_files.keys():\n- # open file (new, so not already open)\n- current_new_file = open(out_file_path, "w")\n- current_new_file.write(header)\n+ with open(out_file_path, "a") as current_new_file:\n+ if out_file_name not in files:\n+ current_new_file.write(header)\n+ files.add(out_file_name)\n current_new_file.write(line)\n- # add to dict\n- new_files[out_file_name] = current_new_file\n- else:\n- # file is already open, so just write to it\n- new_files[out_file_name].write(line)\n-\n- # finally, close all files\n- close_files(new_files.values())\n \n \n if __name__ == "__main__":\n' |
b |
diff -r 0046692724f9 -r 6cbe2f30c2d7 split_file_to_collection.xml --- a/split_file_to_collection.xml Fri Jul 10 13:41:00 2020 -0400 +++ b/split_file_to_collection.xml Sun Jul 12 10:27:06 2020 -0400 |
b |
@@ -1,4 +1,4 @@ -<tool id="split_file_to_collection" name="Split file" version="0.4.0"> +<tool id="split_file_to_collection" name="Split file" version="0.5.0"> <description>to dataset collection</description> <macros> <xml name="regex_sanitizer"> |