Mercurial > repos > bgruening > split_file_to_collection
changeset 9:baabc30154cd draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 37305ce5e6ef1b91f6058eff5b128064f76fbe45
author | bgruening |
---|---|
date | Thu, 23 Nov 2023 20:02:01 +0000 |
parents | 6cbe2f30c2d7 |
children | 2dae863c8f42 |
files | split_file_to_collection.py split_file_to_collection.xml |
diffstat | 2 files changed, 153 insertions(+), 53 deletions(-) [+] |
line wrap: on
line diff
--- a/split_file_to_collection.py Sun Jul 12 10:27:06 2020 -0400 +++ b/split_file_to_collection.py Thu Nov 23 20:02:01 2023 +0000 @@ -13,13 +13,14 @@ # # new file types can be added by appending to this dict, # updating the parser, and adding a new type option in the Galaxy wrapper -FILETYPES = {'fasta': (r'^>', 0, False), - 'fastq': (r'', 4, False), - 'tabular': (r'', 1, False), - 'txt': (r'', 1, False), - 'mgf': (r'^BEGIN IONS', 0, False), - 'sdf': (r'\$\$\$\$', 0, True), - } +FILETYPES = { + "fasta": (r"^>", 0, False), + "fastq": (r"", 4, False), + "tabular": (r"", 1, False), + "txt": (r"", 1, False), + "mgf": (r"^BEGIN IONS", 0, False), + "sdf": (r"\$\$\$\$", 0, True), +} def main(): @@ -29,11 +30,11 @@ # get args and validate in_file = args["in"] if not os.path.isfile(args["in"]): - raise FileNotFoundError('Input file does not exist') + raise FileNotFoundError("Input file does not exist") out_dir = args["out_dir"] if not os.path.isdir(args["out_dir"]): - raise FileNotFoundError('out_dir is not a directory') + raise FileNotFoundError("out_dir is not a directory") top = args["top"] if top < 0: @@ -41,7 +42,9 @@ ftype = args["ftype"] - assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input" + assert ( + ftype != "generic" or args["generic_re"] is not None + ), "--generic_re needs to be given for generic input" if args["ftype"] == "tabular" and args["by"] == "col": args["match"] = replace_mapped_chars(args["match"]) @@ -53,42 +56,127 @@ def parser_cli(): - parser = argparse.ArgumentParser(description="split a file into multiple files. " + - "Can split on the column of a tabular file, " + - "with custom and useful names based on column value.") - parser.add_argument('--in', '-i', required=True, help="The input file") - parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) - parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") - parser.add_argument('--file_ext', '-e', help="If not splitting by column," + - " the extension of the new files (without a period)") - parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True, - choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) - parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", - default="row", choices=["col", "row"]) - parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") - parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') - parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + - "If not provided and args[\"rand\"]==True, then date is used", type=int) + parser = argparse.ArgumentParser( + description="split a file into multiple files. " + + "Can split on the column of a tabular file, " + + "with custom and useful names based on column value." + ) + parser.add_argument("--in", "-i", required=True, help="The input file") + parser.add_argument( + "--out_dir", + "-o", + default=os.getcwd(), + help="The output directory", + required=True, + ) + parser.add_argument( + "--file_names", + "-a", + help="If not splitting by column, the base name of the new files", + ) + parser.add_argument( + "--file_ext", + "-e", + help="If not splitting by column," + + " the extension of the new files (without a period)", + ) + parser.add_argument( + "--ftype", + "-f", + help="The type of the file to split", + required=True, + choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"], + ) + parser.add_argument( + "--by", + "-b", + help="Split by line or by column (tabular only)", + default="row", + choices=["col", "row"], + ) + parser.add_argument( + "--top", + "-t", + type=int, + default=0, + help="Number of header lines to carry over to new files.", + ) + parser.add_argument( + "--rand", + "-r", + help="Divide records randomly into new files", + action="store_true", + ) + parser.add_argument( + "--seed", + "-x", + help="Provide a seed for the random number generator. " + + 'If not provided and args["rand"]==True, then date is used', + type=int, + ) group = parser.add_mutually_exclusive_group() - group.add_argument('--numnew', '-n', type=int, default=1, - help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") - group.add_argument('--chunksize', '-k', type=int, default=0, - help="Number of records by file. Not valid for splitting on a column") - parser.add_argument('--batch', action='store_true', - help="Distribute files to collection while maintaining order. Ignored if splitting on column.") - generic = parser.add_argument_group('Arguments controling generic splitting') + group.add_argument( + "--numnew", + "-n", + type=int, + default=1, + help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.", + ) + group.add_argument( + "--chunksize", + "-k", + type=int, + default=0, + help="Number of records by file. Not valid for splitting on a column", + ) + parser.add_argument( + "--batch", + action="store_true", + help="Distribute files to collection while maintaining order. Ignored if splitting on column.", + ) + generic = parser.add_argument_group("Arguments controling generic splitting") group = generic.add_mutually_exclusive_group() - group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False) - group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False) - generic.add_argument('--split_after', '-p', action='store_true', - help="Split between records after separator (default is before). " + - "Only for generic splitting by regex - specific ftypes are always split in the default way") - bycol = parser.add_argument_group('If splitting on a column') - bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries") - bycol.add_argument('--sub', '-s', default=r'\1', - help="The regular expression to substitute in for the matched pattern.") - bycol.add_argument('--id_column', '-c', default="1", - help="Column that is used to name output files. Indexed starting from 1.", type=int) + group.add_argument( + "--generic_re", + "-g", + default="", + help="Regular expression indicating the start of a new record (only for generic)", + required=False, + ) + group.add_argument( + "--generic_num", + type=int, + default=0, + help="Length of records in number of lines (only for generic)", + required=False, + ) + generic.add_argument( + "--split_after", + "-p", + action="store_true", + help="Split between records after separator (default is before). " + + "Only for generic splitting by regex - specific ftypes are always split in the default way", + ) + bycol = parser.add_argument_group("If splitting on a column") + bycol.add_argument( + "--match", + "-m", + default="(.*)", + help="The regular expression to match id column entries", + ) + bycol.add_argument( + "--sub", + "-s", + default=r"\1", + help="The regular expression to substitute in for the matched pattern.", + ) + bycol.add_argument( + "--id_column", + "-c", + default="1", + help="Column that is used to name output files. Indexed starting from 1.", + type=int, + ) return parser @@ -96,7 +184,7 @@ """ handles special escaped characters when coming from galaxy """ - mapped_chars = {'\'': '__sq__', '\\': '__backslash__'} + mapped_chars = {"'": "__sq__", "\\": "__backslash__"} for key, value in mapped_chars.items(): pattern = pattern.replace(value, key) return pattern @@ -104,7 +192,9 @@ def split_by_record(args, in_file, out_dir, top, ftype): # get configuration (record separator, start at end) for given filetype - sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"])) + sep, num, sep_at_end = FILETYPES.get( + ftype, (args["generic_re"], args["generic_num"], args["split_after"]) + ) sep = re.compile(sep) chunksize = args["chunksize"] @@ -126,14 +216,19 @@ # (done always, even if used only for batch mode) # - if the separator is a the start / end of the record n_per_file = math.inf - if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected + if ( + chunksize != 0 or batch + ): # needs to be calculated if either batch or chunksize are selected with open(in_file) as f: # read header lines for i in range(top): f.readline() n_records = 0 + last_line_matched = False for line in f: - if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0): + if (num == 0 and re.match(sep, line) is not None) or ( + num > 0 and n_records % num == 0 + ): n_records += 1 last_line_matched = True else: @@ -147,7 +242,7 @@ if chunksize == 0: # i.e. no chunking n_per_file = n_records // numnew else: - numnew = n_records // chunksize + numnew = max(n_records // chunksize, 1) # should not be less than 1 n_per_file = chunksize # make new files @@ -159,7 +254,10 @@ else: new_file_base = [custom_new_file_name, custom_new_file_ext] - newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)] + newfile_names = [ + os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) + for count in range(0, numnew) + ] # bunch o' counters # index to list of new files if rand: @@ -186,7 +284,9 @@ for line_no, line in enumerate(f): # check if beginning of line is record sep # if beginning of line is record sep, either start record or finish one - if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0): + if (num == 0 and re.match(sep, line) is not None) or ( + num > 0 and line_no % num == 0 + ): # this only happens first time through if record == "": record += line @@ -260,7 +360,7 @@ header += line continue # split into columns, on tab - fields = re.split(r'\t', line.strip('\n')) + fields = re.split(r"\t", line.strip("\n")) # get id column value id_col_val = fields[id_col]
--- a/split_file_to_collection.xml Sun Jul 12 10:27:06 2020 -0400 +++ b/split_file_to_collection.xml Thu Nov 23 20:02:01 2023 +0000 @@ -1,4 +1,4 @@ -<tool id="split_file_to_collection" name="Split file" version="0.5.0"> +<tool id="split_file_to_collection" name="Split file" version="0.5.1"> <description>to dataset collection</description> <macros> <xml name="regex_sanitizer">