Previous changeset 8:6cbe2f30c2d7 (2020-07-12) Next changeset 10:2dae863c8f42 (2024-05-23) |
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 37305ce5e6ef1b91f6058eff5b128064f76fbe45 |
modified:
split_file_to_collection.py split_file_to_collection.xml |
b |
diff -r 6cbe2f30c2d7 -r baabc30154cd split_file_to_collection.py --- a/split_file_to_collection.py Sun Jul 12 10:27:06 2020 -0400 +++ b/split_file_to_collection.py Thu Nov 23 20:02:01 2023 +0000 |
[ |
b'@@ -13,13 +13,14 @@\n #\n # new file types can be added by appending to this dict,\n # updating the parser, and adding a new type option in the Galaxy wrapper\n-FILETYPES = {\'fasta\': (r\'^>\', 0, False),\n- \'fastq\': (r\'\', 4, False),\n- \'tabular\': (r\'\', 1, False),\n- \'txt\': (r\'\', 1, False),\n- \'mgf\': (r\'^BEGIN IONS\', 0, False),\n- \'sdf\': (r\'\\$\\$\\$\\$\', 0, True),\n- }\n+FILETYPES = {\n+ "fasta": (r"^>", 0, False),\n+ "fastq": (r"", 4, False),\n+ "tabular": (r"", 1, False),\n+ "txt": (r"", 1, False),\n+ "mgf": (r"^BEGIN IONS", 0, False),\n+ "sdf": (r"\\$\\$\\$\\$", 0, True),\n+}\n \n \n def main():\n@@ -29,11 +30,11 @@\n # get args and validate\n in_file = args["in"]\n if not os.path.isfile(args["in"]):\n- raise FileNotFoundError(\'Input file does not exist\')\n+ raise FileNotFoundError("Input file does not exist")\n \n out_dir = args["out_dir"]\n if not os.path.isdir(args["out_dir"]):\n- raise FileNotFoundError(\'out_dir is not a directory\')\n+ raise FileNotFoundError("out_dir is not a directory")\n \n top = args["top"]\n if top < 0:\n@@ -41,7 +42,9 @@\n \n ftype = args["ftype"]\n \n- assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input"\n+ assert (\n+ ftype != "generic" or args["generic_re"] is not None\n+ ), "--generic_re needs to be given for generic input"\n \n if args["ftype"] == "tabular" and args["by"] == "col":\n args["match"] = replace_mapped_chars(args["match"])\n@@ -53,42 +56,127 @@\n \n \n def parser_cli():\n- parser = argparse.ArgumentParser(description="split a file into multiple files. " +\n- "Can split on the column of a tabular file, " +\n- "with custom and useful names based on column value.")\n- parser.add_argument(\'--in\', \'-i\', required=True, help="The input file")\n- parser.add_argument(\'--out_dir\', \'-o\', default=os.getcwd(), help="The output directory", required=True)\n- parser.add_argument(\'--file_names\', \'-a\', help="If not splitting by column, the base name of the new files")\n- parser.add_argument(\'--file_ext\', \'-e\', help="If not splitting by column," +\n- " the extension of the new files (without a period)")\n- parser.add_argument(\'--ftype\', \'-f\', help="The type of the file to split", required=True,\n- choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])\n- parser.add_argument(\'--by\', \'-b\', help="Split by line or by column (tabular only)",\n- default="row", choices=["col", "row"])\n- parser.add_argument(\'--top\', \'-t\', type=int, default=0, help="Number of header lines to carry over to new files.")\n- parser.add_argument(\'--rand\', \'-r\', help="Divide records randomly into new files", action=\'store_true\')\n- parser.add_argument(\'--seed\', \'-x\', help="Provide a seed for the random number generator. " +\n- "If not provided and args[\\"rand\\"]==True, then date is used", type=int)\n+ parser = argparse.ArgumentParser(\n+ description="split a file into multiple files. "\n+ + "Can split on the column of a tabular file, "\n+ + "with custom and useful names based on column value."\n+ )\n+ parser.add_argument("--in", "-i", required=True, help="The input file")\n+ parser.add_argument(\n+ "--out_dir",\n+ "-o",\n+ default=os.getcwd(),\n+ help="The output directory",\n+ required=True,\n+ )\n+ parser.add_argument(\n+ "--file_names",\n+ "-a",\n+ help="If not splitting by column, the base name of the new files",\n+ )\n+ parser.add_argument(\n+ "--file_ext",\n+ "-e",\n+ help="If not splitting by column,"\n+ + " the extension of the new files (without a period)",\n+ )\n+ parser.add_ar'..b'"If splitting on a column")\n+ bycol.add_argument(\n+ "--match",\n+ "-m",\n+ default="(.*)",\n+ help="The regular expression to match id column entries",\n+ )\n+ bycol.add_argument(\n+ "--sub",\n+ "-s",\n+ default=r"\\1",\n+ help="The regular expression to substitute in for the matched pattern.",\n+ )\n+ bycol.add_argument(\n+ "--id_column",\n+ "-c",\n+ default="1",\n+ help="Column that is used to name output files. Indexed starting from 1.",\n+ type=int,\n+ )\n return parser\n \n \n@@ -96,7 +184,7 @@\n """\n handles special escaped characters when coming from galaxy\n """\n- mapped_chars = {\'\\\'\': \'__sq__\', \'\\\\\': \'__backslash__\'}\n+ mapped_chars = {"\'": "__sq__", "\\\\": "__backslash__"}\n for key, value in mapped_chars.items():\n pattern = pattern.replace(value, key)\n return pattern\n@@ -104,7 +192,9 @@\n \n def split_by_record(args, in_file, out_dir, top, ftype):\n # get configuration (record separator, start at end) for given filetype\n- sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))\n+ sep, num, sep_at_end = FILETYPES.get(\n+ ftype, (args["generic_re"], args["generic_num"], args["split_after"])\n+ )\n sep = re.compile(sep)\n \n chunksize = args["chunksize"]\n@@ -126,14 +216,19 @@\n # (done always, even if used only for batch mode)\n # - if the separator is a the start / end of the record\n n_per_file = math.inf\n- if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected\n+ if (\n+ chunksize != 0 or batch\n+ ): # needs to be calculated if either batch or chunksize are selected\n with open(in_file) as f:\n # read header lines\n for i in range(top):\n f.readline()\n n_records = 0\n+ last_line_matched = False\n for line in f:\n- if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):\n+ if (num == 0 and re.match(sep, line) is not None) or (\n+ num > 0 and n_records % num == 0\n+ ):\n n_records += 1\n last_line_matched = True\n else:\n@@ -147,7 +242,7 @@\n if chunksize == 0: # i.e. no chunking\n n_per_file = n_records // numnew\n else:\n- numnew = n_records // chunksize\n+ numnew = max(n_records // chunksize, 1) # should not be less than 1\n n_per_file = chunksize\n \n # make new files\n@@ -159,7 +254,10 @@\n else:\n new_file_base = [custom_new_file_name, custom_new_file_ext]\n \n- newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)]\n+ newfile_names = [\n+ os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1]))\n+ for count in range(0, numnew)\n+ ]\n # bunch o\' counters\n # index to list of new files\n if rand:\n@@ -186,7 +284,9 @@\n for line_no, line in enumerate(f):\n # check if beginning of line is record sep\n # if beginning of line is record sep, either start record or finish one\n- if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):\n+ if (num == 0 and re.match(sep, line) is not None) or (\n+ num > 0 and line_no % num == 0\n+ ):\n # this only happens first time through\n if record == "":\n record += line\n@@ -260,7 +360,7 @@\n header += line\n continue\n # split into columns, on tab\n- fields = re.split(r\'\\t\', line.strip(\'\\n\'))\n+ fields = re.split(r"\\t", line.strip("\\n"))\n \n # get id column value\n id_col_val = fields[id_col]\n' |
b |
diff -r 6cbe2f30c2d7 -r baabc30154cd split_file_to_collection.xml --- a/split_file_to_collection.xml Sun Jul 12 10:27:06 2020 -0400 +++ b/split_file_to_collection.xml Thu Nov 23 20:02:01 2023 +0000 |
b |
@@ -1,4 +1,4 @@ -<tool id="split_file_to_collection" name="Split file" version="0.5.0"> +<tool id="split_file_to_collection" name="Split file" version="0.5.1"> <description>to dataset collection</description> <macros> <xml name="regex_sanitizer"> |