diff split_file_to_collection.py @ 5:e77b954f0da5 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
| author   | bgruening                       |
|----------|---------------------------------|
| date     | Fri, 11 Oct 2019 18:24:43 -0400 |
| parents  | 0850f2dfba13                    |
| children | d57735dd27b0                    |
```diff
--- a/split_file_to_collection.py	Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.py	Fri Oct 11 18:24:43 2019 -0400
@@ -1,23 +1,24 @@
 #!/usr/bin/env python
 
 import argparse
+import math
 import os
 import re
 import random
-import math
-
-"""
-regexes that indicate the *beginning* of a record
-new file types can be added by appending to this dict,
-updating the parser, and adding a new type option in the Galaxy wrapper
-"""
-FILETYPES = {'fasta': '^>',
-             'fastq': '^@',
-             'tabular': '^.*',
-             'txt': '^.*',
-             'mgf': '^BEGIN IONS',
-             'sdf': '\$\$\$\$',
+# configuration of the splitting for specific file types
+# - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
+# - number of lines to split after (0 if not splitting by number of lines but regex)
+# - a boolean indicating if the record separator is at the end of the record
+#
+# new file types can be added by appending to this dict,
+# updating the parser, and adding a new type option in the Galaxy wrapper
+FILETYPES = {'fasta': ('^>', 0, False),
+             'fastq': ('', 4, False),
+             'tabular': ('', 1, False),
+             'txt': ('', 1, False),
+             'mgf': ('^BEGIN IONS', 0, False),
+             'sdf': ('\$\$\$\$', 0, True),
              }
@@ -46,8 +47,8 @@
         args["match"] = replace_mapped_chars(args["match"])
         args["sub"] = replace_mapped_chars(args["sub"])
         split_by_column(args, in_file, out_dir, top)
-
     else:
+        args["generic_re"] = replace_mapped_chars(args["generic_re"])
         split_by_record(args, in_file, out_dir, top, ftype)
@@ -62,23 +63,26 @@
                         " the extension of the new files (without a period)")
     parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
         choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
-    parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
     parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
         default = "row", choices = ["col", "row"])
-    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " +
-        "(tabular only).")
+    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
     parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
     parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
         "If not provided and args[\"rand\"]==True, then date is used", type=int)
-    parser.add_argument('--numnew', '-n', type=int, default = 1,
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--numnew', '-n', type=int, default = 1,
                         help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
-    parser.add_argument('--chunksize', '-k', type=int, default = 0,
+    group.add_argument('--chunksize', '-k', type=int, default = 0,
                         help="Number of records by file. Not valid for splitting on a column")
     parser.add_argument('--batch', action='store_true',
                         help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
-    parser.add_argument('--split_after', '-p', action='store_true',
-                        help="Split between records after separator (default is before)." +
-                        "Only for generic - specific ftypes are always split in the default way")
+    generic = parser.add_argument_group('Arguments controling generic splitting')
+    group = generic.add_mutually_exclusive_group()
+    group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False)
+    group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False)
+    generic.add_argument('--split_after', '-p', action='store_true',
+                        help="Split between records after separator (default is before). " +
+                        "Only for generic splitting by regex - specific ftypes are always split in the default way")
     bycol = parser.add_argument_group('If splitting on a column')
     bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
     bycol.add_argument('--sub', '-s', default = r'\1',
@@ -105,8 +109,9 @@
 
 
 def split_by_record(args, in_file, out_dir, top, ftype):
-    # get record separator for given filetype
-    sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
+    # get configuration (record separator, start at end) for given filetype
+    sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))
+    sep = re.compile(sep)
 
     chunksize = args["chunksize"]
     numnew = args["numnew"]
@@ -121,33 +126,36 @@
     # batched division (maintains order)
     batch = args["batch"]
-
+    # determine
+    # - the number of records that should be stored per file
+    #   (done always, even if used only for batch mode)
+    # - if the separator is a the start / end of the record
+    n_per_file = math.inf
     if chunksize != 0 or batch:  # needs to be calculated if either batch or chunksize are selected
-        # define n_per_file so we don't get a warning about ref before assignment
-        n_per_file = math.inf
-
-        # number of records
         with open(in_file) as f:
-            i = 0
+            # read header lines
+            for i in range(top):
+                f.readline()
+            n_records = 0
             for line in f:
-                if re.match(sep, line) is not None:
-                    i+=1
-            n_records = i + 1
-        if top:
-            n_records -= top  # don't count the top lines
-
+                if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):
+                    n_records += 1
+                    last_line_matched = True
+                else:
+                    last_line_matched = False
+            if sep_at_end and not last_line_matched:
+                n_records += 1
+
+        # if there are fewer records than desired files
+        numnew = min(numnew, n_records)
+        # approx. number of records per file
         if chunksize == 0:  # i.e. no chunking
-            # approx. number of lines per file
             n_per_file = n_records // numnew
         else:
-            # approx. number of lines per file
             numnew = n_records // chunksize
             n_per_file = chunksize
-
-
     # make new files
     # strip extension of old file and add number
     custom_new_file_name = args["file_names"]
@@ -161,34 +169,32 @@
         open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")
         for count in range(0, numnew)
     ]
-    # bunch o' counters
     # index to list of new files
-    new_file_counter = 0
-
-    # used for top
-    # number of lines read so far
-    n_read = 0
+    if rand:
+        new_file_counter = int(math.floor(random.random() * numnew))
+    else:
+        new_file_counter = 0
 
     # to contain header specified by top
    header = ""
 
     # keep track of the files that have been opened so far
-    fresh_files = {i for i in range(0, numnew)}
+    fresh_files = set(range(numnew))
 
     # keep track in loop of number of records in each file
     # only used in batch
     records_in_file = 0
 
     # open file
-    with open(in_file, "r") as file:
+    with open(in_file, "r") as f:
+        # read header
+        for i in range(top):
+            header += f.readline()
+
         record = ""
-        for line in file:
-            n_read += 1
-            if n_read <= top:
-                header += line
-                continue
+        for line_no, line in enumerate(f):
             # check if beginning of line is record sep
             # if beginning of line is record sep, either start record or finish one
-            if re.match(sep, line) is not None:
+            if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):
                 # this only happens first time through
                 if record == "":
                     record += line
@@ -198,18 +204,15 @@
                         newfiles[new_file_counter].write(header)
                         fresh_files.remove(new_file_counter)
 
-                    if ftype != "sdf" and args["split_after"] == False:
-                        # write record to file
-                        newfiles[new_file_counter].write(record)
-
-                        # if not the first time through, we assign the new record
+                    if sep_at_end:
+                        record += line
+                    # write record to file
+                    newfiles[new_file_counter].write(record)
+                    if not sep_at_end:
                         record = line
-
-                    else:  # for sdf we want to write the line to the record before starting a new one
-                        record += line
-                        newfiles[new_file_counter].write(record)
+                    else:
                         record = ""
-
+
                     # change destination file
                     if rand:
                         new_file_counter = int(math.floor(random.random() * numnew))
@@ -229,6 +232,7 @@
                 record += line
         # after loop, write final record to file
         newfiles[new_file_counter].write(record)
+
     # close new files
     close_files(newfiles)
```
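The heart of this change is the new `FILETYPES` contract: each file type now carries a triple of (record-separator regex, lines per record, separator-at-end flag) instead of a bare regex. That is what lets fastq switch from the fragile `^@` match to a fixed four-line record, and lets sdf drop its special-casing via the separator-at-end flag. The following standalone sketch shows how such a triple cuts a stream of lines into records; the `iter_records` helper and the sample data are invented here for illustration and are not part of the tool:

```python
import re

# Triples in the shape this commit introduces:
# (separator regex, lines per record, separator sits at the *end* of a record)
FASTA = ('^>', 0, False)        # a '>' line starts a record
SDF = (r'\$\$\$\$', 0, True)    # a '$$$$' line ends a record
FASTQ = ('', 4, False)          # fixed-size records of 4 lines


def iter_records(lines, config):
    """Yield one list of lines per record (illustrative helper)."""
    sep, num, sep_at_end = config
    sep = re.compile(sep)
    record = []
    for line_no, line in enumerate(lines):
        # same test as the tool's main loop: regex match when num == 0,
        # otherwise a fixed line count
        if (num == 0 and re.match(sep, line) is not None) or \
           (num > 0 and line_no % num == 0):
            if sep_at_end:
                record.append(line)   # separator closes the current record
                yield record
                record = []
                continue
            if record:                # separator opens the next record
                yield record
                record = []
        record.append(line)
    if record:
        yield record


fasta = [">a\n", "ACGT\n", ">b\n", "TTTT\n"]
assert list(iter_records(fasta, FASTA)) == [[">a\n", "ACGT\n"], [">b\n", "TTTT\n"]]

sdf_like = ["mol\n", "$$$$\n", "mol2\n", "$$$$\n"]
assert list(iter_records(sdf_like, SDF)) == [["mol\n", "$$$$\n"], ["mol2\n", "$$$$\n"]]
```

One detail worth checking: the record-counting loop in the diff tests `n_records % num == 0` where the main loop uses `line_no % num == 0`; for `num > 1` the former stops advancing after the first record, which looks like an inconsistency between the two loops.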
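The option parsing also changes shape: `--numnew`/`--chunksize`, and the new `--generic_re`/`--generic_num`, are now registered on `argparse` mutually exclusive groups, so supplying both members of a pair is rejected at parse time (the retained help text saying `--numnew` "will be ignored if both are set" reads as a leftover from the old behaviour). A stripped-down stand-in parser, reduced to one of the pairs, demonstrates the effect:

```python
import argparse

# Minimal stand-in for the tool's parser: only the two options that this
# commit places in a mutually exclusive group.
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument('--numnew', '-n', type=int, default=1)
group.add_argument('--chunksize', '-k', type=int, default=0)

print(parser.parse_args(['--numnew', '3']))       # numnew=3, chunksize left at 0
print(parser.parse_args(['--chunksize', '100']))  # chunksize=100, numnew left at 1

# Supplying both now exits with a parse error instead of silently
# ignoring --numnew:
# parser.parse_args(['--numnew', '3', '--chunksize', '100'])
# -> error: argument --chunksize/-k: not allowed with argument --numnew/-n
```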
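Concrete numbers (invented here) make the rewritten sizing block easier to follow: exactly one of `numnew` and `chunksize` drives the calculation, and the other is derived from the record count by floor division.

```python
import math

n_records = 10             # hypothetical record count
n_per_file = math.inf      # default, as in the commit

# --chunksize 4 (numnew left at its default): derive the file count
chunksize = 4
numnew = n_records // chunksize    # 10 // 4 -> 2 output files
n_per_file = chunksize             # 4 records per file

# --numnew 4 (chunksize left at 0): derive the records per file
numnew, chunksize = 4, 0
numnew = min(numnew, n_records)    # never more files than records
n_per_file = n_records // numnew   # 10 // 4 -> 2 records per file
```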