diff split_file_to_collection.py @ 5:e77b954f0da5 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
author bgruening
date Fri, 11 Oct 2019 18:24:43 -0400
parents 0850f2dfba13
children d57735dd27b0
line wrap: on
line diff
--- a/split_file_to_collection.py	Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.py	Fri Oct 11 18:24:43 2019 -0400
@@ -1,23 +1,24 @@
 #!/usr/bin/env python
 
 import argparse
+import math
 import os
 import re
 import random
-import math
 
-
-"""
-regexes that indicate the *beginning* of a record
-new file types can be added by appending to this dict,
-updating the parser, and adding a new type option in the Galaxy wrapper
-"""
-FILETYPES = {'fasta': '^>',
-             'fastq': '^@',
-             'tabular': '^.*',
-             'txt': '^.*',
-             'mgf': '^BEGIN IONS',
-             'sdf': '\$\$\$\$',
+# configuration of the splitting for specific file types
+# - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
+# - number of lines to split after (0 if not splitting by number of lines but regex)
+# - a boolean indicating if the record separator is at the end of the record
+#
+# new file types can be added by appending to this dict,
+# updating the parser, and adding a new type option in the Galaxy wrapper
+FILETYPES = {'fasta': ('^>', 0,  False),
+             'fastq': ('', 4, False),
+             'tabular': ('', 1, False),
+             'txt': ('', 1, False),
+             'mgf': ('^BEGIN IONS', 0, False),
+             'sdf': ('\$\$\$\$', 0, True),
              }
 
 
@@ -46,8 +47,8 @@
         args["match"] = replace_mapped_chars(args["match"])
         args["sub"] = replace_mapped_chars(args["sub"])
         split_by_column(args, in_file, out_dir, top)
-
     else:
+        args["generic_re"] = replace_mapped_chars(args["generic_re"])
         split_by_record(args, in_file, out_dir, top, ftype)
 
 
@@ -62,23 +63,26 @@
                                                  " the extension of the new files (without a period)")
     parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
         choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
-    parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
     parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
         default = "row", choices = ["col", "row"])
-    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " +
-                                                                 "(tabular only).")
+    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
     parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
     parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
                                              "If not provided and args[\"rand\"]==True, then date is used", type=int)
-    parser.add_argument('--numnew', '-n', type=int, default = 1,
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--numnew', '-n', type=int, default = 1,
                         help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
-    parser.add_argument('--chunksize', '-k', type=int, default = 0,
+    group.add_argument('--chunksize', '-k', type=int, default = 0,
                         help="Number of records by file. Not valid for splitting on a column")
     parser.add_argument('--batch', action='store_true',
                         help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
-    parser.add_argument('--split_after', '-p', action='store_true',
-                        help="Split between records after separator (default is before)." + 
-                         "Only for generic - specific ftypes are always split in the default way")
+    generic = parser.add_argument_group('Arguments controling generic splitting')
+    group = generic.add_mutually_exclusive_group()
+    group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False)
+    group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False)
+    generic.add_argument('--split_after', '-p', action='store_true',
+                        help="Split between records after separator (default is before). " + 
+                        "Only for generic splitting by regex - specific ftypes are always split in the default way")
     bycol = parser.add_argument_group('If splitting on a column')
     bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
     bycol.add_argument('--sub', '-s', default = r'\1',
@@ -105,8 +109,9 @@
 
 
 def split_by_record(args, in_file, out_dir, top, ftype):
-    # get record separator for given filetype
-    sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
+    # get configuration (record separator, number of lines, separator at end) for given filetype
+    sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))
+    sep = re.compile(sep)
 
     chunksize = args["chunksize"]
     numnew = args["numnew"]
@@ -121,33 +126,36 @@
 
     # batched division (maintains order)
     batch = args["batch"]
-
     
+    # determine
+    # - the number of records that should be stored per file
+    #   (done always, even if used only for batch mode)
+    # - whether the record separator is at the end of the record
+    n_per_file = math.inf
     if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
-        # define n_per_file so we don't get a warning about ref before assignment
-        n_per_file = math.inf
-
-        # number of records
         with open(in_file) as f:
-            i = 0
+            # read header lines
+            for i in range(top):
+                f.readline()
+            n_records = 0
             for line in f:
-                if re.match(sep, line) is not None:
-                    i+=1
-            n_records = i + 1
-        if top:
-            n_records -= top  # don't count the top lines
-        
+                if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):
+                    n_records += 1
+                    last_line_matched = True
+                else:
+                    last_line_matched = False
+            if sep_at_end and not last_line_matched:
+                n_records += 1
+
+        # if there are fewer records than desired files
+        numnew = min(numnew, n_records)
+        # approx. number of records per file
         if chunksize == 0: # i.e. no chunking
-            # approx. number of lines per file
             n_per_file = n_records // numnew
         else:
-            # approx. number of lines per file
             numnew = n_records // chunksize
             n_per_file = chunksize
 
-
-
-
     # make new files
     # strip extension of old file and add number
     custom_new_file_name = args["file_names"]
@@ -161,34 +169,32 @@
         open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")
         for count in range(0, numnew)
     ]
-
     # bunch o' counters
     # index to list of new files
-    new_file_counter = 0
-
-    # used for top
-    # number of lines read so far
-    n_read = 0
+    if rand:
+        new_file_counter = int(math.floor(random.random() * numnew))
+    else:
+        new_file_counter = 0
     # to contain header specified by top
     header = ""
     # keep track of the files that have been opened so far
-    fresh_files = {i for i in range(0, numnew)}
+    fresh_files = set(range(numnew))
 
     # keep track in loop of number of records in each file
     # only used in batch
     records_in_file = 0
 
     # open file
-    with open(in_file, "r") as file:
+    with open(in_file, "r") as f:
+        # read header
+        for i in range(top):
+            header += f.readline()
+
         record = ""
-        for line in file:
-            n_read += 1
-            if n_read <= top:
-                header += line
-                continue
+        for line_no, line in enumerate(f):
             # check if beginning of line is record sep
             # if beginning of line is record sep, either start record or finish one
-            if re.match(sep, line) is not None:
+            if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):
                 # this only happens first time through
                 if record == "":
                     record += line
@@ -198,18 +204,15 @@
                         newfiles[new_file_counter].write(header)
                         fresh_files.remove(new_file_counter)
                     
-                    if ftype != "sdf" and args["split_after"] == False:
-                        # write record to file
-                        newfiles[new_file_counter].write(record)
-
-                        # if not the first time through, we assign the new record
+                    if sep_at_end:
+                        record += line
+                    # write record to file
+                    newfiles[new_file_counter].write(record)
+                    if not sep_at_end:
                         record = line
-                                                
-                    else:  # for sdf we want to write the line to the record before starting a new one
-                        record += line
-                        newfiles[new_file_counter].write(record)
+                    else:
                         record = ""
-                        
+
                     # change destination file
                     if rand:
                         new_file_counter = int(math.floor(random.random() * numnew))
@@ -229,6 +232,7 @@
                 record += line
         # after loop, write final record to file
         newfiles[new_file_counter].write(record)
+
     # close new files
     close_files(newfiles)