Mercurial > repos > bgruening > split_file_to_collection

--- a/split_file_to_collection.py	Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.py	Fri Oct 11 18:24:43 2019 -0400
@@ -1,23 +1,24 @@
 #!/usr/bin/env python

 import argparse
+import math
 import os
 import re
 import random
-import math

-
-"""
-regexes that indicate the *beginning* of a record
-new file types can be added by appending to this dict,
-updating the parser, and adding a new type option in the Galaxy wrapper
-"""
-FILETYPES = {'fasta': '^>',
-             'fastq': '^@',
-             'tabular': '^.*',
-             'txt': '^.*',
-             'mgf': '^BEGIN IONS',
-             'sdf': '\$\$\$\$',
+# configuration of the splitting for specific file types
+# - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
+# - number of lines to split after (0 if not splitting by number of lines but regex)
+# - a boolean indicating if the record separator is at the end of the record
+#
+# new file types can be added by appending to this dict,
+# updating the parser, and adding a new type option in the Galaxy wrapper
+FILETYPES = {'fasta': ('^>', 0,  False),
+             'fastq': ('', 4, False),
+             'tabular': ('', 1, False),
+             'txt': ('', 1, False),
+             'mgf': ('^BEGIN IONS', 0, False),
+             'sdf': ('\$\$\$\$', 0, True),
              }


@@ -46,8 +47,8 @@
         args["match"] = replace_mapped_chars(args["match"])
         args["sub"] = replace_mapped_chars(args["sub"])
         split_by_column(args, in_file, out_dir, top)
-
     else:
+        args["generic_re"] = replace_mapped_chars(args["generic_re"])
         split_by_record(args, in_file, out_dir, top, ftype)


@@ -62,23 +63,26 @@
                                                  " the extension of the new files (without a period)")
     parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
         choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
-    parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
     parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
         default = "row", choices = ["col", "row"])
-    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " +
-                                                                 "(tabular only).")
+    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
     parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
     parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
                                              "If not provided and args[\"rand\"]==True, then date is used", type=int)
-    parser.add_argument('--numnew', '-n', type=int, default = 1,
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--numnew', '-n', type=int, default = 1,
                         help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
-    parser.add_argument('--chunksize', '-k', type=int, default = 0,
+    group.add_argument('--chunksize', '-k', type=int, default = 0,
                         help="Number of records by file. Not valid for splitting on a column")
     parser.add_argument('--batch', action='store_true',
                         help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
-    parser.add_argument('--split_after', '-p', action='store_true',
-                        help="Split between records after separator (default is before)." +
-                         "Only for generic - specific ftypes are always split in the default way")
+    generic = parser.add_argument_group('Arguments controling generic splitting')
+    group = generic.add_mutually_exclusive_group()
+    group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False)
+    group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False)
+    generic.add_argument('--split_after', '-p', action='store_true',
+                        help="Split between records after separator (default is before). " +
+                        "Only for generic splitting by regex - specific ftypes are always split in the default way")
     bycol = parser.add_argument_group('If splitting on a column')
     bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
     bycol.add_argument('--sub', '-s', default = r'\1',
@@ -105,8 +109,9 @@


 def split_by_record(args, in_file, out_dir, top, ftype):
-    # get record separator for given filetype
-    sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
+    # get configuration (record separator, lines per record, separator at end) for given filetype
+    sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))
+    sep = re.compile(sep)

     chunksize = args["chunksize"]
     numnew = args["numnew"]
@@ -121,33 +126,36 @@

     # batched division (maintains order)
     batch = args["batch"]
-

+    # determine
+    # - the number of records that should be stored per file
+    #   (done always, even if used only for batch mode)
+    # - if the separator is at the start / end of the record
+    n_per_file = math.inf
     if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
-        # define n_per_file so we don't get a warning about ref before assignment
-        n_per_file = math.inf
-
-        # number of records
         with open(in_file) as f:
-            i = 0
+            # read header lines
+            for i in range(top):
+                f.readline()
+            n_records = 0
             for line in f:
-                if re.match(sep, line) is not None:
-                    i+=1
-            n_records = i + 1
-        if top:
-            n_records -= top  # don't count the top lines
-
+                if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):
+                    n_records += 1
+                    last_line_matched = True
+                else:
+                    last_line_matched = False
+            if sep_at_end and not last_line_matched:
+                n_records += 1
+
+        # if there are fewer records than desired files
+        numnew = min(numnew, n_records)
+        # approx. number of records per file
         if chunksize == 0: # i.e. no chunking
-            # approx. number of lines per file
             n_per_file = n_records // numnew
         else:
-            # approx. number of lines per file
             numnew = n_records // chunksize
             n_per_file = chunksize

-
-
-
     # make new files
     # strip extension of old file and add number
     custom_new_file_name = args["file_names"]
@@ -161,34 +169,32 @@
         open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")
         for count in range(0, numnew)
     ]
-
     # bunch o' counters
     # index to list of new files
-    new_file_counter = 0
-
-    # used for top
-    # number of lines read so far
-    n_read = 0
+    if rand:
+        new_file_counter = int(math.floor(random.random() * numnew))
+    else:
+        new_file_counter = 0
     # to contain header specified by top
     header = ""
     # keep track of the files that have been opened so far
-    fresh_files = {i for i in range(0, numnew)}
+    fresh_files = set(range(numnew))

     # keep track in loop of number of records in each file
     # only used in batch
     records_in_file = 0

     # open file
-    with open(in_file, "r") as file:
+    with open(in_file, "r") as f:
+        # read header
+        for i in range(top):
+            header += f.readline()
+
         record = ""
-        for line in file:
-            n_read += 1
-            if n_read <= top:
-                header += line
-                continue
+        for line_no, line in enumerate(f):
             # check if beginning of line is record sep
             # if beginning of line is record sep, either start record or finish one
-            if re.match(sep, line) is not None:
+            if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):
                 # this only happens first time through
                 if record == "":
                     record += line
@@ -198,18 +204,15 @@
                         newfiles[new_file_counter].write(header)
                         fresh_files.remove(new_file_counter)

-                    if ftype != "sdf" and args["split_after"] == False:
-                        # write record to file
-                        newfiles[new_file_counter].write(record)
-
-                        # if not the first time through, we assign the new record
+                    if sep_at_end:
+                        record += line
+                    # write record to file
+                    newfiles[new_file_counter].write(record)
+                    if not sep_at_end:
                         record = line
-
-                    else:  # for sdf we want to write the line to the record before starting a new one
-                        record += line
-                        newfiles[new_file_counter].write(record)
+                    else:
                         record = ""
-
+
                     # change destination file
                     if rand:
                         new_file_counter = int(math.floor(random.random() * numnew))
@@ -229,6 +232,7 @@
                 record += line
         # after loop, write final record to file
         newfiles[new_file_counter].write(record)
+
     # close new files
     close_files(newfiles)
--- a/split_file_to_collection.xml	Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.xml	Fri Oct 11 18:24:43 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="split_file_to_collection" name="Split file" version="0.3.0">
+<tool id="split_file_to_collection" name="Split file" version="0.4.0">
     <description>to dataset collection</description>
     <macros>
         <xml name="regex_sanitizer">
@@ -77,9 +77,13 @@
                 #end if
             #else
                 #if $split_parms.select_ftype == "generic"
-                    --generic_re '$split_parms.generic_regex'
-                    #if $split_parms.split_after == 'true':
-                        --split_after
+                    #if $split_parms.split_method.select_split_method == "regex"
+                        --generic_re '$split_parms.split_method.generic_regex'
+                        #if $split_parms.split_method.split_after == 'true':
+                            --split_after
+                        #end if
+                    #else
+                        --generic_num $split_parms.split_method.record_length
                     #end if
                 #end if
                 #if $split_parms.select_mode.mode == "numnew":
@@ -163,14 +167,25 @@
             </when>
             <when value="generic">
                 <param name="input" type="data" format="txt" label="File to split"/>
-                <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">
-                    <expand macro="regex_sanitizer"/>
-                </param>
+                <conditional name="split_method">
+                    <param name="select_split_method" type="select" label="Method to split files">
+                        <option value="regex">Specify record separator as regular expression</option>
+                        <option value="number">Specify number of lines after which a record ends</option>
+                    </param>
+                    <when value="regex">
+                        <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">
+                            <expand macro="regex_sanitizer"/>
+                        </param>
+                        <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">
+                            <option value="false" selected="true">Before</option>
+                            <option value="true">After</option>
+                        </param>
+                    </when>
+                    <when value="number">
+                        <param name="record_length" type="integer" value="1" label="Record length" help="The number of lines after which each record ends"/>
+                    </when>
+                </conditional>
                 <expand macro="numnew_fname"/>
-                <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">
-                    <option value="false" selected="true">Before</option>
-                    <option value="true">After</option>
-                </param>
             </when>
         </conditional>
     </inputs>
@@ -205,6 +220,7 @@
         </collection>
     </outputs>
     <tests>
+        <!-- 1 -->
         <test>
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -219,6 +235,7 @@
                 <element name="foo3.tab" file="foo3.tab" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 2 -->
         <test>
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -232,6 +249,7 @@
                 <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 3 -->
         <test>
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -246,6 +264,7 @@
                 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 4 -->
         <test>
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -260,6 +279,7 @@
                 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 5 -->
         <test>
             <param name="select_ftype" value="txt"/>
             <param name="input" value="karyotype.txt" ftype="txt"/>
@@ -295,6 +315,7 @@
                 <element name="chr_000023.txt" file="chr_000023.txt" ftype="txt"/>
             </output_collection>
         </test>
+        <!-- 6 -->
         <test>
             <param name="input" value="psm.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
@@ -310,6 +331,7 @@
                 <element name="file4.tab" file="file4.tab" ftype="tabular"/>
             </output_collection>
         </test>
+        <!-- 7 splitting of mgf -->
         <test>
             <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
             <param name="select_ftype" value="mgf"/>
@@ -322,6 +344,7 @@
                 <element name="demo_000002.mgf" file="demo_2.mgf" ftype="mgf"/>
             </output_collection>
         </test>
+        <!-- 8 splitting of fasta + desired number of files-->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
@@ -333,6 +356,7 @@
                 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- 9 splitting of fasta + desired chunksize -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
@@ -344,6 +368,7 @@
                 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- 10 splitting of fastq, specify desired number of files -->
         <test>
             <param name="input" value="test.fastq" ftype="fastq"/>
             <param name="select_ftype" value="fastq"/>
@@ -355,6 +380,23 @@
                 <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/>
             </output_collection>
         </test>
+        <!-- 11 splitting of fastq, specify desired number of files
+             same as previous test, but by specifying the number of lines per record
+             explicitly (not using the preset of the python script) -->
+        <test>
+            <param name="input" value="test.fastq" ftype="fastq"/>
+            <param name="select_ftype" value="generic"/>
+            <param name="select_split_method" value="number"/>
+            <param name="record_length" value="4"/>
+            <param name="mode" value="numnew"/>
+            <param name="numnew" value="2"/>
+            <param name="newfilenames" value="test"/>
+            <output_collection name="list_output_generic" type="list">
+                <element name="test_000000" file="test_0.fastq" ftype="fastq"/>
+                <element name="test_000001" file="test_1.fastq" ftype="fastq"/>
+            </output_collection>
+        </test>
+        <!-- splitting of fasta w random assignment and specific filename prefix -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
@@ -368,6 +410,7 @@
                 <element name="rand_000001.fasta" file="rand_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- splitting of fasta w batch assignment and specific filename prefix -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
@@ -380,6 +423,7 @@
                 <element name="fasta_batch_000001.fasta" file="fasta_batch_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- splitting of txt w default (alternating assignment) -->
         <test>
             <param name="input" value="test.tabular" ftype="txt"/>
             <param name="select_ftype" value="txt"/>
@@ -391,9 +435,11 @@
                 <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/>
             </output_collection>
         </test>
+        <!-- generic-regex splitting (of txt) w default assignment (alternating) -->
         <test>
             <param name="input" value="test.tabular" ftype="txt"/>
             <param name="select_ftype" value="generic"/>
+            <param name="select_split_method" value="regex"/>
             <param name="generic_regex" value="^.*"/>
             <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
@@ -403,9 +449,11 @@
                 <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/>
             </output_collection>
         </test>
+        <!-- generic-regex splitting (of a fasta) w random assignment -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="generic"/>
+            <param name="select_split_method" value="regex"/>
             <param name="generic_regex" value="^>.*"/>
             <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
@@ -417,6 +465,7 @@
                 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <!-- sdf + specify desired number of files -->
         <test>
             <param name="input" value="3_molecules.sdf" ftype="sdf"/>
             <param name="select_ftype" value="sdf"/>
@@ -430,6 +479,7 @@
                 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
             </output_collection>
         </test>
+        <!-- sdf + specify desired number of records per file (chunksize) -->
         <test>
             <param name="input" value="3_molecules.sdf" ftype="sdf"/>
             <param name="select_ftype" value="sdf"/>
@@ -443,10 +493,12 @@
                 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
             </output_collection>
         </test>
+        <!-- test split_after (by splitting fasta files after non-header lines) -->
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="generic"/>
-            <param name="generic_regex" value="^>.*"/>
+            <param name="select_split_method" value="regex"/>
+            <param name="generic_regex" value="^[^>].*"/>
             <param name="split_after" value="true"/>
             <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
@@ -454,7 +506,8 @@
             <param name="allocate" value="random"/>
             <param name="seed" value="1010"/>
             <output_collection name="list_output_generic" type="list">
-                <element name="rand_000001" file="split_after.fasta" ftype="fasta"/>
+                <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/>
+                <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
     </tests>
@@ -463,10 +516,11 @@

 This tool splits a data set consisting of records into multiple data sets within a collection.
 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
-(headers + sequence + qualities), etc. The important property is that the beginning of a new record
-can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.
-The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF.
-For other data types the text delimiting records can be specified manually using the generic splitter.
+(headers + sequence + qualities), etc. The important property is that the records either have a
+specific length (e.g. 4 lines for FASTQ) or that the beginning/end of a new record
+can be specified by a regular expression, e.g. ".*" for lines or ">.*" for FASTA.
+The tool has presets for text, tabular data sets (which are split after each line), FASTA (new records start with ">.*"), FASTQ (records consist of 4 lines), MGF (records start with "^BEGIN IONS") and SDF (records end with "$$$$").
+For other data types the text delimiting records or the number of lines making up a record can be specified manually using the generic splitter.
 If the generic splitter is used, an option is also available to split records either before or after the
 separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all
 others).
--- a/test-data/rand_0.fasta	Wed Oct 09 07:34:49 2019 -0400
+++ b/test-data/rand_0.fasta	Fri Oct 11 18:24:43 2019 -0400
@@ -1,5 +1,5 @@
->seq1
-PROTEIN0
+>seq3
+ANOTHERPROTEIN
 >seq4
 ASFWEFOIN
 >seq5
--- a/test-data/rand_1.fasta	Wed Oct 09 07:34:49 2019 -0400
+++ b/test-data/rand_1.fasta	Fri Oct 11 18:24:43 2019 -0400
@@ -1,4 +1,4 @@
+>seq1
+PROTEIN0
 >seq2
 PROTEIN
->seq3
-ANOTHERPROTEIN