changeset 4:0850f2dfba13 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
author bgruening
date Wed, 09 Oct 2019 07:34:49 -0400
parents 2ddc36385d7a
children e77b954f0da5
files split_file_to_collection.py split_file_to_collection.xml test-data/mol_0.sdf test-data/mol_1.sdf test-data/mol_2.sdf test-data/split_after.fasta
diffstat 6 files changed, 350 insertions(+), 218 deletions(-) [+]
line wrap: on
line diff
--- a/split_file_to_collection.py	Tue Sep 10 12:31:15 2019 -0400
+++ b/split_file_to_collection.py	Wed Oct 09 07:34:49 2019 -0400
@@ -16,7 +16,9 @@
              'fastq': '^@',
              'tabular': '^.*',
              'txt': '^.*',
-             'mgf': '^BEGIN IONS'}
+             'mgf': '^BEGIN IONS',
+             'sdf': '\$\$\$\$',
+             }
 
 
 def main():
@@ -59,7 +61,7 @@
     parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
                                                  " the extension of the new files (without a period)")
     parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
-        choices=["mgf", "fastq", "fasta", "tabular", "txt", "generic"])
+        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
     parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
     parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
         default = "row", choices = ["col", "row"])
@@ -69,10 +71,14 @@
     parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
                                              "If not provided and args[\"rand\"]==True, then date is used", type=int)
     parser.add_argument('--numnew', '-n', type=int, default = 1,
-                        help="Number of output files desired. Not valid for splitting on a column")
+                        help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
+    parser.add_argument('--chunksize', '-k', type=int, default = 0,
+                        help="Number of records by file. Not valid for splitting on a column")
     parser.add_argument('--batch', action='store_true',
                         help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
-
+    parser.add_argument('--split_after', '-p', action='store_true',
+                        help="Split between records after separator (default is before)." + 
+                         "Only for generic - specific ftypes are always split in the default way")
     bycol = parser.add_argument_group('If splitting on a column')
     bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
     bycol.add_argument('--sub', '-s', default = r'\1',
@@ -102,6 +108,7 @@
     # get record separator for given filetype
     sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
 
+    chunksize = args["chunksize"]
     numnew = args["numnew"]
 
     # random division
@@ -114,9 +121,12 @@
 
     # batched division (maintains order)
     batch = args["batch"]
-    # define n_per_file so we don't get a warning about ref before assignment
-    n_per_file = math.inf
-    if batch:
+
+    
+    if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
+        # define n_per_file so we don't get a warning about ref before assignment
+        n_per_file = math.inf
+
         # number of records
         with open(in_file) as f:
             i = 0
@@ -126,9 +136,17 @@
             n_records = i + 1
         if top:
             n_records -= top  # don't count the top lines
+        
+        if chunksize == 0: # i.e. no chunking
+            # approx. number of lines per file
+            n_per_file = n_records // numnew
+        else:
+            # number of output files: record count divided by chunk size (floor division)
+            numnew = n_records // chunksize
+            n_per_file = chunksize
 
-        # approx. number of lines per file
-        n_per_file = n_records // numnew
+
+
 
     # make new files
     # strip extension of old file and add number
@@ -179,13 +197,19 @@
                     if new_file_counter in fresh_files:
                         newfiles[new_file_counter].write(header)
                         fresh_files.remove(new_file_counter)
-
-                    # write record to file
-                    newfiles[new_file_counter].write(record)
+                    
+                    if ftype != "sdf" and args["split_after"] == False:
+                        # write record to file
+                        newfiles[new_file_counter].write(record)
 
-                    # if not the first time through, we assign the new record
-                    record = line
-
+                        # if not the first time through, we assign the new record
+                        record = line
+                                                
+                    else:  # for sdf we want to write the line to the record before starting a new one
+                        record += line
+                        newfiles[new_file_counter].write(record)
+                        record = ""
+                        
                     # change destination file
                     if rand:
                         new_file_counter = int(math.floor(random.random() * numnew))
--- a/split_file_to_collection.xml	Tue Sep 10 12:31:15 2019 -0400
+++ b/split_file_to_collection.xml	Wed Oct 09 07:34:49 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="split_file_to_collection" name="Split file" version="0.2.0">
+<tool id="split_file_to_collection" name="Split file" version="0.3.0">
     <description>to dataset collection</description>
     <macros>
         <xml name="regex_sanitizer">
@@ -15,7 +15,18 @@
             </sanitizer>
         </xml>
         <xml name="numnew_fname">
-            <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/>
+            <conditional name="select_mode">
+                <param name="mode" type="select" label="Specify number of output files or number of records per file?" help="Specify the number of records ('chunk size') to place in each file. The 'Number of new files' parameter will be ignored.">
+                    <option value="chunk">Number of records per file ('chunk mode')</option>
+                    <option value="numnew">Number of output files</option>
+                </param>
+                <when value="chunk">
+                    <param name="chunksize" type="integer" label="Chunk size" min="1" value="1" help="Number of records per output file."/>
+                </when>
+                <when value="numnew">
+                    <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/>
+                </when>
+            </conditional>
             <param name="newfilenames" type="text" label="Base name for new files in collection"
                 help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/>
             <conditional name="select_allocate">
@@ -25,7 +36,7 @@
                     <option value="byrow" selected="true">Alternate output files</option>
                 </param>
                 <when value="random">
-                    <param name="seed" type="integer" label="Random number seed" help="For reproducibility, set this to some arbitrary integer (i.e. '1010')" value="1010"/>
+                    <param name="seed" type="integer" label="Random number seed" help="For reproducibility, set this to some arbitrary integer (e.g. '1010')" value="1010"/>
                 </when>
                 <when value="batch">
                 </when>
@@ -51,7 +62,11 @@
                     --match '$split_parms.split_by.match_regex'
                     --sub '$split_parms.split_by.sub_regex'
                 #else
-                    --numnew '$split_parms.split_by.numnew'
+                    #if $split_parms.split_by.select_mode.mode == "numnew":
+                        --numnew '$split_parms.split_by.select_mode.numnew'
+                    #else
+                        --chunksize $split_parms.split_by.select_mode.chunksize
+                    #end if
                     #if $split_parms.split_by.select_allocate.allocate == "random":
                         --rand
                         --seed '$split_parms.split_by.rand.seed'
@@ -63,8 +78,15 @@
             #else
                 #if $split_parms.select_ftype == "generic"
                     --generic_re '$split_parms.generic_regex'
+                    #if $split_parms.split_after == 'true':
+                        --split_after
+                    #end if
                 #end if
-                --numnew '$split_parms.numnew'
+                #if $split_parms.select_mode.mode == "numnew":
+                    --numnew '$split_parms.select_mode.numnew'
+                #else
+                    --chunksize $split_parms.select_mode.chunksize
+                #end if
                 #if $split_parms.select_allocate.allocate == "random":
                     --rand
                     --seed '$split_parms.select_allocate.seed'
@@ -93,6 +115,7 @@
                 <option value="fastq">FASTQ</option>
                 <option value="tabular">Tabular</option>
                 <option value="fasta">FASTA</option>
+                <option value="sdf">SD-files</option>
                 <option value="txt">Text files</option>
                 <option value="generic">Generic</option>
             </param>
@@ -130,6 +153,10 @@
                 <param name="input" type="data" format="fasta" label="FASTA file to split"/>
                 <expand macro="numnew_fname"/>
             </when>
+            <when value="sdf">
+                <param name="input" type="data" format="sdf" label="SD-file to split"/>
+                <expand macro="numnew_fname"/>
+            </when>
             <when value="txt">
                 <param name="input" type="data" format="txt" label="Text file to split"/>
                 <expand macro="numnew_fname"/>
@@ -140,6 +167,10 @@
                     <expand macro="regex_sanitizer"/>
                 </param>
                 <expand macro="numnew_fname"/>
+                <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">
+                    <option value="false" selected="true">Before</option>
+                    <option value="true">After</option>
+                </param>
             </when>
         </conditional>
     </inputs>
@@ -160,6 +191,10 @@
             <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/>
             <filter>split_parms['select_ftype'] == "fastq"</filter>
         </collection>
+        <collection name="list_output_sdf" type="list" label="${tool.name} on ${on_string}">
+            <discover_datasets pattern="__name__" directory="out" visible="false" format="sdf"/>
+            <filter>split_parms['select_ftype'] == "sdf"</filter>
+        </collection>
         <collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="txt"/>
             <filter>split_parms['select_ftype'] == "txt"</filter>
@@ -189,6 +224,7 @@
             <param name="select_ftype" value="tabular"/>
             <param name="select_split_by" value="row"/>
             <param name="top" value="2"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="test"/>
             <output_collection name="list_output_tab" type="list">
@@ -201,6 +237,7 @@
             <param name="select_ftype" value="tabular"/>
             <param name="select_split_by" value="row"/>
             <param name="top" value="2"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="batch_tab"/>
             <param name="allocate" value="batch"/>
@@ -210,8 +247,23 @@
             </output_collection>
         </test>
         <test>
+            <param name="input" value="test.tabular" ftype="tabular"/>
+            <param name="select_ftype" value="tabular"/>
+            <param name="select_split_by" value="row"/>
+            <param name="top" value="2"/>
+            <param name="mode" value="chunk"/>
+            <param name="chunksize" value="2"/>
+            <param name="newfilenames" value="batch_tab"/>
+            <param name="allocate" value="batch"/>
+            <output_collection name="list_output_tab" type="list">
+                <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/>
+                <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
+            </output_collection>
+        </test>
+        <test>
             <param name="select_ftype" value="txt"/>
             <param name="input" value="karyotype.txt" ftype="txt"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="24"/>
             <param name="newfilenames" value="chr"/>
             <param name="allocate" value="batch"/>
@@ -261,6 +313,7 @@
         <test>
             <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
             <param name="select_ftype" value="mgf"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="3"/>
             <param name="newfilenames" value="demo"/>
             <output_collection name="list_output_mgf" type="list">
@@ -272,6 +325,7 @@
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="test"/>
             <output_collection name="list_output_fasta" type="list">
@@ -280,8 +334,20 @@
             </output_collection>
         </test>
         <test>
+            <param name="input" value="test.fasta" ftype="fasta"/>
+            <param name="select_ftype" value="fasta"/>
+            <param name="mode" value="chunk"/>
+            <param name="chunksize" value="3"/>
+            <param name="newfilenames" value="test"/>
+            <output_collection name="list_output_fasta" type="list">
+                <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/>
+                <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
+            </output_collection>
+        </test>
+        <test>
             <param name="input" value="test.fastq" ftype="fastq"/>
             <param name="select_ftype" value="fastq"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="test"/>
             <output_collection name="list_output_fastq" type="list">
@@ -292,6 +358,7 @@
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="rand"/>
             <param name="allocate" value="random"/>
@@ -304,6 +371,7 @@
         <test>
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="fasta_batch"/>
             <param name="allocate" value="batch"/>
@@ -315,6 +383,7 @@
         <test>
             <param name="input" value="test.tabular" ftype="txt"/>
             <param name="select_ftype" value="txt"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="test"/>
             <output_collection name="list_output_txt" type="list">
@@ -326,6 +395,7 @@
             <param name="input" value="test.tabular" ftype="txt"/>
             <param name="select_ftype" value="generic"/>
             <param name="generic_regex" value="^.*"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="test"/>
             <output_collection name="list_output_generic" type="list">
@@ -337,6 +407,7 @@
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="generic"/>
             <param name="generic_regex" value="^>.*"/>
+            <param name="mode" value="numnew"/>
             <param name="numnew" value="2"/>
             <param name="newfilenames" value="rand"/>
             <param name="allocate" value="random"/>
@@ -348,29 +419,59 @@
         </test>
         <test>
             <param name="input" value="3_molecules.sdf" ftype="sdf"/>
-            <param name="select_ftype" value="generic"/>
-            <param name="generic_regex" value="^\$\$\$\$.*"/>
-            <param name="numnew" value="1000"/>
+            <param name="select_ftype" value="sdf"/>
+            <param name="mode" value="numnew"/>
+            <param name="numnew" value="10"/>
+            <param name="newfilenames" value="mol"/>
+            <param name="allocate" value="batch"/>
+            <output_collection name="list_output_sdf" type="list">
+                <element name="mol_000000.sdf" file="mol_0.sdf" ftype="sdf"/>
+                <element name="mol_000001.sdf" file="mol_1.sdf" ftype="sdf"/>
+                <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
+            </output_collection>
+        </test>
+        <test>
+            <param name="input" value="3_molecules.sdf" ftype="sdf"/>
+            <param name="select_ftype" value="sdf"/>
+            <param name="mode" value="chunk"/>
+            <param name="chunksize" value="1"/>
             <param name="newfilenames" value="mol"/>
             <param name="allocate" value="batch"/>
+            <output_collection name="list_output_sdf" type="list">
+                <element name="mol_000000.sdf" file="mol_0.sdf" ftype="sdf"/>
+                <element name="mol_000001.sdf" file="mol_1.sdf" ftype="sdf"/>
+                <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
+            </output_collection>
+        </test>
+        <test>
+            <param name="input" value="test.fasta" ftype="fasta"/>
+            <param name="select_ftype" value="generic"/>
+            <param name="generic_regex" value="^>.*"/>
+            <param name="split_after" value="true"/>
+            <param name="mode" value="numnew"/>
+            <param name="numnew" value="2"/>
+            <param name="newfilenames" value="rand"/>
+            <param name="allocate" value="random"/>
+            <param name="seed" value="1010"/>
             <output_collection name="list_output_generic" type="list">
-                <element name="mol_000000" file="mol_0.sdf" ftype="sdf"/>
-                <element name="mol_000001" file="mol_1.sdf" ftype="sdf"/>
-                <element name="mol_000002" file="mol_2.sdf" ftype="sdf"/>
+                <element name="rand_000001" file="split_after.fasta" ftype="fasta"/>
             </output_collection>
         </test>
     </tests>
     <help><![CDATA[
 **Split file into a dataset collection**
 
-This tool splits a data sets consisting of records into multiple data sets within a collection.
+This tool splits a data set consisting of records into multiple data sets within a collection.
 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
-(headers + sequence + qualities), etc. The important property is that the begin of a new record
-can be speciefied by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.
-The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF.
-For other data types the text delimiting records can be specified manually using the generic splitter.
+(headers + sequence + qualities), etc. The important property is that the beginning of a new record
+can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.
+The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF.
+For other data types the text delimiting records can be specified manually using the generic splitter. 
+If the generic splitter is used, an option is also available to split records either before or after the
+separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all
+others).
 
-If splitting by line (or by some other item, like a FASTA entry or an MGF record, the splitting can be either done alternating, in original record order, or at random.
+If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be done either in alternating fashion, in original record order, or at random.
 
 If t records are to be distributed to n new data sets, then the i-th record goes to data set
 
@@ -407,13 +508,16 @@
 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column.
 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value or cause other unexpected behavior.
 The default regular expression uses each value in the column without modifying it.
+
+Two modes are available for the tool. For the main mode, the number of output files is selected. In this case, records are shared out between this number of files. Alternatively, 'chunking mode' can be selected, which puts a fixed number of records (the 'chunk size') into each output file.
+
     ]]></help>
     <citations>
         <citation type="bibtex">
 @misc{githubsplit,
   author = {Easterly, Caleb},
   year = {2018},
-  title = {A Galxy tool for splitting a file into a collection},
+  title = {A Galaxy tool for splitting a file into a collection},
   publisher = {GitHub},
   journal = {GitHub repository},
   url = {https://github.com/bgruening/galaxytools/tools/text_processing/split_file_to_collection},
--- a/test-data/mol_0.sdf	Tue Sep 10 12:31:15 2019 -0400
+++ b/test-data/mol_0.sdf	Wed Oct 09 07:34:49 2019 -0400
@@ -163,188 +163,3 @@
 10
 
 $$$$
-2244
- OpenBabel09021316243D
-
- 21 21  0  0  0  0  0  0  0  0999 V2000
-    1.2333    0.5540    0.7792 O   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.6952   -2.7148   -0.7502 O   0  0  0  0  0  0  0  0  0  0  0  0
-    0.7958   -2.1843    0.8685 O   0  0  0  0  0  0  0  0  0  0  0  0
-    1.7813    0.8105   -1.4821 O   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.0857    0.6088    0.4403 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.7927   -0.5515    0.1244 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.7288    1.8464    0.4133 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.1426   -0.4741   -0.2184 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.0787    1.9238    0.0706 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.7855    0.7636   -0.2453 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.1409   -1.8536    0.1477 C   0  0  0  0  0  0  0  0  0  0  0  0
-    2.1094    0.6715   -0.3113 C   0  0  0  0  0  0  0  0  0  0  0  0
-    3.5305    0.5996    0.1635 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.1851    2.7545    0.6593 H   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.7247   -1.3605   -0.4564 H   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.5797    2.8872    0.0506 H   0  0  0  0  0  0  0  0  0  0  0  0
-   -3.8374    0.8238   -0.5090 H   0  0  0  0  0  0  0  0  0  0  0  0
-    3.7290    1.4184    0.8593 H   0  0  0  0  0  0  0  0  0  0  0  0
-    4.2045    0.6969   -0.6924 H   0  0  0  0  0  0  0  0  0  0  0  0
-    3.7105   -0.3659    0.6426 H   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.2555   -3.5916   -0.7337 H   0  0  0  0  0  0  0  0  0  0  0  0
-  1  5  1  0  0  0  0
-  1 12  1  0  0  0  0
-  2 11  1  0  0  0  0
-  2 21  1  0  0  0  0
-  3 11  2  0  0  0  0
-  4 12  2  0  0  0  0
-  5  6  1  0  0  0  0
-  5  7  2  0  0  0  0
-  6  8  2  0  0  0  0
-  6 11  1  0  0  0  0
-  7  9  1  0  0  0  0
-  7 14  1  0  0  0  0
-  8 10  1  0  0  0  0
-  8 15  1  0  0  0  0
-  9 10  2  0  0  0  0
-  9 16  1  0  0  0  0
- 10 17  1  0  0  0  0
- 12 13  1  0  0  0  0
- 13 18  1  0  0  0  0
- 13 19  1  0  0  0  0
- 13 20  1  0  0  0  0
-M  END
->  <PUBCHEM_COMPOUND_CID>
-2244
-
->  <PUBCHEM_CONFORMER_RMSD>
-0.6
-
->  <PUBCHEM_CONFORMER_DIVERSEORDER>
-1
-11
-10
-3
-15
-17
-13
-5
-16
-7
-14
-9
-8
-4
-18
-6
-12
-2
-
->  <PUBCHEM_MMFF94_PARTIAL_CHARGES>
-18
-1 -0.23
-10 -0.15
-11 0.63
-12 0.66
-13 0.06
-14 0.15
-15 0.15
-16 0.15
-17 0.15
-2 -0.65
-21 0.5
-3 -0.57
-4 -0.57
-5 0.08
-6 0.09
-7 -0.15
-8 -0.15
-9 -0.15
-
->  <PUBCHEM_EFFECTIVE_ROTOR_COUNT>
-3
-
->  <PUBCHEM_PHARMACOPHORE_FEATURES>
-5
-1 2 acceptor
-1 3 acceptor
-1 4 acceptor
-3 2 3 11 anion
-6 5 6 7 8 9 10 rings
-
->  <PUBCHEM_HEAVY_ATOM_COUNT>
-13
-
->  <PUBCHEM_ATOM_DEF_STEREO_COUNT>
-0
-
->  <PUBCHEM_ATOM_UDEF_STEREO_COUNT>
-0
-
->  <PUBCHEM_BOND_DEF_STEREO_COUNT>
-0
-
->  <PUBCHEM_BOND_UDEF_STEREO_COUNT>
-0
-
->  <PUBCHEM_ISOTOPIC_ATOM_COUNT>
-0
-
->  <PUBCHEM_COMPONENT_COUNT>
-1
-
->  <PUBCHEM_CACTVS_TAUTO_COUNT>
-1
-
->  <PUBCHEM_CONFORMER_ID>
-000008C400000001
-
->  <PUBCHEM_COORDINATE_TYPE>
-2
-5
-10
-
-$$$$
-
-
- 21 21  0  0  0  0  0  0  0  0999 V2000
-    1.2333    0.5540    0.7792 O   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.6952   -2.7148   -0.7502 O   0  0  0  0  0  0  0  0  0  0  0  0
-    0.7958   -2.1843    0.8685 O   0  0  0  0  0  0  0  0  0  0  0  0
-    1.7813    0.8105   -1.4821 O   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.0857    0.6088    0.4403 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.7927   -0.5515    0.1244 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.7288    1.8464    0.4133 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.1426   -0.4741   -0.2184 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.0787    1.9238    0.0706 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.7855    0.7636   -0.2453 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.1409   -1.8536    0.1477 C   0  0  0  0  0  0  0  0  0  0  0  0
-    2.1094    0.6715   -0.3113 C   0  0  0  0  0  0  0  0  0  0  0  0
-    3.5305    0.5996    0.1635 C   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.1851    2.7545    0.6593 H   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.7247   -1.3605   -0.4564 H   0  0  0  0  0  0  0  0  0  0  0  0
-   -2.5797    2.8872    0.0506 H   0  0  0  0  0  0  0  0  0  0  0  0
-   -3.8374    0.8238   -0.5090 H   0  0  0  0  0  0  0  0  0  0  0  0
-    3.7290    1.4184    0.8593 H   0  0  0  0  0  0  0  0  0  0  0  0
-    4.2045    0.6969   -0.6924 H   0  0  0  0  0  0  0  0  0  0  0  0
-    3.7105   -0.3659    0.6426 H   0  0  0  0  0  0  0  0  0  0  0  0
-   -0.2555   -3.5916   -0.7337 H   0  0  0  0  0  0  0  0  0  0  0  0
-  1  5  1  0  0  0  0
-  1 12  1  0  0  0  0
-  2 11  1  0  0  0  0
-  2 21  1  0  0  0  0
-  3 11  2  0  0  0  0
-  4 12  2  0  0  0  0
-  5  6  1  0  0  0  0
-  5  7  2  0  0  0  0
-  6  8  2  0  0  0  0
-  6 11  1  0  0  0  0
-  7  9  1  0  0  0  0
-  7 14  1  0  0  0  0
-  8 10  1  0  0  0  0
-  8 15  1  0  0  0  0
-  9 10  2  0  0  0  0
-  9 16  1  0  0  0  0
- 10 17  1  0  0  0  0
- 12 13  1  0  0  0  0
- 13 18  1  0  0  0  0
- 13 19  1  0  0  0  0
- 13 20  1  0  0  0  0
-M  END
-$$$$
--- a/test-data/mol_1.sdf	Tue Sep 10 12:31:15 2019 -0400
+++ b/test-data/mol_1.sdf	Wed Oct 09 07:34:49 2019 -0400
@@ -0,0 +1,138 @@
+2244
+ OpenBabel09021316243D
+
+ 21 21  0  0  0  0  0  0  0  0999 V2000
+    1.2333    0.5540    0.7792 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.6952   -2.7148   -0.7502 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.7958   -2.1843    0.8685 O   0  0  0  0  0  0  0  0  0  0  0  0
+    1.7813    0.8105   -1.4821 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.0857    0.6088    0.4403 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7927   -0.5515    0.1244 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7288    1.8464    0.4133 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.1426   -0.4741   -0.2184 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.0787    1.9238    0.0706 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7855    0.7636   -0.2453 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1409   -1.8536    0.1477 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.1094    0.6715   -0.3113 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.5305    0.5996    0.1635 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1851    2.7545    0.6593 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7247   -1.3605   -0.4564 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.5797    2.8872    0.0506 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.8374    0.8238   -0.5090 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7290    1.4184    0.8593 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.2045    0.6969   -0.6924 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7105   -0.3659    0.6426 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2555   -3.5916   -0.7337 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1  5  1  0  0  0  0
+  1 12  1  0  0  0  0
+  2 11  1  0  0  0  0
+  2 21  1  0  0  0  0
+  3 11  2  0  0  0  0
+  4 12  2  0  0  0  0
+  5  6  1  0  0  0  0
+  5  7  2  0  0  0  0
+  6  8  2  0  0  0  0
+  6 11  1  0  0  0  0
+  7  9  1  0  0  0  0
+  7 14  1  0  0  0  0
+  8 10  1  0  0  0  0
+  8 15  1  0  0  0  0
+  9 10  2  0  0  0  0
+  9 16  1  0  0  0  0
+ 10 17  1  0  0  0  0
+ 12 13  1  0  0  0  0
+ 13 18  1  0  0  0  0
+ 13 19  1  0  0  0  0
+ 13 20  1  0  0  0  0
+M  END
+>  <PUBCHEM_COMPOUND_CID>
+2244
+
+>  <PUBCHEM_CONFORMER_RMSD>
+0.6
+
+>  <PUBCHEM_CONFORMER_DIVERSEORDER>
+1
+11
+10
+3
+15
+17
+13
+5
+16
+7
+14
+9
+8
+4
+18
+6
+12
+2
+
+>  <PUBCHEM_MMFF94_PARTIAL_CHARGES>
+18
+1 -0.23
+10 -0.15
+11 0.63
+12 0.66
+13 0.06
+14 0.15
+15 0.15
+16 0.15
+17 0.15
+2 -0.65
+21 0.5
+3 -0.57
+4 -0.57
+5 0.08
+6 0.09
+7 -0.15
+8 -0.15
+9 -0.15
+
+>  <PUBCHEM_EFFECTIVE_ROTOR_COUNT>
+3
+
+>  <PUBCHEM_PHARMACOPHORE_FEATURES>
+5
+1 2 acceptor
+1 3 acceptor
+1 4 acceptor
+3 2 3 11 anion
+6 5 6 7 8 9 10 rings
+
+>  <PUBCHEM_HEAVY_ATOM_COUNT>
+13
+
+>  <PUBCHEM_ATOM_DEF_STEREO_COUNT>
+0
+
+>  <PUBCHEM_ATOM_UDEF_STEREO_COUNT>
+0
+
+>  <PUBCHEM_BOND_DEF_STEREO_COUNT>
+0
+
+>  <PUBCHEM_BOND_UDEF_STEREO_COUNT>
+0
+
+>  <PUBCHEM_ISOTOPIC_ATOM_COUNT>
+0
+
+>  <PUBCHEM_COMPONENT_COUNT>
+1
+
+>  <PUBCHEM_CACTVS_TAUTO_COUNT>
+1
+
+>  <PUBCHEM_CONFORMER_ID>
+000008C400000001
+
+>  <PUBCHEM_COORDINATE_TYPE>
+2
+5
+10
+
+$$$$
--- a/test-data/mol_2.sdf	Tue Sep 10 12:31:15 2019 -0400
+++ b/test-data/mol_2.sdf	Wed Oct 09 07:34:49 2019 -0400
@@ -0,0 +1,47 @@
+
+
+ 21 21  0  0  0  0  0  0  0  0999 V2000
+    1.2333    0.5540    0.7792 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.6952   -2.7148   -0.7502 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.7958   -2.1843    0.8685 O   0  0  0  0  0  0  0  0  0  0  0  0
+    1.7813    0.8105   -1.4821 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.0857    0.6088    0.4403 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7927   -0.5515    0.1244 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7288    1.8464    0.4133 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.1426   -0.4741   -0.2184 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.0787    1.9238    0.0706 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7855    0.7636   -0.2453 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1409   -1.8536    0.1477 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.1094    0.6715   -0.3113 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.5305    0.5996    0.1635 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1851    2.7545    0.6593 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7247   -1.3605   -0.4564 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.5797    2.8872    0.0506 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.8374    0.8238   -0.5090 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7290    1.4184    0.8593 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.2045    0.6969   -0.6924 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7105   -0.3659    0.6426 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2555   -3.5916   -0.7337 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1  5  1  0  0  0  0
+  1 12  1  0  0  0  0
+  2 11  1  0  0  0  0
+  2 21  1  0  0  0  0
+  3 11  2  0  0  0  0
+  4 12  2  0  0  0  0
+  5  6  1  0  0  0  0
+  5  7  2  0  0  0  0
+  6  8  2  0  0  0  0
+  6 11  1  0  0  0  0
+  7  9  1  0  0  0  0
+  7 14  1  0  0  0  0
+  8 10  1  0  0  0  0
+  8 15  1  0  0  0  0
+  9 10  2  0  0  0  0
+  9 16  1  0  0  0  0
+ 10 17  1  0  0  0  0
+ 12 13  1  0  0  0  0
+ 13 18  1  0  0  0  0
+ 13 19  1  0  0  0  0
+ 13 20  1  0  0  0  0
+M  END
+$$$$
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_after.fasta	Wed Oct 09 07:34:49 2019 -0400
@@ -0,0 +1,4 @@
+PROTEIN
+>seq3
+ANOTHERPROTEIN
+>seq4