Repository 're_utils'
hg clone https://toolshed.g2.bx.psu.edu/repos/petr-novak/re_utils

Changeset 3:e320ef2d105a (2019-09-05)
Previous changeset 2:ff658cf87f16 (2019-09-04) Next changeset 4:d397f5a85464 (2019-09-18)
Commit message:
Uploaded
modified:
README.md
added:
ChipSeqRatioAnalysis.R
ChipSeqRatioAnalysis.py
ChipSeqRatioDef.xml
deinterlacer.py
fasta_interlacer.xml
fastq_name_affixer.py
fastq_name_affixer.xml
pairScan.py
pairScan.xml
renameSequences.xml
renameSequences2.py
sampleFasta.xml
test_data/VTS_contigs.info.minRD5
test_data/seq_C_10k
test_data/seq_I_10k
b
diff -r ff658cf87f16 -r e320ef2d105a ChipSeqRatioAnalysis.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ChipSeqRatioAnalysis.R Thu Sep 05 09:04:56 2019 -0400
[
@@ -0,0 +1,140 @@
+#!/usr/bin/env Rscript
+library(R2HTML, quietly=T)
+library(base64enc, quietly=T)
+
+
+# Static HTML preamble (document head plus embedded CSS) that is written at
+# the top of the report before the plot and the result table are appended.
+# Every CSS declaration must end with ';' - a missing terminator makes the
+# browser discard that declaration together with the one following it.
+htmlheader <- "
+  <html xmlns:mml=\"http://www.w3.org/1998/Math/MathML\">
+  <head>
+  <title> ChIP-Seq Mapper Output </title>
+  <style>
+  <!--
+  table { background:#FFFFFF;
+  border:1px solid gray;
+  border-collapse:collapse;
+  color:#fff;
+  font:normal 13px verdana, arial, helvetica, sans-serif;
+    width: 100%;
+
+  }
+  caption { border:1px solid #5C443A;
+  color:#5C443A;
+  font-weight:bold;
+  font-size:20pt;
+  padding:6px 4px 8px 0px;
+  text-align:center;
+  
+  }
+  td, th { color:#363636;
+  padding:.4em;
+  }
+  tr { border:1px dotted gray;
+  }
+  thead th, tfoot th { background:#5C443A;
+  color:#FFFFFF;
+  padding:3px 10px 3px 10px;
+  text-align:left;
+  text-transform:uppercase;
+  }
+  tbody td a { color:#3636FF;
+  text-decoration:underline;
+  }
+  tbody td a:visited { color:gray;
+  text-decoration:line-through;
+  }
+  tbody td a:hover { text-decoration:underline;
+  }
+  tbody th a { color:#3636FF;
+  font-weight:normal;
+  text-decoration:none;
+  }
+  tbody th a:hover { color:#363636;
+  }
+  tbody td+td+td+td a { background-image:url('bullet_blue.png');
+  background-position:left center;
+  background-repeat:no-repeat;
+  color:#FFFFFF;
+  padding-left:15px;
+  }
+  tbody td+td+td+td a:visited { background-image:url('bullet_white.png');
+  background-position:left center;
+  background-repeat:no-repeat;
+  }
+  tbody th, tbody td { text-align:left;
+  vertical-align:top;
+  }
+  tfoot td { background:#5C443A;
+  color:#FFFFFF;
+  padding-top:3px;
+  }
+  .odd { background:#fff;
+  }
+  tbody tr:hover { background:#EEEEEE;
+  border:1px solid #03476F;
+  color:#000000;
+  }
+  -->
+  </style>
+  
+  </head>
+  
+  "
+
+
+                                        #arguments
+# Positional arguments, in the order ChipSeqRatioAnalysis.py passes them:
+#   1. tab-delimited hit table with columns Cluster, Chip_Hits, Input_Hits
+#      (this file is overwritten at the end with the ratio columns added)
+#   2. path of the HTML report to create
+#   3. total number of Input reads
+#   4. total number of ChIP reads
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) < 4) {
+  stop("usage: ChipSeqRatioAnalysis.R <hits.tsv> <report.html> <inputN> <chipN>",
+       call. = FALSE)
+}
+input <- args[1]
+HTMLfile <- args[2]
+# Cut-off on ChIP/(ChIP+Input); 2/3 is equivalent to a ChIP/Input ratio of 2
+threshld <- 2 / (2 + 1)
+inputN <- as.numeric(args[3])
+chipN <- as.numeric(args[4])
+if (is.na(inputN) || is.na(chipN)) {
+  stop("inputN and chipN must be numeric", call. = FALSE)
+}
+                                        #dataframe preprocessing and table creation
+df <- read.delim(input, comment.char = "#")
+
+df[["Ratio Chip/Input"]] <- df$Chip_Hits / df$Input_Hits
+df[["Normalized ratio Chip/Input"]] <-
+  (df$Chip_Hits / chipN) / (df$Input_Hits / inputN)
+
+df[["Ratio Chip/(Chip+Input)"]] <- df$Chip_Hits / (df$Chip_Hits + df$Input_Hits)
+df[["Normalized ratio Chip/(Chip+Input)"]] <-
+  (df$Chip_Hits / chipN) / ((df$Input_Hits / inputN) + (df$Chip_Hits / chipN))
+
+# Rows above the threshold; clusters without any hit give NaN ratios, which
+# surface here as all-NA rows and are removed by the is.na() filter.
+outputTable <- df[df[["Normalized ratio Chip/(Chip+Input)"]] > threshld, ]
+outputTable <- outputTable[!is.na(outputTable$Cluster), ]
+                                        #Plot creation
+pngfile <- tempfile()
+png(pngfile, width = 1000, height = 1200, pointsize = 20)
+par(mfrow = c(3, 1))
+norm_ratio <- df[["Normalized ratio Chip/Input"]]
+# panel 1: log scale, so only strictly positive finite values define the axis
+lims <- range(norm_ratio[norm_ratio > 0], finite = TRUE)
+suppressWarnings(plot(df$Cluster, norm_ratio, log = "y", xlab = "Cluster Nr.",
+                      ylab = "Normalized ChiP/Seq ratio", pch = 20, ylim = lims))
+abline(h = 1, col = "#00000080", lwd = 2)
+abline(h = 2, col = "#FF000080", lwd = 2)
+
+# panel 2: same data on a linear scale
+lims <- range(norm_ratio, finite = TRUE)
+suppressWarnings(plot(df$Cluster, norm_ratio, xlab = "Cluster Nr.",
+                      ylab = "Normalized ChiP/Seq ratio", pch = 20, ylim = lims))
+abline(h = 1, col = "#00000080", lwd = 2)
+abline(h = 2, col = "#FF000080", lwd = 2)
+
+# panel 3: bounded ratio with the reporting threshold drawn in red
+suppressWarnings(plot(df$Cluster, df[["Normalized ratio Chip/(Chip+Input)"]],
+                      xlab = "Cluster Nr.", ylab = "Normalized Chip/(Chip+Input)",
+                      pch = 20))
+abline(h = 0.5, col = "#00000080", lwd = 2)
+abline(h = threshld, col = "#FF000080", lwd = 2)
+
+
+dev.off()
+# Embed the plot as a data URI; paste0() avoids the spaces paste() would put
+# around the comma and the base64 payload.
+graph <- paste0('<img src="data:image/png;base64,',
+                base64encode(pngfile),
+                '" alt="image" />')
+unlink(pngfile)
+
+                                        #HTML report creation + writing final output
+# The report is assembled under its base name in the working directory and
+# moved to the requested location afterwards.
+filename <- basename(HTMLfile)
+## create HTML header
+cat(htmlheader, file = filename)
+
+
+HTML(graph, file = filename)
+if (nrow(outputTable) > 0) {
+  HTML(outputTable, file = filename, classtable = "dataframe",
+       row.names = FALSE, Align = "left")
+}
+HTMLEndFile(filename)
+# file.rename() only returns FALSE on failure (e.g. when source and target
+# are on different file systems), so fall back to copy + delete.
+if (!file.rename(from = filename, to = HTMLfile)) {
+  if (!file.copy(from = filename, to = HTMLfile, overwrite = TRUE)) {
+    stop("cannot write HTML report to ", HTMLfile, call. = FALSE)
+  }
+  unlink(filename)
+}
+write.table(df, file = input, sep = "\t", row.names = FALSE)
b
diff -r ff658cf87f16 -r e320ef2d105a ChipSeqRatioAnalysis.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ChipSeqRatioAnalysis.py Thu Sep 05 09:04:56 2019 -0400
[
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+import re
+import argparse
+import csv
+import os
+import sys
+import os.path
+import subprocess
+import shlex
+import multiprocessing as mp
+import tempfile
+import itertools as it
+
+
+def get_arguments():
+    '''Parse the command line of the ChIP-Seq mapper.
+
+    Returns the argparse namespace with the attributes max_cl, bitscore,
+    nproc, ChipSeq, InputSeq, output, html and Contigs.
+    '''
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        '-m', '--max_cl', type=int, default=300,
+        help='Sets the maximum cluster number. Default = 300')
+    cli.add_argument(
+        '-b', '--bitscore', type=int, default=30,
+        help='minimal bitscore to report')
+    cli.add_argument(
+        '-n', '--nproc', type=int, default=mp.cpu_count(),
+        help='Sets the number of cpus to be used. Default = all available')
+    cli.add_argument(
+        '-c', '--ChipSeq', required=True,
+        help='Fasta file containing the Chip Sequences')
+    cli.add_argument(
+        '-i', '--InputSeq', required=True,
+        help='Fasta file containing the Input Sequences')
+    cli.add_argument(
+        '-o', '--output', required=False, default='ChipSeqRatio.csv',
+        help=('Specify a name for the CSV file to which the'
+              ' output will be save to. Default: ChipSeqRatio.csv'))
+    cli.add_argument(
+        '-ht', '--html', required=False, default='ChipSeqRatioReport',
+        help='Specify a name for the html report. Default : ChipSeqRatioReport')
+    cli.add_argument(
+        '-k', '--Contigs', required=True,
+        help='Contig file for blast')
+    return cli.parse_args()
+
+
+def split(filename, num_chunks):
+    '''Distribute the records of a FASTA file over num_chunks temp files.
+
+    Records (a ">" header line plus every following line up to the next
+    header) are dealt out round-robin: the first record goes to the first
+    temporary file, the second to the next one, and so on.
+
+    filename   -- path of the FASTA file to read
+    num_chunks -- number of temporary files to create
+
+    Returns the list of temporary file names. The files are created with
+    delete=False, so the caller is responsible for removing them. Lines
+    preceding the first header are ignored; an empty input yields empty
+    chunk files.
+    '''
+    temp_files = [tempfile.NamedTemporaryFile('w', delete=False)
+                  for _ in range(num_chunks)]
+    try:
+        targets = it.cycle(temp_files)
+        current = None  # chunk receiving the record being read
+        with open(filename, 'r') as f:
+            for line in f:
+                if line.startswith('>'):
+                    # a new record starts: move on to the next chunk
+                    current = next(targets)
+                if current is not None:
+                    current.write(line)
+    finally:
+        for temp in temp_files:
+            temp.close()
+    return [temp.name for temp in temp_files]
+
+
+def blast(query, database, bitscore):
+    '''Run blastn for one query file and count the hits per cluster.
+
+    query    -- FASTA file with the reads to search
+    database -- name of the BLAST database to search against
+    bitscore -- only hits with a bit score above this value are counted
+
+    Returns a list of max_cl + 1 counters (max_cl is read from the module
+    level): index N holds the number of reads hitting a subject named
+    "CL<N>Contig...", index 0 collects the hits to clusters above max_cl.
+    Consecutive output lines of the same query are counted only once.
+    '''
+    print(query)
+    # argument list instead of a formatted string, so that paths containing
+    # spaces are passed through unchanged
+    cmd = [
+        "blastn", "-task", "blastn", "-db", database, "-query", query,
+        "-evalue", "1e-2", "-gapopen", "5", "-gapextend", "2",
+        "-word_size", "11", "-num_alignments", "1",
+        "-penalty", "-3", "-reward", "2", "-outfmt", "6", "-dust", "no"
+    ]
+    Blast_Output = [0 for x in range(max_cl + 1)]
+    ma = re.compile(r'(\S+)\tCL(\d+)Contig')  # query id and cluster number
+    # must live outside the loop, otherwise every line looks like a new query
+    previous_query = ''
+    with subprocess.Popen(cmd,
+                          stdout=subprocess.PIPE,
+                          universal_newlines=True) as p:
+        for line in p.stdout:
+            # column 12 of the tabular output (-outfmt 6) is the bit score
+            if float(line.split()[11]) > bitscore:
+                gr = ma.match(line)
+                if gr and gr.group(1) != previous_query:
+                    cluster = int(gr.group(2))
+                    if cluster > max_cl:
+                        Blast_Output[0] += 1
+                    else:
+                        Blast_Output[cluster] += 1
+                    previous_query = gr.group(1)
+    return Blast_Output
+
+
+def ReduceLists(x, y):
+    ''' sums two lists of per-cluster count lists into a 2-row matrix:
+    row 0 holds the column sums of x, row 1 the column sums of y;
+    each row has max_cl + 1 entries (max_cl is a module-level value) '''
+    width = max_cl + 1
+    totals = [[0] * width, [0] * width]
+    for row, partial_counts in enumerate((x, y)):
+        for counts in partial_counts:
+            for col, value in enumerate(counts):
+                totals[row][col] = totals[row][col] + value
+    return totals
+
+
+def fasta_size(fastafile):
+    '''Return the number of lines of fastafile that start with ">".'''
+    with open(fastafile, 'r') as handle:
+        return sum(1 for entry in handle if entry[0] == ">")
+
+
+def makeblastdb(filename):
+    '''Build a nucleotide BLAST database from a FASTA file.
+
+    The database is written under the name of a freshly created
+    NamedTemporaryFile, which is returned; use its .name attribute as the
+    database name. Keep the returned object referenced for as long as the
+    database is needed.
+
+    Raises subprocess.CalledProcessError when makeblastdb exits with a
+    non-zero status, instead of letting the later searches run against a
+    database that was never built.
+    '''
+    dbtmp = tempfile.NamedTemporaryFile()
+    cmd = [
+        'makeblastdb', '-in', filename, '-input_type', 'fasta', '-dbtype',
+        'nucl', '-out', dbtmp.name
+    ]
+    subprocess.check_call(cmd)
+    return dbtmp
+
+
+if __name__ == "__main__":
+    args = get_arguments()
+    # blast() and ReduceLists() read max_cl as a module-level name
+    max_cl = args.max_cl
+    output = args.output
+    HTMLreport = args.html
+    contigs = args.Contigs
+
+    # Creation of database
+    db = makeblastdb(contigs)
+
+    inputN = fasta_size(args.InputSeq)
+    chipN = fasta_size(args.ChipSeq)
+
+    # Reading and distribution of data to temp files for multiprocessing
+    filesC = split(args.ChipSeq, args.nproc)
+    filesI = split(args.InputSeq, args.nproc)
+
+    try:
+        # parallelized blast: queue ChIP and Input chunks together so the
+        # workers stay busy, then collect both result sets
+        with mp.Pool(processes=args.nproc) as pool:
+            jobsC = [pool.apply_async(blast, args=(f, db.name, args.bitscore))
+                     for f in filesC]
+            jobsI = [pool.apply_async(blast, args=(f, db.name, args.bitscore))
+                     for f in filesI]
+            Cout = [p.get() for p in jobsC]
+            Iout = [p.get() for p in jobsI]
+    finally:
+        # the chunk files were created with delete=False; remove them even
+        # when a search failed
+        for i in filesC + filesI:
+            os.unlink(i)
+
+    # Merging of blast output into a 2-dim matrix
+    Matrix = ReduceLists(Cout, Iout)
+
+    with open(args.output, 'w') as f:
+        print("Cluster", "Chip_Hits", "Input_Hits", sep='\t', file=f)
+        for hit in range(1, args.max_cl + 1):
+            print(hit, Matrix[0][hit], Matrix[1][hit], sep='\t', file=f)
+
+    # The R script sits next to this file. abspath() is required because
+    # dirname(__file__) is empty when the script is started by its bare
+    # name, which would turn the path into "/ChipSeqRatioAnalysis.R".
+    rscript = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                           "ChipSeqRatioAnalysis.R")
+    # the R script reads its arguments by position - keep this order
+    rcmd = ["Rscript", rscript, output, HTMLreport, str(inputN), str(chipN)]
+    with subprocess.Popen(rcmd,
+                          stderr=subprocess.PIPE,
+                          universal_newlines=True) as p:
+        print("Creating HTML report")
+        stdout, stderr = p.communicate()
+        if (len(stderr) > 0):
+            print(stderr)
+    # propagate a failed report generation to the caller
+    if p.returncode != 0:
+        sys.exit(p.returncode)
b
diff -r ff658cf87f16 -r e320ef2d105a ChipSeqRatioDef.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ChipSeqRatioDef.xml Thu Sep 05 09:04:56 2019 -0400
b
@@ -0,0 +1,60 @@
+<tool id="chip_seq_ratio_1" name="Chip-Seq Mapper" version="0.1.1">
+  <stdio>
+    <exit_code range="1:" level="fatal" description="Error"/>
+  </stdio>
+    <description></description>
+    <requirements>
+      <requirement type="package">r-base64enc</requirement>
+      <requirement type="package">r-r2html</requirement>
+      <requirement type="package">blast</requirement>
+    </requirements>
+    <command interpreter="python3">
+ ChipSeqRatioAnalysis.py 
+ --ChipSeq=${ChipFile}
+ --InputSeq=${InputFile}
+ --Contigs=${ContigFile}
+ --output=${OutputFile}
+ --html=${ReportFile}
+ --max_cl=${MaxCl}
+  --bitscore=$bitscore
+  --nproc=16
+    </command>
+
+    <inputs>
+        <param name="ChipFile" label="Chip Sequences" type="data" format="fasta" help="NGS data in fasta format"/> 
+ <param name="InputFile" label="Input Sequences" type="data" format="fasta" help="NGS data in fasta format"/>
+ <param name="ContigFile" label="Reference - Contig Sequences" type="data" format="fasta"
+        help="Contigs obtained from RepeatExplorer clustering pipeline in fasta file"/> 
+ <param name="MaxCl" label="Number of clusters to be shown in graph" type="integer" value="200"/>   
+ <param name="bitscore" label="Minimum bit score threshold" type="integer" value="30" help="All similarity hits with lower bit score will not be considered for ChIP/Input ratio calculation"/>   
+    </inputs>
+    <outputs>
+     <data name="OutputFile" format="tabular"/>
+ <data name="ReportFile" format="html"/> 
+    </outputs>
+
+    <help>
+**What it does**
+
+Analysis of NGS sequences from Chromatin Immunoprecipitation. ChIP
+and Input reads are mapped to contigs obtained from graph-based
+repetitive sequence clustering (`Novak et al. 2013`__) to identify enriched repeats. Reads from Input
+and ChIP should ideally be short Illumina reads of uniform length
+above 80 nt. It is sufficient to use about 1 million reads for both Input and ChIP.
+This method was first used in (`Neumann et al. 2012`__) for
+identification of repetitive sequences associated with the centromeric
+region. If you use this method, please cite:
+
+
+`PLoS Genet. Epub 2012 Jun 21. Stretching the rules: monocentric chromosomes with multiple centromere domains. Neumann P, Navrátilová A, Schroeder-Reiter E, Koblížková A, Steinbauerová V, Chocholová E, Novák P, Wanner G, Macas J.`__.
+
+.. __: http://bioinformatics.oxfordjournals.org/content/29/6/792.full

+.. __: http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1002777
+.. __: http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1002777
+      
+    </help>
+
+</tool>
+

b
diff -r ff658cf87f16 -r e320ef2d105a README.md
--- a/README.md Wed Sep 04 07:33:42 2019 -0400
+++ b/README.md Thu Sep 05 09:04:56 2019 -0400
[
@@ -48,6 +48,55 @@
 if you want to do comparative analysis with RepeatExplorer and need to
 append sample codes to sequence identifiers
 
+### ChIP-Seq-mapper ###
+
+
+Analysis of NGS sequences from Chromatin Immunoprecipitation. ChIP and Input reads are mapped to contigs obtained from graph-based repetitive sequence clustering to identify enriched repeats. This method was used in (Neumann et al. 2012) for identification of repetitive sequences associated with the centromeric region.
+
+#### Authors ####
+Petr Novak, Jiri Macas, Pavel Neumann, Georg Hermanutz
+
+Biology Centre CAS, Czech Republic
+
+
+#### Installation and dependencies ####
+
+ChIP-Seq-mapper requires NCBI BLAST, the R programming language with the R2HTML and base64enc packages installed, and python3.
+
+#### Usage ####
+
+```
+ChipSeqRatioAnalysis.py [-h] [-m MAX_CL] [-n NPROC] -c CHIPSEQ -i
+                               INPUTSEQ [-o OUTPUT] [-ht HTML] [-t THRESHOLD]
+                               -k CONTIGS
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -m MAX_CL, --max_cl MAX_CL
+                        Sets the maximum cluster number. Default = 300
+  -n NPROC, --nproc NPROC
+                        Sets the number of cpus to be used. Default = all
+                        available
+  -c CHIPSEQ, --ChipSeq CHIPSEQ
+                        Fasta file containing the Chip Sequences
+  -i INPUTSEQ, --InputSeq INPUTSEQ
+                        Fasta file containing the Input Sequences
+  -o OUTPUT, --output OUTPUT
+                        Specify a name for the CSV file to which the output
+                        will be save to. Default: ChipSeqRatio.csv                      
+  -ht HTML, --html HTML                                                                 
+                        Specify a name for the html report. Default :                   
+                        ChipSeqRatioReport                                              
+  -t THRESHOLD, --threshold THRESHOLD                                                   
+                        Optional plot filter. Default: mean ration between              
+                        Input hits and Chip hits.                                       
+  -k CONTIGS, --Contigs CONTIGS                                                         
+                        Contig file for blast 
+```
+
+####  References ####
+[PLoS Genet. Epub 2012 Jun 21. Stretching the rules: monocentric chromosomes with multiple centromere domains. Neumann P, Navrátilová A, Schroeder-Reiter E, Koblížková A, Steinbauerová V, Chocholová E, Novák P, Wanner G, Macas J.](http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1002777)
+
 
 ## Dependencies ##
 
b
diff -r ff658cf87f16 -r e320ef2d105a deinterlacer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/deinterlacer.py Thu Sep 05 09:04:56 2019 -0400
[
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+'''very simple deinterlacer - fasta and fastq'''
+import sys
+import itertools
+
+
+def is_header(line, counter, fasta):
+    ''' return True if line starts a new record: a ">" line in fasta,
+    or an "@" line reached when counter equals 4 in fastq '''
+    if fasta:
+        return line[0] == ">"
+    return counter == 4 and line[0] == "@"
+
+
+def main():
+    '''deinterlace fasta or fastq format
+
+    Command line: deinterlacer.py INFILE FILE_A FILE_B
+    Records of INFILE are written alternately to FILE_A and FILE_B,
+    starting with FILE_A. The format is detected from the first character
+    of INFILE (">" means fasta, anything else is treated as fastq).
+    An empty INFILE produces two empty output files.
+    '''
+    if len(sys.argv) < 4:
+        sys.exit("usage: deinterlacer.py INFILE FILE_A FILE_B")
+    infile = sys.argv[1]
+    file_a = sys.argv[2]
+    file_b = sys.argv[3]
+    with open(infile) as f, open(file_a, 'w') as A, open(file_b, 'w') as B:
+        ABiter = itertools.cycle([A, B])
+        counter = 3  # four lines per record in fastq
+        pos = f.tell()
+        first_line = f.readline()
+        if not first_line:
+            return  # nothing to deinterlace
+        is_fasta = first_line[0] == ">"
+        f.seek(pos)
+        fout = None  # no record started yet
+        for line in f:
+            counter += 1
+            if is_header(line, counter, is_fasta):
+                fout = next(ABiter)
+                counter = 0
+            if fout is None:
+                raise ValueError(
+                    "input does not start with a fasta/fastq header line")
+            fout.write(line)
+
+
+if __name__ == "__main__":
+    main()
b
diff -r ff658cf87f16 -r e320ef2d105a fasta_interlacer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_interlacer.xml Thu Sep 05 09:04:56 2019 -0400
b
@@ -0,0 +1,33 @@
+
+<tool id="fasta_interlacer" name="FASTA interlacer" version="1.0.0">
+<description> Join paired reads into a single file </description>
+<command interpreter="python">
+fasta_interlacer.py -a $A -b $B -p $paired -x $single
+</command>
+
+ <inputs>
+  <param format="fasta" type="data" name="A" label="Left-hand mates" />
+  <param format="fasta" type="data" name="B" label="Right-hand mates" />
+ </inputs>
+
+
+ <outputs>
+    <data format="fasta" name="paired" label="interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
+    <data format="fasta" name="single" label="reads without available pair reads from datasets ${A.hid} and ${B.hid}"/>
+ </outputs>
+
+ <help>
+**What it does**
+ This tool joins paired-end FASTA reads from separate files, one with the left mates and one with the right mates, into a single file.
+ Last character in identifiers is used to distinguish pairs.
+  
+**Note !!!**
+ This tool is meant as a more efficient replacement for the FASTQ interlacer. The Galaxy built-in FASTQ interlacer allows different ordering
+ of sequences in the two files, but this flexibility comes with high memory requirements when large files are used. FASTA interlacer is a simple but an order of magnitude 
+ faster tool which can be used on files where reads are in the same order.


+</help>
+

+</tool>
b
diff -r ff658cf87f16 -r e320ef2d105a fastq_name_affixer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_name_affixer.py Thu Sep 05 09:04:56 2019 -0400
[
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+import sys
+
+from optparse import OptionParser
+
+# Command line: every header line of the fastq file given with -f gets the
+# prefix inserted after its first character and the suffix appended; the
+# result is written to standard output.
+parser = OptionParser()
+parser.add_option("-f", "--fastq", dest="fastq", help="fastq file")
+# default '' so that omitting -p adds nothing instead of failing on None
+parser.add_option("-p", "--prefix", dest="prefix", help="prefix to be added to names", default='')
+parser.add_option("-s", "--suffix", dest="suffix", help="suffix to be added", default='')
+parser.add_option("-n", "--nspace", dest="nspace", help="number of spaces to ignore", default='0')
+options, args = parser.parse_args()
+if options.fastq is None:
+    parser.error("fastq file is required (-f)")
+nspace = int(options.nspace)
+
+with open(options.fastq, "r") as f:
+    j = 0  # position of the current line within its 4-line record (1..4)
+    for oneline in f:
+        j = j % 4 + 1
+        # line 1 is the "@name" header; line 3 repeats the name only when
+        # it holds more than the bare "+"
+        if (oneline[0] == "@" and j == 1) or (oneline[0] == "+" and len(oneline) > 2 and j == 3):
+            # keep the name plus the requested number of space-separated fields
+            header = " ".join(oneline.split()[:1 + nspace])
+            header_out = header[0] + options.prefix + header[1:] + options.suffix + "\n"
+            sys.stdout.write(header_out)
+        else:
+            sys.stdout.write(oneline)
+
b
diff -r ff658cf87f16 -r e320ef2d105a fastq_name_affixer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_name_affixer.xml Thu Sep 05 09:04:56 2019 -0400
b
@@ -0,0 +1,95 @@
+<tool id="names_affixer" name="FASTQ Read name affixer" version="1.0.0">
+<description> Tool appending suffix and prefix to sequences names </description>
+<command interpreter="python">
+${__tool_directory__}/name_affixer.py -f $input -p "$prefix" -s "$suffix" -n $nspace > $output
+</command>
+
+ <inputs>
+  <param format="fastq" type="data" name="input" label="Choose your fastq file" />
+  <param name="prefix" type="text" size="10" value="" label="Prefix" help="Enter prefix which will be added to all sequences names" />
+  <param name="suffix" type="text" size="10" value="" label="Suffix" help="Enter suffix which will be added to all sequences names"/>
+  <param name="nspace" type="integer" size="10" value="0" min="0" max="1000" label="Number of spaces in name to ignore" help="Sequence name is a string before the first space. If you want name to include spaces in name, enter positive integer. All other characters beyond ignored spaces are omitted"/>
+ </inputs>
+
+
+ <outputs>
+  <data format="fastq" name="output" label="fastq dataset ${input.hid} with modified sequence names" />
+ </outputs>
+
+ <help>
+**What it does**

+Tool for appending a prefix and a suffix to sequence names in FASTQ-formatted sequences.
+
+**Example**
+
+The following Solexa-FASTQ file:

+::

+ @CSHL_4_FC042GAMMII_2_1_517_596
+ GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+ +CSHL_4_FC042GAMMII_2_1_517_596
+ 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40

+is renamed to:
+
+::

+ @prefixCSHL_4_FC042GAMMII_2_1_517_596suffix
+ GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+ +prefixCSHL_4_FC042GAMMII_2_1_517_596suffix
+ 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40
+
+different format:

+
+::

+ @HISEQ1:92:c0190acxx:8:1101:1252:2230 2:N:0:CGATGT
+ AGAGGAAAAAACATAGTTCTTGTCTAAAAAAATCCCTTGAAAAAGGGCAGATGTATAGAAATAGAAAATTTCAAAGAAAAACTCTCTACAAATGGAAGAGA
+ +
+ CCCFFFFFHHHHHJJJJIJJJJJJJJJJJJJJJIJJJJJIIJJJJJJGIJIJIHHHHHHHHFFFFFFDEEEEEDCDDDDDDDCCDDDEDDDDD>CCCCB@9
+
+is renamed to:
+
+::

+ @prefixHISEQ1:92:c0190acxx:8:1101:1252:2230suffix
+ AGAGGAAAAAACATAGTTCTTGTCTAAAAAAATCCCTTGAAAAAGGGCAGATGTATAGAAATAGAAAATTTCAAAGAAAAACTCTCTACAAATGGAAGAGA
+ +
+ CCCFFFFFHHHHHJJJJIJJJJJJJJJJJJJJJIJJJJJIIJJJJJJGIJIJIHHHHHHHHFFFFFFDEEEEEDCDDDDDDDCCDDDEDDDDD>CCCCB@9

+note that string after first space is omitted! 
+
+Sequence names sometimes contain spaces which delimit the actual name. By default, anything after the first space is 
+excluded from the sequence name. In the example sequence:

+::
+  
+ @SRR352150.23846180 HWUSI-EAS1786:7:119:15910:19280/1
+ CTGGATTCTATACCTTTGGCAACTACTTCTTGGTTGATCAGGAAATTAACACTAGTAGTTTAGGCAATTTGGAATGGTGCCAAAGATGTATAGAACTTTC
+ +
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIHIIIIIFIIIIIIHDHBBIHFIHIIBHHDDHIFHIHIIIHIHGGDFDEI@EGEGFGFEFB@ECG
+
+when **Number of spaces in name to ignore** is set to 0 (default) the output will be:

+::

+ @prefixSRR352150.23846180suffix
+ CTGGATTCTATACCTTTGGCAACTACTTCTTGGTTGATCAGGAAATTAACACTAGTAGTTTAGGCAATTTGGAATGGTGCCAAAGATGTATAGAACTTTC
+ +
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIHIIIIIFIIIIIIHDHBBIHFIHIIBHHDDHIFHIHIIIHIHGGDFDEI@EGEGFGFEFB@ECG

+If you want to keep spaces the setting **Number of spaces in name to ignore** to 1 will yield 

+:: 

+ @prefixSRR352150.23846180 HWUSI-EAS1786:7:119:15910:19280/1suffix
+ CTGGATTCTATACCTTTGGCAACTACTTCTTGGTTGATCAGGAAATTAACACTAGTAGTTTAGGCAATTTGGAATGGTGCCAAAGATGTATAGAACTTTC
+ +
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIHIIIIIFIIIIIIHDHBBIHFIHIIBHHDDHIFHIHIIIHIHGGDFDEI@EGEGFGFEFB@ECG


+</help>
+</tool>
b
diff -r ff658cf87f16 -r e320ef2d105a pairScan.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pairScan.py Thu Sep 05 09:04:56 2019 -0400
[
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+import sys
+import os
+from optparse import OptionParser
+import Levenshtein
+
+
+class Error(Exception):
+    """Exception type raised by the functions of this module."""
+
+
+def readSingleSeq(file):
+    """Read the next fasta record from an open, seekable file object.
+
+    Returns False at end of file, otherwise a dict with the keys 'name'
+    (header without ">" and surrounding whitespace) and 'sequence' (all
+    sequence lines stripped and joined). Raises Error when the line at the
+    current position is not a header. The file is left positioned at the
+    start of the following header.
+    """
+    header = file.readline()
+    if not header:
+        return False  # end of file
+    if header[0] != ">":
+        raise Error("no header on the first line")
+    pieces = []
+    while True:
+        mark = file.tell()
+        line = file.readline()
+        if not line:
+            break
+        if line[0] == ">":
+            # next record reached: rewind so the header is read again
+            file.seek(mark)
+            break
+        pieces.append(line.strip())
+    return {'name': header[1:].strip(), 'sequence': "".join(pieces)}
+
+
+def writeSingleSeq(fileobject, seq):
+    """Write seq (dict with 'name' and 'sequence') as a two-line fasta record."""
+    record = ">" + seq['name'] + "\n" + seq['sequence'] + "\n"
+    fileobject.write(record)
+
+
+def comparePairs(seq1, seq2, max_mismatch=3, offset=5):
+    '''Score the end overlap between a read and its mate.
+
+    seq1, seq2   -- dicts with a 'sequence' entry (as built by readSingleSeq)
+    max_mismatch -- highest accepted mismatch percentage within an overlap
+    offset       -- number of "-" padding characters added to each read
+
+    The second sequence is lower-cased, reversed and complemented, then every
+    overlap length i is tested by comparing the last i characters of the
+    padded first read with the first i characters of the padded second read.
+    Returns the largest number of identical positions found in an overlap
+    whose mismatch percentage does not exceed max_mismatch (0 if none).
+    '''
+    s1 = seq1['sequence'].lower()
+    s2 = seq2['sequence'].lower()[::-1]
+    m = 0
+    # complement table; together with the reversal above this yields the
+    # reverse complement of the second read
+    intab = "ctagn"
+    outtab = "gatcn"
+    trantab = str.maketrans(intab, outtab)
+    s2 = s2.translate(trantab)
+    # pad the start of read 1 and the end of read 2 with "-"
+    s1 = "-" * offset + s1
+    s2 = s2 + "-" * offset
+    n1 = len(s1)
+    n2 = len(s2)
+    m = 0
+    for i in range(1, min(n1 + 1, n2 + 1)):
+        #remove tails is any:
+        ss1 = s1[n1 - i:n1]  # suffix of read 1, length i
+        ss2 = s2[0:i]  # prefix of reverse-complemented read 2, length i
+        # padding characters inside the compared window
+        added = ss1.count("-") + ss2.count("-")
+        # mismatches with the padding count subtracted
+        d = Levenshtein.hamming(ss1, ss2) - added
+        if 100.0 * d / i <= max_mismatch:
+            # i - d - added equals i minus the Hamming distance
+            m = max(m, i - d - added)
+    return m
+
+
+def split_file(filename, N, min_chunk=2):
+    '''Split a text file round-robin into N files named filename.0 ... .N-1
+
+    Lines are dealt out in groups of min_chunk consecutive lines: the first
+    group goes to filename.0, the next to filename.1 and so on, wrapping
+    around until the input is exhausted.
+
+    Returns the list of created file names.
+    Raises ValueError if N or min_chunk is smaller than 1 (either would
+    otherwise make the loop spin forever without reading anything).
+    '''
+    if N < 1 or min_chunk < 1:
+        raise ValueError("N and min_chunk must be positive")
+    filenames = [filename + "." + str(i) for i in range(N)]
+    outputs = []
+    try:
+        for name in filenames:
+            outputs.append(open(name, 'w'))
+        with open(filename, 'r') as source:
+            while True:
+                for out in outputs:
+                    for _ in range(min_chunk):
+                        line = source.readline()
+                        if not line:
+                            return filenames
+                        out.write(line)
+    finally:
+        # runs on the normal return as well as on errors
+        for out in outputs:
+            out.close()
+
+
+def find_overlapping_sequences(seqfile,
+                               seqfile2=None,
+                               seqfile_good="",
+                               seqfile_bad="",
+                               min_overlap=30,
+                               max_mismatch=2,
+                               offset=5):
+    ''' sort read pairs into overlapping and non-overlapping ones
+
+    seqfile      -- fasta file with the reads; holds interlaced pairs when
+                    seqfile2 is not given
+    seqfile2     -- optional fasta file with the second mates
+    seqfile_good -- output for pairs without overlap
+                    (default: seqfile + ".pass")
+    seqfile_bad  -- output for overlapping pairs (default: seqfile + ".bad")
+    min_overlap  -- pairs scoring above this value are written to seqfile_bad
+    max_mismatch, offset -- passed on to comparePairs
+
+    Nothing is returned; the result is the content of the two output files.
+    Reading stops as soon as either mate is missing.
+    '''
+    # default names - if empty
+    if seqfile_good == "":
+        seqfile_good = seqfile + ".pass"
+    if seqfile_bad == "":
+        seqfile_bad = seqfile + ".bad"
+
+    with open(seqfile_good, 'w') as fgood, \
+            open(seqfile_bad, 'w') as fbad, \
+            open(seqfile, 'r') as f:
+        # interlaced input: both mates come from the same file handle
+        f2 = open(seqfile2) if seqfile2 else f
+        try:
+            while True:
+                seq1 = readSingleSeq(f)
+                seq2 = readSingleSeq(f2)
+                if not seq1 or not seq2:
+                    break  # end of file
+                score = comparePairs(seq1, seq2, max_mismatch, offset=offset)
+                target = fbad if score > min_overlap else fgood
+                writeSingleSeq(target, seq1)
+                writeSingleSeq(target, seq2)
+        finally:
+            # the second handle needs closing only when it is a separate file
+            if f2 is not f:
+                f2.close()
+
+
+def main():
+    """Command-line entry point: parse the options and scan the pairs."""
+    cli = OptionParser()
+    cli.add_option(
+        "-f", "--fasta_file", dest="seqfile",
+        help="input sequences in fasta format")
+    cli.add_option(
+        "-r", "--fasta_file2", dest="seqfile2", default=None,
+        help="input sequences in fasta format, second file should be specified if pairs are not interlaced, all pairs must be complete!")
+    cli.add_option(
+        "-p", "--fasta_file_pass", dest="seqfile_good", default='',
+        help="output file with good sequences")
+    cli.add_option(
+        "-b", "--fasta_file_bad", dest="seqfile_bad", default='',
+        help="output file with bad sequences")
+    cli.add_option(
+        "-o", "--minimal_overlap", dest="min_overlap", default='30',
+        help="minimal overlap between pair ends")
+    cli.add_option(
+        "-m", "--max_mismatch", dest="max_mismatch", default='2',
+        help="maximum number of mismatches in overlap per 100 nt")
+    cli.add_option(
+        "-s", "--offset", dest="offset", default='5',
+        help="maximum offset")
+    options, args = cli.parse_args()
+
+    # numeric options arrive as strings and are converted here
+    overlap = int(options.min_overlap)
+    mismatch = int(options.max_mismatch)
+    shift = int(options.offset)
+    find_overlapping_sequences(seqfile=options.seqfile,
+                               seqfile2=options.seqfile2,
+                               seqfile_good=options.seqfile_good,
+                               seqfile_bad=options.seqfile_bad,
+                               min_overlap=overlap,
+                               max_mismatch=mismatch,
+                               offset=shift)
+
+
+if __name__ == "__main__":
+    main()
b
diff -r ff658cf87f16 -r e320ef2d105a pairScan.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pairScan.xml Thu Sep 05 09:04:56 2019 -0400
[
@@ -0,0 +1,51 @@
+
+<tool id="pairScan" name="Scan paired reads for overlap" version="1.0.0">
+  <description> Scan paired reads for overlap </description>
+  <requirements>
+    <requirement type="package">python-levenshtein</requirement>
+  </requirements>
+  <!-- Dataset paths are single-quoted so that the shell always passes each path as one argument. -->
+  <command interpreter="python">
+    pairScan.py -f '$fasta_input' -o $min_overlap -m $max_mismatch -p '$pass' -b '$bad' -s $offset
+  </command>
+
+  <inputs>
+    <param format="fasta" type="data" name="fasta_input" label="sequences in fasta format" />
+    <param name="min_overlap" type="integer" size="3" value="30" min="20" max="100" label="minimum overlap length [nt] " />
+    <param name="max_mismatch" type="select" label="Maximum number of mismatches per 100 bp">
+      <option value="0">0</option>
+      <option value="1" selected="true">1</option>
+      <option value="2">2</option>
+      <option value="3">3</option>
+      <option value="4">4</option>
+    </param>
+    <param name="offset" type="select" label="Maximum offset">
+      <option value="0">0</option>
+      <option value="1">1</option>
+      <option value="2">2</option>
+      <option value="3">3</option>
+      <option value="4">4</option>
+      <option value="5" selected="true" >5</option>
+      <option value="6">6</option>
+      <option value="7">7</option>
+      <option value="8">8</option>
+      <option value="9">9</option>
+      <option value="10">10</option>
+    </param>
+  </inputs>
+
+
+  <outputs>
+    <!-- "pass" is handed to the script as -p (good sequences), "bad" as -b (bad sequences). -->
+    <data format="fasta" name="pass" label="pairs with no overlap from dataset ${fasta_input.hid}"/>
+    <data format="fasta" name="bad" label="overlapping pairs from dataset ${fasta_input.hid}"/>
+  </outputs>
+
+  <help>
+    **Scan paired reads for overlap**
+
+    The input is a single FASTA dataset with interlaced read pairs; all pairs must be complete.
+
+    Each pair is tested for an overlap between its two reads using the minimum overlap length,
+    the maximum number of mismatches per 100 bp and the maximum offset selected above.
+
+    Two FASTA datasets are produced: one with pairs with no overlap and one with overlapping pairs.
+
+  </help>
+
+  
+</tool>
b
diff -r ff658cf87f16 -r e320ef2d105a renameSequences.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/renameSequences.xml Thu Sep 05 09:04:56 2019 -0400
b
@@ -0,0 +1,25 @@
+<tool id="rename_sequences" name="Rename sequences" version="1.0.0">
+<description> Rename sequences using numerical counter, keep required prefix and pair information </description>
+<!-- Dataset paths are single-quoted so that the shell always passes each path as one argument.
+     index.tmp receives the tab-separated "original name / new name" index written by the script;
+     it is not exposed as a tool output. -->
+<command interpreter="python">
+renameSequences2.py '$input' $paired index.tmp $prefix_length > '$output'
+</command>
+
+ <inputs>
+  <param format="fasta" type="data" name="input" label="Choose your fasta file" />
+  <param name="prefix_length" type="integer" size="10" value="0" label="Prefix length" help="Enter length of prefix to keep in sequences names" />
+  <param name="paired" type="boolean" truevalue="true" falsevalue="false" checked="False" label="All sequence reads are paired" help="Check if you are using paired reads and the input sequences contain both read mates, with left mates alternating with their right mates"/>
+ </inputs>
+
+
+ <outputs>
+  <data format="fasta" name="output" label="renamed sequences from dataset ${input.hid}" />
+ </outputs>
+
+ <help>
+**What it does**
+
+Use this tool to rename your sequences with a numerical counter while keeping the sequence name prefix as part of the name.
+If paired sequences are used, the last character of the new sequence name (f or r) distinguishes the two mates of a pair.
+
+</help>
+</tool>
b
diff -r ff658cf87f16 -r e320ef2d105a renameSequences2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/renameSequences2.py Thu Sep 05 09:04:56 2019 -0400
[
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+### this version does not use st input!!!!
+# how to use:
+# renameSequences.py fasta.file true index.out prefix_length   # for paired sequences
+# renameSequences.py fasta.file false index.out prefix_length   # not paired sequences
+
+import sys
+paired = sys.argv[2] == "true"
+index = open(sys.argv[3], 'w')
+
+if len(sys.argv) == 4:
+    prefix = 0
+else:
+    prefix = int(sys.argv[4])
+
+if paired:
+    P = 2
+    suffix = "f\n"
+else:
+    P = 1
+    suffix = "\n"
+i = j = 0
+reader = open(sys.argv[1], mode='r')
+for oneline in reader:
+    if oneline == "":
+        continue
+    if oneline[0] == ">":
+        i += 1
+        j += 1
+        prefix_string = oneline[1:(1 + prefix)].strip()
+        if j == 1:
+            header = ">" + prefix_string + str(i) + suffix
+            index.write(oneline[1:].strip() + "\t" + prefix_string + str(i) +
+                        suffix)
+        if j == 2:
+            i -= 1
+            header = ">" + prefix_string + str(i) + "r\n"
+            index.write(oneline[1:].strip() + "\t" + prefix_string + str(i) +
+                        "r\n")
+        sys.stdout.write(header)
+        if j == P:
+            j = 0
+    else:
+        sys.stdout.write(oneline)
+index.close()
b
diff -r ff658cf87f16 -r e320ef2d105a sampleFasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sampleFasta.xml Thu Sep 05 09:04:56 2019 -0400
b
@@ -0,0 +1,47 @@
+<tool id="sampler" name="Sequence sampling" version="1.0.0">
+  <description> Tool for creating samples of sequences from larger dataset</description>
+  <requirements>
+    <requirement type="package">seqkit</requirement>
+  </requirements>
+  <stdio>
+    <exit_code range="1:" level="fatal" description="Error" />
+  </stdio>
+  <!-- Paired input is split into two mate files (Afile/Bfile), each file is sampled with the
+       same number and seed, and the two samples are interlaced again into the output.
+       NOTE(review): using one seed for both files presumably keeps mates together; confirm
+       against seqkit behaviour.
+       Dataset paths are single-quoted so that the shell always passes each path as one argument. -->
+  <command>
+    #if str($paired)=="true"
+      ${__tool_directory__}/deinterlacer.py '$input' Afile Bfile
+      &amp;&amp;
+      seqkit sample -2 --number $number --rand-seed $seed -o Asample -w 0 Afile
+      &amp;&amp;
+      seqkit sample -2 --number $number --rand-seed $seed -o Bsample -w 0 Bfile
+      &amp;&amp;
+      ${__tool_directory__}/fasta_interlacer.py -a Asample -b Bsample -p '$output' -x tmpfile
+    #else
+      seqkit sample -2 --number $number --rand-seed $seed -o '$output' -w 0 '$input'
+    #end if
+  </command>
+
+  <inputs>
+    <param format="fasta" type="data" name="input" label="Choose your fasta file" />
+    <param name="number" type="integer" size="7" value="500000" min="1" label="number of sequences or sequence pairs"/>
+    <param name="seed" type="integer" size="10" value="10" min="0" label="random number generator seed " />
+    <param name="paired" type="boolean" truevalue="true" falsevalue="false" checked="False" label="All sequence reads are paired" help="Check if you are using paired reads and the input sequences contain both read mates, with left mates alternating with their right mates"/>
+  </inputs>
+
+
+  <outputs>
+    <data format="fasta" name="output" label="Random selection from dataset ${input.hid}, sample size ${number}" />
+  </outputs>
+
+  <help>
+    **What it does**
+    
+    This tool creates a sample of sequences by taking a 'random' sample from a larger data set.
+    Using the same seed parameter makes the sampling reproducible.
+
+    
+  </help>
+
+  
+</tool>
b
diff -r ff658cf87f16 -r e320ef2d105a test_data/VTS_contigs.info.minRD5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test_data/VTS_contigs.info.minRD5 Thu Sep 05 09:04:56 2019 -0400
b
b'@@ -0,0 +1,428103 @@\n+>CL41Contig243 (655-2038.9-1335472)\n+ATTGCCAGCTTACACCTCAGAGAGCCGCAAAACTTATTCCCCCAGTGGAGTTGCCGGCTC\n+ATTACCCACAAAATTCAGGTCATGTTCACAACATGTTGCACTCACATGTTCATCATATAT\n+AAAGCACTTTCATTGTGGTATCTTAGGGGCTACAAATTGTGCATTCTGTTGTATTTAAGT\n+CTACCTTCATCCCTCCGGATCGAAGAAGCCTTAAATAGGGGCATCTGTCATACCCAATTT\n+TTTGACCCCCCTAAGATCCCACATGCATCATATCAAGCATTGCATTCACATCAACCATAA\n+GATCATACTTATGTTGATAGTTGATTCTCTCTCCTAAGGATTGGCCTCCTCTGAGGACAC\n+TTAGGCATTTGAGACTCCATGATCATCACTAACCACTAATCATATCCATCATACAAAGAT\n+ATGCTTGTGCTTTGTTCATCTCACCCATTGCAGGTAAAGTTGGCTCAAAGGATCATCAAA\n+GCATCCCCAACTAGGGTTTAGTCCCTCAGATGAATTTCTTCCTCAAAATGGTGAAGCATC\n+AATGCTAATCAACCACCATGGACCACAAGTCACTCTATGCATCAAGGACAACCTTCACAA\n+CATAGCTTTTGACCTACAATGGCTCAGAAGTTTTGAAGCTTGCTCCCCAAGGCAT\n+>CL64Contig342 (450-1716.7-772504)\n+TTATGATATGGACCTCATTCTTTCACTCAAGTAATCTTCAATGAGTCCTCTACAAATGTT\n+ATGTTGAGAGCAAAGATCAATTATGGATGGAAGCATTTCAATTTCATCAAGACATTACAG\n+GTCATTTTGTCTGTCTATAGGAGCCTAAGTTCACAAGATCATATCTCTCTCAATTTTCAT\n+GATATGGAGGTGATTCCTTTTCTCAACATCTTGTATTAACCTCCTCTTCAATTCATCTTC\n+ACAAAGTTTGGGCCAAAAATGCATAGAAAAGGTAGCTATCCATGGGAGCTAGGCTTATTT\n+CTTAACTTAGTGAAATTTTCAAGGCATAAAAACCCTAAAATCTGCCAACTTCATGGCCAA\n+ATTCCACCAAGATGCAAGGTGAATCAAGTCAAGTCATCAACTACAAAGTTGAAGAGCATC\n+CCCTCACCTTTCTAAAAAGTCCAAGATCAC\n+>CL56Contig206 (406-1569.1-637072)\n+TCAACCACCATGGACCACAAGTCACTCTATGCATTAAGGACAATCTTCACAACATAGCTT\n+GTTGACCTACAATGGCTCAAAAGCTTGAAGCTTGCTCCTCAAGGCATCATCAGCTAGGGT\n+TTGATCCTCTGATGAATTGGATCATCAGTCTAAGGACTTCCAACACAAGACAATCATAAA\n+TGGTCTTAGGACTCACTTGTCTTAAGCATTAACACCATTTCATGACCATTCAAGTCATTC\n+TCAGCTCAAGGGTTAGGAAAACCCAGTCATCAAGAGATCTCCTAAGTCCTCAGCATCACC\n+TGAGACTTGGTCCACAAGCAATCTCAATGACTACATCAAATTATCCAAGGATTCCTTCAT\n+GGATCATTCCATTGACTTCATGTCATCTCATTCAATTGAGAAAAGT\n+>CL81Contig246 
(411-1562.1-642013)\n+TCCGTAGTACTCTACGGGAGCCCGTTAGTTTTTATTCTGTCTTTTACTGCTTTAACTGTT\n+TGTGTGTTTGCCGTTCATGGGCAAAGTTTCTCCTTTGTGTTTTTTATATGGGTGAGGACC\n+TAAAGATGTGTGGATCTTATCATGCTCGCAGAGATCTCTGGATCTCATGCCTACATATCC\n+CTAGAGGAAGTCAGAGCATCTGTAGTTCGGGGACTACGGGTATCTAGGGTTTGTTTTGAT\n+GGTGATTATGCTCGCAGAGGATAGACCTCGTGCCTACGTATTCTCAAAAGGAAGTTGAGA\n+AATCAGAGCAAACGTAGTTCCCACTTATATGCCAGTGGAGGCAGAGTTGTGTTTTTGTTT\n+TTGAAGAGACGAAGACAAGTCGTCATTTTAATAAAATATAAATCACCTACT\n+>CL85Contig254 (680-1497.5-1018285)\n+TATCGATTGGATCGCCTCTATACACGGCAAGGATCCCACTTAAGGATCCAGGGAAGGCAT\n+CTTCTCGATAGGCGACAAGGATCCGACATAAGGATTCAATGAATGGATTTCCTCTTTAGA\n+CGGCAAGGATCCAGCGAATGGCTCGCATTTATAAGCAGCAAGGATCCCATTAAAGGATCT\n+ATCGAATGGATCGCCTCTATAGGCGGCAAGGATCCCACTTAAGGATCCAGGGAATGGATC\n+TTCTCGATAGGCGACAAGGATCCCACATAAGGATTCAATGAATGGATTTCCTCTTTAGAC\n+GGCAAGGATCCCACTTAAGGATCCAGGGAATGGATCTTCTCGATAGGCGACAAGGATCCG\n+ACATAAGGATTCAATGAATGGATTTCCTCTTTAGACGGCAAGGATCCAGCGAATGGCTCG\n+CATTTATAAGCAGCAAGGATCCCATTAAAGGATCTATCGAATGGATCGCCTCTATAGGCG\n+GCAAGGATCCCACTTAAGGATCCAGGGAATGGATCTTCTCGATACGCGACAAGGATCCGA\n+CATAAGGATTCAATGAATGGATTTCCTCTTTAGTCGGCAAGGATCTAGCGAATCGATTAC\n+ATTTGTAAGCCACAAGGATCCCACTAAAGAATACAGGGAATGGATCGCCTCAATAGGCGA\n+CAAGGATCCCACTTAAGGAT\n+>CL136Contig14 
(845-1299.1-1097761)\n+TACATACACAATTAAAGCGTAGTACTTAACAGCTGCGATCATACCAGCACTAATGTACTG\n+GATCCCATCAGAACTCCGCAGTTAAGCGTGCTTGGGCGAGAGTAGTACTAGGATGGGTGA\n+CCTCCTGGGAAGTCCTTGTGTTGTTATTTTCGTTAATCGGCGTGAAATGTTATCGCTAGT\n+AAGGACTGATTTTTTTTATATTTTTGTTAATCGGCGTGAAATATTATCGGTAGTGAGGTT\n+AATCGGCGTGAAATATTATCGGTAGTGAGGATAAAACAAAAATAGTTGCGTTAATTTTAT\n+TGAGTGGTGTGATAAATGTCGTAAGTAAGGAGAGAAGATATATACGGTTCGTTTTGTTTA\n+CCGGTAAAAGAAAATGGAGTTAGTAAGGAGAAAAGGAATATACATACACAATTAAAGCGT\n+AGTACTTATCAGGTGCGATCATACCAGCACTAATGCACCGGATCCCATCAGAACTCCGCA\n+GTTAAGCGTGCTTGGGCGAGAGTAGTACTAGGATGGGTGACCTCCTGGGAAGTCCTTGTG\n+TTGCACCTCTTTTTTTGTTTTTGCAAGGAATTCTTTTTATTTTCGTTAATCGGCGTGAAA\n+TGTTATCGCTAGTAAGGACTGATTTTTTTATATTTTTGTTAATCGGCGTGAAATATTATC\n+GGTAGTGAGGTTAATCGGCGTGAAATATTATCGGTAGTGAGGTTAATCGGCGTGAAATAT\n+TATCGGTAGTGAGGATAAAACAAAAATAGTTGCGTTAATTTTATTGAGTGGTAGATAAAT\n+GTCGTAAGTAAGGACAGAAGATATATACGTTTCGTTTTGTTTACCGGTAAAAGAAAATGG\n+AGTTA\n+>CL33Contig1110 (580-1253.5-727047)\n+TGCTGGTTTGCTTACTCAAAGGTATGTACAAAGGTTAAAGGATAAAAAAGGGTAAGTATT\n+TTATTTACAATTTTGATTCGATAGAGTTTAAAGACTCTATGCCTACGTACTCCCTCGTGC\n+AATGGGAAAGTCAGAATCACGTAGTTCTGC'..b'TT\n+AACTGGGAGAAAGTTTGAATTGATTTGATTTGAAAAGATATTTTTGAATGTGGTGTGATA\n+GAAGAAAATATTTTATAGAGATGAATCATGTCTAAATCTCTAAGTGTTTTTGATGATTTT\n+TTGAATGGGTGTTGAGGTGCTTTTAAGAAGCCTGAGGGTCCTTGTATGAA\n+>CL11Contig153 (258-5.0-1283)\n+TCAATAACTATAAATTGAGTAAACTCTTTAGTTTCATTGAGACAAAACCTTTTGGAAAAG\n+GTTCTGACTTTTGGATGCTTCTGATTTATGAGAACTTCACTGTTCAAAAGTCTCACACTT\n+CAGAAACGCTTAAGCCTCAGAATCTCATTTTATCGTTCTGAAAGATAAAAGTTATGAAAT\n+TCTCATTTCTTTCATTATGGACATAATTCTATGTTCAAATGTTTGAGAATGAAAACAAAT\n+CTATCTTCAGCTATGGAT\n+>CL207Contig19 (159-5.0-795)\n+CAAGTTATCGTGATTTCTTAAATAATTTTTGATCAATTCATCCTTAATCCATATTTCCTT\n+GAAGAATAGGTCATGACAAATGCCTTTGGCCCAAAAATTCCAATTTTGTCCCTTTTGAAT\n+CAGATGAAATTTCATCAATCAAAACCCTAATTTGACCGG\n+>CL33Contig936 (112-5.0-558)\n+GATAGAAAAAAGGGTTAGGGATCACTTGACGGAGTCAAGGTTTATTATGAAAGCATTTGA\n+AGGGTTAAGGTAAAGGTTAAGGATATATGTACAAGGATAGGGTTGAGATGTT\n+>CL25Contig1626 
(159-5.0-797)\n+CATTCCACCCGCCAAGGGGATTTCGCCTCCTTCAATGAGGTCTTAATCCACTATAACCAA\n+ACTGATTACAAACACAAAGACCTACTGTCAATGTCTTCTTGAGAATCTGACTATACCCTA\n+GTCTCTCAAGGAATATCAACTCACAAGTTAAGATACAAA\n+>CL175Contig200 (149-5.0-741)\n+TGATTGTGTTTGTGAAGTTTAAAATGGGAGTTTTGGGTAAAAAGGGTTGAAAACTCGTAT\n+TTGGAAAAATGGTGATTTTTGGGTTCTGAACTGGTGGGAACCGGTTACCACCCTGGAGGG\n+AACCGGTTAGCACAAGAGACAGTAGCGTG\n+>CL86Contig1868 (186-5.0-924)\n+TGGATTATTGACACGGGTAACTGATCCTTGTATCATGCTTCAACTTTCTTATGGATTTCA\n+GATTCTCCATGGGACTCGTGCCCAATTATAAGAATTATCGCTCCCTATGGGACTCAAACC\n+CATTTCGAGCCTTCTTCCTCGTATGGGACTCTAACCCACTTTGACGTCCACCGCTCTATG\n+GGACTC\n+>CL62Contig2990 (143-5.0-719)\n+TGAAGAGAAGCACAAAGGCTCAGATTTTGAGTTGTCAAAATCTAAAGTTCCAGACTCTGA\n+AGGTTCATCAGACTCTAAAGTCTATGCAACTTCTGAAGGAGGCTCAGACTCTGAAGAGTG\n+AGACTTTATCTCAACCTCTGAAG\n+>CL6320Contig1 (222-5.0-1102)\n+TCCTTTCTGAACAAATAATATAACTGAACATGGATTAAAAAAAATCTGCAACTCATAGCT\n+AGTACATTCCGAGTTAACATATCCGACTATCATGTTTAAATTACTAACTTTTACTGTTAA\n+AACTAGTCTAACTTAAAATTGGCACAAGAGGATTGACATTAACTTAGCTTAATTTGCTAC\n+AACCAATTACTAATACTCTAACTGTTTTGTAACTAACACTAA\n+>CL9Contig1287 (136-5.0-675)\n+TAACTCTTTGGTACGTTACCTCGGTGTTCTTTAAACCAAAAGGCATAACCTTGTAGCAAA\n+AAGTTCCCCATGGGGTAATAAATGTGGTCTTTTCCATATCTTCTGGCGCCATCCTGATTT\n+GATTGTATCCCAAAAA\n+>CL71Contig2784 (156-5.0-780)\n+GTTTTTTCTAGTTCGGGTATGTTTTCGGATATACATATCTGAATTAACCTCACGTATTTA\n+AAAAAAATCAATCAGGGATAGTTTCGGATATTAATCTCCGAAGCTACCCTTTTTTGTAAA\n+AAAAGTTTTGGTTTATTCGGACATGAATATCCGAAA\n+>CL76Contig1338 (133-5.0-663)\n+GTTTTAACCTCTTAGATGTATGGCTAGGTTCTCACTCTTAGATGTATGCTTAAGATATTT\n+TATTTCCATGTCCATCTGATGATTATTTCTCTTGATTATGATTATCATGTGAATGATGCT\n+TGTATGTATGAAT\n+>CL4207Contig1 
(348-5.0-1724)\n+ATTGGGCATGGGTAATATAAGATTGTAAGTAAATGGTAGGGCACACATCATCACTAGAGT\n+TTTCTTTGTGCCTGAATTGAAGAATAATTTACTAAGCATTGGCCAACTGCAAGCCAAAAA\n+CCTAACTATCATGTTTCAAGGTGGAAAATGCAAAGTATTTCATCCTGAAAAAGGTCTCCT\n+CATGGAAACAGATATGGCTGCCAATCGAATGTTCATTCTGAATGTTGTGGCTCAACCTAA\n+GTCTTCCACGTGTTTTAACACTGTTGCTGAGGATGAATGGTATCTTTGGCATTGTCGGTA\n+CGGACATTTAGGTTTCAAGGGACTCCAAACTCTTGTTCAGAAAAAGAT\n+>CL1Contig321 (162-5.0-806)\n+CCATGTACTTGTATACTACAGAAACTTGTTTTTCTGCACATAGTGCGATCCTGCACTCCA\n+ATTTTCCATCCCACGCATCCATAATCAATCCAAAACCTTACATACATGTTATATATACCT\n+CTATAACATCTAATTGCATCAAAACAACACCTTAATTCATGG\n+>CL44Contig2458 (116-5.0-583)\n+TCGAAATTGTTGCCATTTTCTTTAAACGATAAAAAACAAAACGAAAATGTAACAAAACGC\n+TTTTATTAATGATGATATTCATTTAAAACAAAAGGGGCCCTACATATGATCGTTTA\n+>CL76Contig880 (167-5.0-838)\n+CTAAACACTTGTCATCTTTCTAAACCCGATCCTTTTTTCAAGATCTTCGCCCCATTTAAA\n+CATTTTCAAGGTATCAGTCGGTCTTTTTATCCAATTTTAGTTAGAGTCTTTGATTTTCTT\n+AAACAAATGAGTCTTAATCAATCTTAAATAAGTGAATTGATCGTCAC\n+>CL67Contig523 (262-5.0-1308)\n+ATCATTGCTAGTGTTATGTTTAAGTCTTTTCCCATTTACAATAAAAGGACCCGACGATTG\n+GTTTTGAATCTCAACAGCACCAGATTGCATAACTTTTGTAACTCTGAATGGCCCAGTCCA\n+TCTAGAGCGTAACTTACCAGGGAAAAGGCGCAATCTGGAGTTGAAAAGGAGCACTAAGTC\n+GCCTGCAGTAAAGTCTTTCTTCGTAATTTGCCTATCATGCCATTTCTCAGTGCGTTCCTT\n+ATAGATTACAGTGTTTTCGTAG\n+>CL60Contig1946 (146-5.0-733)\n+AAGTCCAGAAGATTCCCAAAGAAGTCTTCTGGGATCAGAATTCAGGAACCATCTTCTAAA\n+GTTCCTTCAGGTATGCCACCTATACCTTATGTTACTTTTGTTTCTATTTCAGTTTATATG\n+CCAACATCTGAACCAACTCCCCCAAT\n+>CL32Contig2720 (132-5.0-664)\n+ATTATTTGTTGGTCGTTTGTTTTCAAAAAAACATTTCCGTGGTTGATTCCGGATCTGTTA\n+TTCCTCAGTGTTTCCGTTGATGATTCCAGATTCACTTGTTTCCAGCTTATAGGATATTTC\n+TGTTGATGATTC\n'
b
diff -r ff658cf87f16 -r e320ef2d105a test_data/seq_C_10k
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test_data/seq_C_10k Thu Sep 05 09:04:56 2019 -0400
b
b'@@ -0,0 +1,30000 @@\n+>2000001\n+GTTTATCTGATTTGGCATGAGTATTTCATGGTGTACAATGCATTTGATGTATCTGAGAAT\n+TCTCTTAGCTACAAGTATAAATTGTATGATCTTGGTTCCT\n+>2000002\n+TGATCTACCAAATTATCTAATGAGGCTTTATAGAAGCTAGCATATAGGAAGAAGATATTG\n+TTTCCTGAGAGATCTAGTTGGCAAGATCGGAAGAGCACAC\n+>2000003\n+GTTCAATATGGTGAAGTATTGATATTTCTTTTCATCAATACCGTGAGGTATTGATCTTTC\n+TTTTTCTTCAAAACCGTGAAGTATTGATCTTCCCTAGATC\n+>2000004\n+TGCAACACTAGATCTTTTACTTGTCATATTGCGGGCTTATTAAGCAGATAGATGGAGGAA\n+CCAAGATCATACAATTTATAGTTTGAGCTAAAAGATTCCT\n+>2000005\n+CTTTTTGGTTTATTACGGTTTTGGATTTTGCTTACACGGGATTGACCTTTCAACGACTCA\n+TGTTATGTAATGTAGTTCTTTGTTCATCGAGTACTGGACC\n+>2000006\n+GAATATTGGAAGACTGAGCTACAACTTGCAGATTTTCTAACCAAGCCATCAAAGAAAACA\n+AGACTTGAGGGTTTTAAAATACTTATGTAAATGAGAAGAC\n+>2000007\n+TCCCGGTAGATTGAAAACCCGAAAGGGCAATCTAGGAAAAAGTTAGGGATTCATGGCAAG\n+TAACTGCATCATAAGACTTGATCGCCCGCAACGTCGAGAT\n+>2000008\n+TAAAGAGAACTACGTTACATAACATGAGCAGTTGAAAGGTCAATCCCAAGTAAGCAAAAT\n+CAAAGCCATAATAAGCCAAAGAGAAAAGAAAGGAAGATCG\n+>2000009\n+GACAAAGACTCAACCATAAAGAAGAAGTTGCAGAGAAAAGAAGTGCTTTACATACTATGA\n+GTAGTTGAAACGTACCTTATCTGTAGACCAAATAGCTAGT\n+>2000010\n+TATAAAATCATAAAAACGTAACGATTTGTTATCTCGTTGTCATTCTGACTTCCTCTCAAA\n+CTCTGAATTGGAATAGCTAGATGCAAGATCGGAAGAGCAC\n+>2000011\n+TGCCAAATAAGAAAAACTCAAGAAAGGGGAGGGAAATGCATATGTCTATTCTAATTCAGA\n+TATAAAAGAAAGACAACATGACATGTTGCAGCCTCTAGAT\n+>2000012\n+CTTTCTATCGAAATCTTTACTTCATTGCTTTATCGCTTTTCTTAACCTTTTTTATTGTCA\n+AACTGTTCATAGATCGGAAGAGCACACGTCTGAACTCCAG\n+>2000013\n+GGATGGGTAGTCGATTTATCCGCCAATTGCAATGTCATTTTTGTTGCTTTCATATCAATG\n+CTCCCTAATCTTTTCACAATCGACAATAGAATCAAGTTTA\n+>2000014\n+GACTCGTAGATGGAACCTTGTACAAGCAGATCATCAATTCACTAAAGTACATATGCAACA\n+CGAGACATTATATTTGTCATAGTGTGGTCTTAGTAAGCAG\n+>2000015\n+GTACTAGTATGATATCATGCTCGTTATTCTTGATGAAAAGAGTGTTATCAACCTGTCCTC\n+CATGTAACATTTTTTCAGGCGGGAATTTAATTAAAGTGGC\n+>2000016\n+GAGTTTTTGAAAGGATCAATCCTTATCTACCAAAGACTCAACCATAAAGAAGAAGTTGCA\n+GAGAAAAGAAGTGCTTTACATACTATGAGTAGTTGAAGCG\n+>2000017\n+GAGCCCAATTCCTATATTCCGATGAAGCTCTTCATTAGAGCTTTAATTGGAAGAGCTAAG\n+ACTCATCCTCTCTCTCAACACAGATAAAAA
ACCACACATT\n+>2000018\n+CTCTTTTTAACCTTTGTAACACATTCACGTCATTCCAATAAACATTATTTTTACCTTGAT\n+TTCAGTCTAAGACATTATAAATTCCATGGAGTCTGTAAAA\n+>2000019\n+ACTAAATGAAGAAAAGAAACTACATTACACAACATGAGTAGTTGAAAGGGCCAACCCTTG\n+TTAATTAAAGACTTAAGCCTAGTAAACCAAAAACAGAGAA\n+>2000020\n+CCATTCATCAAGCTACGGTACATACTTTGACTTAGCTTATAGGTAAATTCATCAAGGTAA\n+CGTAGATATTTTGACTTAGTGTAAGGTCGAATCCTCAACG\n+>2000021\n+CGTCAAAAAATGTACATTTCAACTACTCAAACTATGTAATGTACTTTCAATCCTCTTTAT\n+TTGTTTTCTTATTGTTAAGTCTTTGATCAACATCGGTCAG\n+>2000022\n+GTGATTAGGGATTTCGTCGACAAGGAAGAGACGTTTCAACTAATAAAATTATGTAAGTCA\n+CTTATTTTCTCTTCATTTGCTTCTTTAAGTTTGAGTCTCT\n+>2000023\n+TACTTAGAGTTAAGAAAATTCTCACATACATCAAATGCACTCTAGACCCTGAAATACTCA\n+TGCCAAATCTGATAAACTCAAGAAGGGAAGCAAATGCATC\n+>2000024\n+CCATGACATGATGCACAGGGAAATTGAAGTGTATGTTGATGACATGATTGCCAAATCTCG\n+ATCTGAAGAGGGCCATCTTGATGATTTGTTAAAACTGTTT\n+>2000025\n+CTGCTTGTACAAGGTTCCATCTACCCTTCTATCAATTGAGTCCTTATTCAGTTCGATTGA\n+TGTTTTAATGGAATTTATAACGTCTTAGAATGAAATCGAG\n+>2000026\n+GCCATCAAAGAAAACAAGACTTGAGGGTTTTAAAATACTTATGTAAATGAGAAGACTGGA\n+TGATCCGAATTAAAATGAGTATTGGCAATGTAATTCAATT\n+>2000027\n+AGGACATTTTGCAGGGCTTCCCGGTGGGCTTCTGAGTGCAGTAGCAAAGAGAGTATTGAT\n+ATTCTAGAAGGTGTTTGTAACAGTTGATAGATCGGAAGAG\n+>2000028\n+GTGACTTTGTGGCTTTATTTCACAAATTGAGCCAACGGAAATTAGTGAAGCTATAGTCGA\n+CAAACATTGGTATCTTGCAAGATCGGAAGAGCACACGTCT\n+>2000029\n+CTTCATTTTCTTTCTAATAATTATTAGGTCTTTCTTCATCAACATATGGATCTTTCAACT\n+ACTCAAAGTATGTAATGTACTCTCTTTTATTAGATCGGAA\n+>2000030\n+GAGATGGTGAATTGACTGGTTATCTTGGGTGACAACAATTCACAATTTGGATTCGGCCGA\n+GTGAAAGCTTTGAAGAACGGTGATTTCTTGCAAAGGTGTA\n+>2000031\n+AACTAGAGGAATGGCCTTGATAAATTTACCTTTACGTTATGTTAAAGTACATATTCTACA\n+TTGATGAAGATCGGAAGAGCACACGTCTGAACTCCAGTCA\n+>2000032\n+GGTTTCTTTTATCTCAAAGTATAAATTGTATGATCTTGGTTCCTCCGTCTATCTGCTTAC\n+TAAGCCCGCATTATGATAAATAAAAGGTCTAGATCGGAAG\n+>2000033\n+GGATAGGTCCATCCATTCTAAACCTAATCCTAAACTATAGGAAACCAAATGAAAAGAGAG\n+GAAATATTACACATGAAATGCGTAGTTGAAAGGATCAATC\n+>2000034\n+GCTCATGGATCAGTTTGAACAGACTCAGAAGATGCTTAGGGACGAAGTAAATGTCATGTT\n+TGGTAAACTTGTGGAAGCTCTCTCGAGATCGGAAGAGCAC\
n+>2000035\n+CCTAAAAGACGAAAGAAAGTACGTTACAAACTTGGAGTAGTTAAAAGGTTCATTCCTCGT\n+TGAACAAAGACCTAGTAATTAAAAAACAAA'..b'TAACCCAA\n+>2009966\n+GTCATTTTGACTTTCTTTTAAATCTGAATGAGAATAGACATATGCATTTGCCTCCCTTCT\n+TGAGTTTTTCTTATTTGGCATGAGTATTTCAAGGTCTAGA\n+>2009967\n+TCGGGGTTTTACAGGGAGCATCTCAATTTCCGGAGTCAATCCCAAATCACAAGGAGCATT\n+ACCTTTTTGTTGACAATGTGGAAGACATCGTTCGATTGGA\n+>2009968\n+AGACTTAAGCCTAGTAAACCAAAAACAGAGAAAAATGTACATTGCATACTATGAGTAGTT\n+GAAATGTATGACCCTTGTTGACTAAAGACTTAACCATAAG\n+>2009969\n+GGTTGATCTTTTGTTAGTTTTCATTCTTTTGGGAGATCTGCGTTTTCCTGTGGAGAGTAT\n+GGGATTTGTTGCTGAACTGATTCTTCTACTATGACTAGAT\n+>2009970\n+AGTCACTTATTTTCTCTTCATTTTCTTCAAGTTTGAGTCTCTGGTTGATAAGAACTGATC\n+CTTTCAACTACTCATAGTATGTGAAATATTTCCTTTCTTT\n+>2009971\n+TCTTACGATTAAGTCTTTGAGAAACAAGGGGTTCTCTTTTTAACCACTAATGTATTTTAT\n+GTACTCTTCATTTGATTTATATTACTTGGTCTTTGATCAT\n+>2009972\n+GGTGAAGTTCTGAAGATCAAGTTCTTAGTTGAAAACAAATCAATAATTTTTAGAAGCAAG\n+CATATAGAAAGAAGATATTTTTTCCTGGGAGATCAAGTTG\n+>2009973\n+GTTGACAACAAATCAAAAATTGATCTACCAAATTATCTAATGAGGCATGTTAGAAGCAAG\n+CATATAGAAAGAAGATATTGTTTCCTAAGAGATCAAGTTG\n+>2009974\n+AAGGATGTAATGTACCTTGATGAATGGACCTTTACACTAAGTCAAAGTATGTACCGTACC\n+TTGAAGAATGCACGTGTACACTATGTCAAAGTATGTGCCT\n+>2009975\n+GCAACCCAAGCAAACTGTGACCACTTAACCATTTAAAAATTTAGTAAGTTGACTAAATAT\n+TTGGCTATATATATACTCTTTTAAACCTTTGTAACACACA\n+>2009976\n+GAAAAACTCATGAAAGGGGAGGGAAATGCATACGTCTATTCTAATTCAGATTTAAAAGAA\n+AGACAAAATGACAAGATCGGAAGAGCACACGTCTGAACTC\n+>2009977\n+GGTGATTGGAGCTTATACCATACAGAGAGGCCCATTGAAAGTTTATTGAAAACATATAGA\n+AAAGATATTTATAAAAATGGTTGGGTCTTACACTCTATAT\n+>2009978\n+GACGTTTCAACTAATAAAATTATGTAAGTCACTTATTTTCTCTTCATTTGCTTCTTTAAG\n+TTTGAGTCTCTGGTTGATAAGAACTGATCCTTTCAACTAC\n+>2009979\n+GTCATAGTAGAAGAATCAGTTCAGGATCAAATCCCATACACTCCACAGGAAAACGCAGAT\n+CTCCCAAAAGAATGAAAACTAACAAAAGATCATCCGATAG\n+>2009980\n+GCGGTTTTAGCGCAAAGGTTTTGAAAAGGTGGTAAAAGCAAGCAAACTAGCCTAAACTAA\n+TGCAAGAAATAAATTGGTCTCATTGTAAGGTAGCCCAAGA\n+>2009981\n+GGAAGCAAATTGAAAAGAAAAACAAAGCGGGAAATTTACTTCTGCCAAGGAACTTAGGAA\n+GCAAATTGAAAAGAAAAACAAAGCGGGAAATTTACTTCTC\n+>20099
82\n+GAGGTTTCTTTTATCGCAAAGTATAAATTGTATGATCTTGGTTCCTCCGTCTATCTGCTT\n+ACTAAGCCCGCCTTATGATAAATAAAAGGTCTAAGATCGG\n+>2009983\n+TCTAATTCCTAGGTCATAGTTCATCAAGGAATGGACCTTTCAACTAATCAAAGTATGTAA\n+TGTACTTTATTTCCTCTTTATTTGGTTTCTTATGGTTAAG\n+>2009984\n+GACTTGAGGGTTTTAAAATACTTATGTAAATGAGAAGACTGGATGATCCGAATTAAAATG\n+AGTATTGGCAATGTAATTCAAAGATCGGAAGAGCACACGT\n+>2009985\n+CATAAAATACATTAGTGGTTAAAAAGAGAACCCCTTGTTTCTCAAAGACTTAATCGTAAG\n+AAACCAAATAAACAAATAAGAAGTACTTAGATCGGAAGAG\n+>2009986\n+GTGTTGCAAGTATTATTGCAAGTCTAATACTTGCAACACGACTTGCAACACAACTAACTG\n+CTAATATAGTTGAATTATAGTGCCAATACTCATTTTATTT\n+>2009987\n+ATGAAAAAAAAGAAACTACATTACACAACATGAGTAGTTGAAAGGGCCAACCCTTGTTAA\n+TTAAAGACTTAAGCCTAGTAAACCAAAAACAGAGAAAAAT\n+>2009988\n+AGTTCCATTCATCAATGTAGAATATGTACTTTAACATAACGTAAAGGTCCATTTATCAAG\n+GCCATTCCTCTAGTAAGACCACATACATCAACTTAGTGAA\n+>2009989\n+CCCTCAAGTCTTGTTTTCTTTGATGGCTTGGTTAGAAAATCTGCAAGTTGTAGCTCAGTC\n+TTCCAATATTCAATCTTCAAACTTCTTATTTGCCAACTTG\n+>2009990\n+GGGGTTTTTTTTTACTAGAAGTAATATGCAAAAGATATCTTGAAGAAGCTCAAGAAGAGT\n+AAATTGCAACCCTGCAATTACTCCATTGAAACGTGAATCG\n+>2009991\n+ACAACTCATTGTATGTAAAGCACTGAATTTCTCTTCAATTTCTTCTTCATGGTTGAGTCT\n+TTGGTCGAAGATCGGAAGAGCACACGTCTGAACTCCAGTC\n+>2009992\n+GAGAAGGTGACCTGGTACTAAAAAACATTAATCTTTCCTCACAGACTCTAGGGGCAAATG\n+GACGCCTAATTATGATGGGCCATACGTCGTCAAGAAAGCC\n+>2009993\n+GTCTTAGAATGAAATCAAGTTAAAAATAATGTTTATTGGAATGACGTGAATGTGTGTTAC\n+AAAGGTTTAAAAGAGTATATATATATAGCCCAATATTTAG\n+>2009994\n+GGTTGAGTCTGTGGTCGATAAGAATTGATCATTTCAACTACTCATAGTATGTCTTAGTTT\n+TCCTTTCTTTTTATTCGGTTTATTAGTTTAATATGGTTTT\n+>2009995\n+ATACATTACATATCATTAGTTTAAAGGTAAGAAGTACTTTACAGGTAAGAAGAAGTACGT\n+TACATAATATGAGTAGTTTAAAGGTCCATTCTTTGTTGAC\n+>2009996\n+CTTTATTTGGATTTTAATGCCTACGTCTTTGGCCAAAGACGCCTGGACTTTTCAACTAAT\n+CATATTATGGAAACTACTTCGTACCTATTTATCTATTCAC\n+>2009997\n+GATCCACTAACCAAATGTTTGCACATAAGACTTTTCATGGGTATGTTGGTCACACGGGTC\n+TTGGTTTTGAGATTGCCTTTGATTCGAACGAGTATTTTCC\n+>2009998\n+CCCTAGCGGATTTTCTCTCTCGACATATTACATTCCTTAGCAAAATTTCTTTGCATTCAA\n+GGGAGAATCTCAGTTCCCAATGGACTAAATTCCCCAGTGA\n+>2009999\n+TCACCA
ATATTCAACTCTTCAAATAACAATTCAATTCACAAGGCTTGGTAGTTTAAATAA\n+GCGGCCGCAACATATTCTACTTCACATGATGGATGATAAG\n+>2010000\n+TGGTTGAGTCTGTGGTCGATAAGAATTGATCATTTCAACTACTCATAGTATGTCTTAGTT\n+TTCCTTTCTTTTTATTCGGTTTAGATCGGAAGAGCACACG\n'
b
diff -r ff658cf87f16 -r e320ef2d105a test_data/seq_I_10k
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test_data/seq_I_10k Thu Sep 05 09:04:56 2019 -0400
b
b'@@ -0,0 +1,30000 @@\n+>2000001\n+TCTTTCCATTCAAAGCACAAGTACATTTGAATCAGGGTTTATACTACAAAACGTCCAGGT\n+ACATAAGCTTAATCTCCAAGTTCCAATTCATAACTTTTCC\n+>2000002\n+GGTCAATACTCCCTCTCTGCATGTATTCACAGAAATCCATTTACCTGAGGCTGAATGGTG\n+TCAAGATAGGTTTGACCAGCTTGATTCGGTCGAAGGAAAA\n+>2000003\n+TTGTTAGGAGAAATAATTAAAACTCCAACACATGTTCCATCCCTATGACTAGAACCATCA\n+AAATATAACTTCCATGGCTCGAGCTCTAGATAGTTTTGGG\n+>2000004\n+CACAAACCAAATCTTTAAAATTCCCTAAAAATAAACGTCATATGCTAAGAACATTTTTGA\n+TGCAAATGTATGCAATGTCATGTAATGCTTTAATCTCAGG\n+>2000005\n+ATCTTTAATATTTAAAAATTAATTACAAGTGAATTCTACAAATTAATGTTTCAAATTGAT\n+GTTACAAATTAAAATACAAATTAAATAATTACAAATAATA\n+>2000006\n+AGAGAAAAATCATATGTGTTTGAAGTCTTTAAAGATCTATGTCAAATATTACAAAGAGAG\n+AAAGTGGAATAATCAGAATCAGAAGTGACCATGGTAAGGA\n+>2000007\n+GACGGAAGTATGAGCGGCAACCATCGATGAAGATAGCTCCTCCGACGTTACGATCCTTCC\n+AGGTATCTCATTTTGCTCTTTTTTCCTTTCTCCAATAAAA\n+>2000008\n+CTCTTTTTTTTTATAATTGTCTAAAATTTAACAAATTTATAAATAGATAAAAGTTTGGTA\n+CAATTAAGGACAAAAATTGTACTCTCTAAAATTTATAAAA\n+>2000009\n+GAGATTTGCCGGTGATGATGCACCGTTACCGCCTGTGTACGGTTGTGATTATAGCGAGGG\n+TGATCGGACGGACGAGATGGTTGGACTTGTTAGTCTTTTG\n+>2000010\n+GGCAAAGGCCAAGGGTAAACAGGTCATGATTGACGATCAAGATTCTGCACCAGTAAATAT\n+CCCCAAGCAAAGTGCGATGCCCGAAGCTTCTTCGTCTCAA\n+>2000011\n+GGACGAGTATTGAAAAAAGAATATTAACATGGACGAGTGTTGGAAATCGATACGAACGAG\n+TATTGGAAAAAGATAAGAATCAACACGGACGAGTGTCGGA\n+>2000012\n+TCCTGTTATGCGGCATGGGAAAATTTTATATAGGTCTCTCATGTCCCTCAACGTGATTTG\n+TGGAGTATCCATCAACCATATTTATAGTCATCTTGCTACA\n+>2000013\n+TGTTCAATGTCATATTTATCACAATATTCCTCAAAGAGGTGGTTTTCAAATTCACCTCCG\n+TGATCGGTTCAAATAGCAACTATTTTTAAACTAAACTTGT\n+>2000014\n+GTACAACCTCTAGGTATAAGTTCGATATATCCTCCGCATCCTCTGCACTCAGACCTTGAG\n+ATCTATCAGACTTTGGAGATAATGCAGCTGCAATAATTGA\n+>2000015\n+TGTTCTTCATTAATCTATGATCATGTTGCTGCAAATTACATGGATACGTATAACAACATC\n+CTTAATCTGTCAATATATGGCCCAATGCTGCAAACCAGAC\n+>2000016\n+AGGACCCTCGATGTCCCTCGGATAGCCTTCTCTTGGGCTCAAAATACAAGGACCCTCGAT\n+GTCCCTCGGATAGCCTCCTCTTGGGCTTCATACAAGGACC\n+>2000017\n+TCTCCCGTGGCGAGGATCGGGGACGGGGACGGGGAATAATTTGGGGGACGGGGCGGAGAA\n+CGCGGAAGCATCCTCCGCAGATTCCCCGCC
CCGTTGACAT\n+>2000018\n+AACATGTCGAGATATTTCTAGAAAAAATCTGGTTGCATATGATTTTACAAATTGTTTCAA\n+TTACATTTACTATAAATATATTTCCTAAAATAAAAAAAAA\n+>2000019\n+GAATCGATGCCCATTTGAAGCAGTGGGCTCGATGCGTCTGATCATGTCAATGCCCCACAT\n+TGCGAAGGGCCAAGGAGAAGTCAGAACATTCAAAGGTACA\n+>2000020\n+AAGTCCACAGTCCAAACTCCAGATGCTTAGGATAATCAAAACAACTCCAAAAGAGATTAT\n+CAATTTTTTTAGAGTTTTTGATATTATTTATTGTTTTTAG\n+>2000021\n+CACGATCTCAAATATGTAAACAATTTTTATTATGACCAAACATATAACAATAAATTTGAG\n+CATGAAGGTTGAGATCTACAAAATTCAGATCATCATGAAA\n+>2000022\n+GACATGTAAGTTGTATCTCTCAGAAAGAAAACATATTTAGAGAGTCTGTGTACCTCACGA\n+GTTTGTAGTGAAGTTGTGTGTGTTCTCATCCATGAGCTTT\n+>2000023\n+AGAAACAAAATATAAATAAAAGAGAGTACATTACATACTTTGAGTAGTTGAAAGATCCAT\n+ATGTTGATGAACAAAGACCTAATAATTATTAGAAAGAAAA\n+>2000024\n+GATTTTAGGCCTTAGGCCCATGTTTCCATTTACTCCTTGGAACCCCCATTTACTCTTTGC\n+ACTCCCCTTGAGTTTTTATTTATTTTATGCTTTCAAATAT\n+>2000025\n+GCTCATGGAAAACATCATTGTGCGCAATTTCAACATGTCGGTGAAGGTTGAGATGTCTCT\n+TAATCAGCAAGAAGAAAACAAATCCATAATATTTCAAATG\n+>2000026\n+GAAAAATAACTGGCGAAAACAACAATTGTAAGCACTCTGAGACGACTCTCAAACCGGTCA\n+ATTAAAGTTTTCAAAGTCTAAAGAGTAAAAGTTGATTCAA\n+>2000027\n+TGCTTTTTAGGGGGAGCGTGGTTCTATTTTGATTGGTTGTTGTGTTGTTGTATGGTTGCA\n+TGGCATACCTGATGTCCTGACATCCTGACTATGATAATTT\n+>2000028\n+AAGTTCATCACCATCACCATCTAGAATTTCACATTGAGATTCTAGATTGGTGGAACTAGA\n+CCTTCATCAATTCAATTCACATTCATCATCAAAGTATCAT\n+>2000029\n+GGACTGTTGCAAATTTTCATATCTCAAGCCAATTAAATGTTGCAGTACTATACCGTGTAT\n+ATAACTACATCATTTATTCATTTTCCGATTAATATTTAAT\n+>2000030\n+GGAGCTTGCTTCAACCCATATAAGCTCTTTCTCAATCTACACACAAGATCTTCTTTTCCT\n+TTAACTTGAAAACCATCAGGTTGTTTCATGTAGATATCTT\n+>2000031\n+GTTTCTCCTCTTGAGATATCAGATTATTATTAGGACTATCGTTTCTTCTTCACCTTAAAT\n+CTACCAAAAGGACGTGAACATTGTCTTCCTCAACGGATAC\n+>2000032\n+CAGGCCCTAGCGATTGTGTGTGTTTGTGTGTGTTTGAAAGAAATAATAGACAAATATACT\n+TGTTTAGTGAGCACAACCATCTAAACTACAATCGCTACCT\n+>2000033\n+GAACACAAGATTATGGCATGATGATGATGATGATGTGATAAAGAAAAGATGAGAGTGAAG\n+TTAGAATATTTATACCAACTAGTGCCACTTGGTTTGGTGA\n+>2000034\n+TCACCCCTAAGATCCCACATATATCTCAATACATAGCATGCATCTCATATGTTTGATTGA\n+TGTCTTTGCTTATTAATCTTCCCCAAGGTTTCCTCACTTT\
n+>2000035\n+CCTTTGTAACACATTCACATCATTCCAATAAACGTTATTTGTACTGTAATTTAAACATTC\n+TAAGCCACTATAAATTCCATGGAGTGTGTC'..b'TGAGGGCT\n+>2009966\n+CAACAGATTCCTGGGCAGATGTGAAGAAGAACTCCCTGAATTGGAGGTCTCAGATAATGT\n+TTTGTGTAAGGAGATAACTTCAAGGCAAGTGAAGATCGGA\n+>2009967\n+GATTTGAGTAGTTTTTAGTTTAATGGTGGTTGTTTATAAAGAACCAAAACTTGACCCCAC\n+ATTTTATATTTTTTATTTGTAGTTTAATATTATGTTGCAG\n+>2009968\n+CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT\n+AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC\n+>2009969\n+GAAACTAATCAATCCTTTTGAAAATGGATAGAATCATTTATCCTTAAATCATGCGCCACA\n+TGGAATAACAGAATATTCTCAAATATTCTGCAATAGAATT\n+>2009970\n+TCCTTGGTTGATAAGGGAAACTAGTATAATTCCCTTGTGTTCTTTACTTTTATGCATTTT\n+ATTTTCTTAATTTCATTATTGTAAGAACATAATATTCAAT\n+>2009971\n+GGGAGCTCAACCTTTGAAAAGTTAAACAAAAAAGGGAATGCAACTAAATCTAGATATCCA\n+TATTTTTCAGTAATAAGAGCTAATATTTTCTGTAAACAAT\n+>2009972\n+GCTTGAGATCAGATTTCACTAACCGGTTACCATACCATGAGCATGTAAAGATAACGGGCT\n+TAATGTGATTGCTTTAGAATGAAAAAGGATTTGAAAAGAG\n+>2009973\n+TTCTCAGCATAAAAGAAATATTTTGCAATTCTCCTTTTTCTCTATATACCTTCTTGAGGA\n+GGCAACTCAGGCCTTATAATTCTGAGATATAATATCTTAA\n+>2009974\n+CAAGTATTGGTCTCGATTTCACTTCCGTCGACCCCTTTCTTCTTACCACACTTCTGTCCT\n+CCGAGGAGGTAAAAATGAAGAAGGATTTGTGTGCTTGGTA\n+>2009975\n+GTGAATCCTGGGAACACTATGAAGATCAATGTTGACAGACCAAATCCCTCCATTCAACCA\n+AGGTTTGGGTCTTTTTATTTTTGTTTTGATGGTTGTAAAA\n+>2009976\n+GCTTCCACCTAAACTGACTAATCCTGGTAGATTCACCATCCCTTGTTCTATTGGGCCTGT\n+AAAATTTGGCCAAGCTCTTTGTAATTTGGGGGCAAGCATT\n+>2009977\n+AATTATAACTATTCCACCTTTTTGTTGCACATAACGACATATTAATCATCTACTCGAATG\n+AGTATCTACTAATTAGATTTTAGGAGATTTAAGAATAGTA\n+>2009978\n+GGCTCCAGATGAAAAGGGAAAGAAGAATTTTCAACAAAAAATGAAGGTAGGAAAGGGATA\n+AGTGGCGGACGTGTTCCTGCTGCAACACCCTACTACCCCA\n+>2009979\n+CTACGTGCACCTATTGGGTCAAAATTGTTTTTATCACCTTCATCTCTTTCTTTTTATTGA\n+ACAAGTGACAAGATCATAGTCTCTACTATGCATTTAACAC\n+>2009980\n+CCGAACATTCCGAGAGGAAAAAACTATATGGCGAGGACTGGGATCCAGGTGCTACTTTTT\n+TATTTAAAAAATTAAACTCGTGTTGTGCAGATAACATGAA\n+>2009981\n+GAGGGGAAACTAAATATTATTTGGAATCTCTTAGATAACAAAAAAACTAAGACTAAAAAG\n+GGGGTGAGATAAGGAATGGGTGTATGGGCCTAAAATTAAA\n+>20099
82\n+GTCCTTTTGAAAAGAAAAAAAGAGAAGAATAAAAGGAAAAAGAGAAGAGAAATAAGTTGT\n+GAGGGTGTTGTTTGAAAGAAATTGAGGTATCTTATAGTGC\n+>2009983\n+CCCTCTGCACCCATGCAATACACTCTACTTTTGGAGCTTTACAGCTAACTCTGCTTTTGG\n+AATATGAAAATCATTAACAGCTAACTCTGCTTTTAGAGCT\n+>2009984\n+GTTCTAATAACCTCCTTTTCCCCACGATAAGCGCATTTGGCAGTTCCCTGGGTGCATTCC\n+TAACAGAGCTCCTTTGTGATGTTACCAGCAATGTTGCATG\n+>2009985\n+GGGGAATCACCGTCAAAGAAATCCCTATAGTCTTCATCCTCATCTTCTAGCTTATCAAAA\n+GCTGTTTGAAACTTCTCAGCCGCATCTAACATCAAATATA\n+>2009986\n+GAAATATAAGGTCATGCCACTCAAAAGTACTTCTTACACCTCACTATTATCTGTAAGCTA\n+TTCAACCTTTCTCATATTTTTTTTCTTCTTTTTACTTTCA\n+>2009987\n+TGAATAATTGCATACTTAGTTGAAAAAAAAGACTTAGTAAATTTTTACTAAGTGTTGAAT\n+TTCAATAAGTGAGAAAAATCCATTTTTAAAACAACATAAT\n+>2009988\n+GCGTCAACATGTTTTGTCCAGATTCAAATGACGTTACATATCATTTCTTCAGATTTAGAA\n+CTTGTAAGTTCAACCGCACCCTAGATAAAACAATTAGGGT\n+>2009989\n+ACACCACAATCATCACGATTTCATCAAAACACACAATAATCCTCACAACACACTAATTAT\n+CACAACATCATAATTTTCCTCCAAGTACGCCATAAACGTC\n+>2009990\n+CTCTTCAACATCCTTTGAAGGTTGCAAATCGAGTCTAACAGTTGCCAGATCTACCACCAT\n+ATTCATTTCATTCATGTTACTTCCAATTTAAGCAATATTT\n+>2009991\n+GGTACATGTTGTTGCAAGTTGTCTAATTTCGACCTACCATGGCCAACCTGAACTTGCAGA\n+ATTTGCATTTTCACCCTTAGGCCCATATCCCTTTTTTTTT\n+>2009992\n+GTTAGTTTTCATTCTTTTGGGAGATCTGCGTTTTCCTGTGGAGTGTATGGGATTTGATCC\n+TGAACTGATTCTTCTACTATGACTTGATTTCCCCTTAACC\n+>2009993\n+CTCTTGTTGCAGGACTGAAGGTCCTGGACAAAGGATATTCTATTGCAGATCATGTAAAGA\n+AGATTATTAGAAGTCTTCATAAGAAATGGAGACCTATGGT\n+>2009994\n+GTTAGTTGAGTTGTAATTCAACATCTTTAATATGTTGAAAGCATATATCACTAGGGTAGT\n+GATTGAGAGAATGTGAGAAGGGTTCACATATTTATGGGGA\n+>2009995\n+AGGGTTTCGATTTTGGCATTTCGGAACAACAACCAAAAAAATGGGAGGTGGTGCAGCAGA\n+TCACGGGAATGGCGGCAATGGAGATTTCAGATACAAGGTT\n+>2009996\n+TGGCATGTCTTGTCTCCTTTAGTTTAAACTCTATCAAATGGACGTGAAAATTGTCTTTAT\n+CAATGAGTACTTGAATAAGGAAGTCTATGTTGAGAAAACA\n+>2009997\n+TTCCTTCTCCTTCAATTGCCTTTGTAGTTCTAACTTCTTCGTATTCAAGGCATGAAACTT\n+GTTTTCCCAAGCATCTCTATCTTTCTTTATCCGAGCTAGA\n+>2009998\n+GAAGAATCGGATTTTTATCGTTTTAGTTTCGGTTTCGATTTCATTCGGTTTTGTAAATAA\n+TTTTAGCTTAGGGTTTTGTTTGCTTGTTTGCTTGGTTTGT\n+>2009999\n+GGTTAT
GGCACAAAAGAATTGCACATATTCACATGGAGCACTTAAACAAGTTAGTGAAGC\n+ATGACCTTGTTATCGGCCTACCAAAGATGAAGTTCCTCAA\n+>2010000\n+AATTAAAGTCATATTTTGATATCTCCTTGGACATCGGTTATTTTAAAGTTTGTGTTATTA\n+AATATTTTTCAAAACTCACTACACACATGACTTCAATGGT\n'