Repository 'dedup_hash'
hg clone https://toolshed.g2.bx.psu.edu/repos/mvdbeek/dedup_hash

Changeset 0:f33e9e6a6c88 (2016-11-23)
Commit message:
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
added:
HISTORY.rst
LICENSE.txt
Makefile
README.rst
dedup_hash.xml
dedup_hash/__init__.py
dedup_hash/__init__.pyc
dedup_hash/dedup_hash.py
requirements.txt
setup.py
test-data/r1.fastq
test-data/r1.fastq.gz
test-data/r1_dedup.fastq
test-data/r1_dedup.fastq.gz
test-data/r2.fastq
test-data/r2.fastq.gz
test-data/r2_dedup.fastq
test-data/r2_dedup.fastq.gz
test/test_dedup_hash.py
tox.ini
b
diff -r 000000000000 -r f33e9e6a6c88 HISTORY.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/HISTORY.rst Wed Nov 23 07:49:05 2016 -0500
b
@@ -0,0 +1,20 @@
+.. :changelog:
+
+History
+-------
+
+.. to_doc
+
+---------------------
+0.1.1 (2016-11-23)
+---------------------
+* Make python2/3 compatible
+* Use smhasher as default hasher and add options for cityhash and hashxx
+* Testing enhancements
+
+
+---------------------
+0.1.0 (2016-11-16)
+---------------------
+
+* Initial version
b
diff -r 000000000000 -r f33e9e6a6c88 LICENSE.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE.txt Wed Nov 23 07:49:05 2016 -0500
b
@@ -0,0 +1,8 @@
+The MIT License (MIT)
+Copyright (c) 2016 Marius van den Beek
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
b
diff -r 000000000000 -r f33e9e6a6c88 Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Makefile Wed Nov 23 07:49:05 2016 -0500
[
@@ -0,0 +1,53 @@
+# Location of virtualenv used for development.
+VENV?=.venv
+# Galaxy fork and branch that planemo is pointed at.
+BRANCH=curie2
+GALAXY_REPO=https://github.com/mvdbeek/galaxy.git
+# Directory holding the conda installation used for dependency resolution.
+CONDA_PREFIX?=.conda
+# Absolute path of the conda prefix. The backticks are handed to the recipe's
+# shell unchanged, so readlink runs when the command using this value runs.
+# Derived from $(CONDA_PREFIX) so that overriding the prefix is honoured
+# instead of always resolving the literal .conda directory.
+CONDA_PREFIX_PATH?=`readlink -e $(CONDA_PREFIX)`
+# Source virtualenv to execute command (flake8, sphinx, twine, etc...)
+IN_VENV=if [ -f $(VENV)/bin/activate ]; then . $(VENV)/bin/activate; fi;
+PLANEMO=$(IN_VENV) planemo
+
+# These targets are commands, not files; declare them so a file or directory
+# with the same name can never make them look up to date.
+.PHONY: setup-venv lint-35 lint-27 db setup_galaxy_clone planemo-test planemo-serve py-test clean
+
+# Create the virtualenv in $(VENV) if that directory does not exist yet, then
+# install requirements.txt and planemo with the virtualenv activated.
+setup-venv: ## setup a development virtualenv in current directory
+ if [ ! -d $(VENV) ]; then virtualenv $(VENV); exit; fi;
+ $(IN_VENV) pip install -r requirements.txt;
+ $(IN_VENV) pip install planemo
+
+# Install tox into the virtualenv and run the py35-lint tox environment.
+lint-35: setup-venv
+ $(IN_VENV) pip install tox && $(IN_VENV) tox -e py35-lint
+
+# Install tox into the virtualenv and run the py27-lint tox environment.
+lint-27: setup-venv
+ $(IN_VENV) pip install tox && $(IN_VENV) tox -e py27-lint
+
+# Download the seed database that the planemo targets pass via
+# --galaxy_database_seed. The seed is a regular file, so its presence is
+# checked with -f; a -d (directory) test is false for a regular file, which
+# made the download run again on every invocation.
+db:
+ if [ ! -f db_gx_rev_0127.sqlite ]; then wget https://github.com/jmchilton/galaxy-downloads/raw/master/db_gx_rev_0127.sqlite ; exit; fi;
+
+# Shallow-clone (depth 50) branch $(BRANCH) of $(GALAXY_REPO) into .galaxy,
+# unless the .galaxy directory already exists.
+setup_galaxy_clone:
+ if [ ! -d .galaxy ]; then git clone --depth=50 --branch $(BRANCH) $(GALAXY_REPO) .galaxy; exit; fi;
+
+# Run the tool tests through planemo. The recipe is a single shell command:
+# initialise conda in $(CONDA_PREFIX) if that directory is missing, run
+# planemo conda_install on the current directory, then run planemo test
+# against the .galaxy checkout using the downloaded seed database.
+planemo-test: db setup-venv setup_galaxy_clone
+ if [ ! -d $(CONDA_PREFIX) ]; then $(PLANEMO) conda_init --conda_prefix $(CONDA_PREFIX);fi && \
+ $(PLANEMO) conda_install --conda_prefix $(CONDA_PREFIX_PATH) . && \
+ $(PLANEMO) test \
+ --galaxy_database_seed db_gx_rev_0127.sqlite \
+        --galaxy_root .galaxy \
+ --galaxy_source $(GALAXY_REPO) \
+ --galaxy_branch $(BRANCH) \
+ --conda_dependency_resolution \
+ --conda_prefix $(CONDA_PREFIX_PATH)
+
+# Start planemo serve against the .galaxy checkout with the seed database.
+# Unlike planemo-test there is no conda_init/conda_install step here; the
+# command passes --conda_auto_install instead.
+planemo-serve: db setup-venv setup_galaxy_clone
+ $(PLANEMO) serve \
+        --galaxy_database_seed db_gx_rev_0127.sqlite \
+        --galaxy_root .galaxy \
+ --galaxy_source $(GALAXY_REPO) \
+ --galaxy_branch $(BRANCH) \
+ --conda_auto_install \
+ --conda_dependency_resolution \
+ --conda_prefix $(CONDA_PREFIX_PATH)
+
+# Run the python test script, inside the virtualenv when one exists.
+# NOTE(review): this target has no setup-venv prerequisite, so the
+# virtualenv is not created or populated by it - confirm that is intended.
+py-test:
+ $(IN_VENV) python test/test_dedup_hash.py
+
+# Remove downloaded databases, the virtualenv, conda prefix, Galaxy clone and
+# build/test artefacts. The paths are literal (.venv, .conda), so locations
+# overridden through VENV or CONDA_PREFIX are not touched.
+clean:
+ rm -Rf *.sqlite* .venv .conda .galaxy dist/ *.egg-info tool_test_output.* .tox  || true
b
diff -r 000000000000 -r f33e9e6a6c88 README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Wed Nov 23 07:49:05 2016 -0500
b
@@ -0,0 +1,26 @@
+.. image:: https://travis-ci.org/mvdbeek/dedup_hash.svg?branch=master
+    :target: https://travis-ci.org/mvdbeek/dedup_hash
+
+dedup_hash
+----------------------------
+
+
+This is a commandline utility to remove exact duplicate reads
+from paired-end fastq files. Reads are assumed to be in 2 separate
+files. Read sequences are then concatenated and a short hash is calculated on
+the concatenated sequence. If the hash has been previously seen the read will
+be dropped from the output file.  This means that reads that have the same
+start and end coordinate, but differ in lengths will not be removed (but those
+will be "flattened" to at most 1 occurrence).
+
+This algorithm is very simple and fast, and saves memory as compared to
+reading the whole fastq file into memory, such as fastuniq does.
+
+Installation
+------------
+
+dedup_hash uses the smhasher python package by default. The optional
+CityHash64 hasher relies on the cityhash python package,
+which supports python-2.7 exclusively.
+
+``pip install dedup_hash``
+
b
diff -r 000000000000 -r f33e9e6a6c88 dedup_hash.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dedup_hash.xml Wed Nov 23 07:49:05 2016 -0500
[
@@ -0,0 +1,114 @@
+<tool id="dedup_hash" name="Deduplicate FASTQ files" version="0.1.1">
+    <description>with fast and memory-efficient sequence hashes</description>
+    <requirements>
+        <requirement type="package" version="0.150.1">smhasher</requirement>
+    </requirements>
+    <command><![CDATA[
+    python '$__tool_directory__/dedup_hash/dedup_hash.py'
+    #if str($readtype.single_or_paired) == "se":
+        --r1_in '${readtype.input_single}'
+        --r1_out '$output_single'
+    #elif str($readtype.single_or_paired) == "pe_sep":
+        --r1_in '${readtype.input_paired1}'
+        --r2_in '${readtype.input_paired2}'
+        --r1_out '$output_paired1'
+        --r2_out '$output_paired2'
+    #else
+        --r1_in '${readtype.input_paired.forward}'
+        --r2_in '${readtype.input_paired.reverse}'
+        --r1_out '${output_paired_coll.forward}'
+        --r2_out '${output_paired_coll.reverse}'
+    #end if
+    $compress_fastq
+    ]]></command>
+    <!-- Tool inputs. The text in each help attribute names the dedup_hash.py
+         command line option that the dataset is passed to in the command
+         section; the earlier values (-f / -r) are not options of that script. -->
+    <inputs>
+        <conditional name="readtype">
+            <param name="single_or_paired" type="select" label="Single-end or paired-end reads?">
+                <option value="se" selected="true">Single-end</option>
+                <option value="pe_sep">Paired-end (two separate input files)</option>
+                <option value="pe_collection">Paired-end (as collection)</option>
+            </param>
+            <when value="se">
+                <param format="fastq,fastq.gz" name="input_single" type="data" label="Single-end FASTQ reads" help="(--r1_in)" />
+            </when>
+            <when value="pe_sep">
+                <param format="fastq,fastq.gz" name="input_paired1" type="data" label="Paired-end forward strand FASTQ reads" help="(--r1_in)" />
+                <param format="fastq,fastq.gz" name="input_paired2" type="data" label="Paired-end reverse strand FASTQ reads" help="(--r2_in)" />
+            </when>
+            <when value="pe_collection">
+                <param name="input_paired" format="fastq,fastq.gz" type="data_collection" collection_type="paired" label="Paired-end FASTQ reads as paired collection" />
+            </when>
+        </conditional>
+        <!-- The truevalue is inserted verbatim into the command line. -->
+        <param name="compress_fastq" type="boolean" checked="true" truevalue="--write_gzip" falsevalue="" label="Produce compressed fastq?"/>
+    </inputs>
+    <outputs>
+        <data name="output_single" format="fastq" label="Single-end output of ${tool.name} on ${on_string}">
+            <filter>readtype['single_or_paired'] == 'se'</filter>
+            <change_format>
+                <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
+            </change_format>
+        </data>
+        <data name="output_paired1" format="fastq" label="Paired-end forward strand output of ${tool.name} on ${on_string}">
+            <filter>readtype['single_or_paired'] == 'pe_sep'</filter>
+            <change_format>
+                <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
+            </change_format>
+        </data>
+        <data name="output_paired2" format="fastq" label="Paired-end reverse strand output of ${tool.name} on ${on_string}">
+            <filter>readtype['single_or_paired'] == 'pe_sep'</filter>
+            <change_format>
+                <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
+            </change_format>
+        </data>
+        <collection name="output_paired_coll" type="paired" structured_like="readtype.pe_collection" label="Paired-end output of ${tool.name} on ${on_string}">
+            <filter>readtype['single_or_paired'] == 'pe_collection'</filter>
+            <data name="forward" format="fastq">
+                <change_format>
+                    <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
+                </change_format>
+            </data>
+            <data name="reverse" format="fastq">
+                <change_format>
+                    <when input="compress_fastq" value="--write_gzip" format="fastq.gz" />
+                </change_format>
+            </data>
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="single_or_paired" value="pe_sep"/>
+            <param name="input_paired1" value="r1.fastq.gz" ftype="fastq.gz"/>
+            <param name="input_paired2" value="r2.fastq.gz" ftype="fastq.gz"/>
+            <param name="compress_fastq" value="--write_gzip"/>
+            <output name="output_paired1" file="r1_dedup.fastq.gz" ftype="fastq.gz" compare="sim_size"/>
+            <output name="output_paired2" file="r2_dedup.fastq.gz" ftype="fastq.gz" compare="sim_size"/>
+        </test>
+        <test>
+            <param name="single_or_paired" value="pe_sep"/>
+            <param name="input_paired1" value="r1.fastq" ftype="fastq"/>
+            <param name="input_paired2" value="r2.fastq" ftype="fastq"/>
+            <param name="compress_fastq" value=""/>
+            <output name="output_paired1" file="r1_dedup.fastq" ftype="fastq"/>
+            <output name="output_paired2" file="r2_dedup.fastq" ftype="fastq"/>
+        </test>
+    </tests>
+    <help> <![CDATA[
+**Deduplicate paired fastq** is a fast and memory-efficient tool for removal of duplicates in paired short DNA sequence reads in fastq format.
+It identifies duplicates by concatenating the sequence of a readpair and calculating a short hash that uniquely identifies the concatenated sequence.
+Sequences that are not unique (i.e. a hash of the concatenated sequence has been seen previously) are discarded.
+
+Compared to fastuniq this tool requires only a fraction of the memory, but does not identify pairs that are identical,
+except for a switch of R1 and R2. Such reads may nevertheless align to different places based on the seed-searching of the aligner,
+so this may or may not be a problem for your application.
+
+Fastuniq consumed 76 GB of memory and took 4:01.52 on a typical dataset of 100nt 25 x 10^6 paired end reads,
+while this tool took 4.7GB of memory and 3:23.27 for the same dataset.
+
+Both tools produced the exact same result, arguing that, at least before quality and/or adapter trimming,
+the previously mentioned limitations are of theoretical concern.
+
+     ]]> </help>
+    <citations>
+        <citation type="doi">doi:10.1371/journal.pone.0052249</citation>
+    </citations>
+</tool>
b
diff -r 000000000000 -r f33e9e6a6c88 dedup_hash/__init__.pyc
b
Binary file dedup_hash/__init__.pyc has changed
b
diff -r 000000000000 -r f33e9e6a6c88 dedup_hash/dedup_hash.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dedup_hash/dedup_hash.py Wed Nov 23 07:49:05 2016 -0500
[
@@ -0,0 +1,161 @@
+import argparse
+import gzip
+import io
+import sys
+from itertools import cycle
+try:
+    from itertools import izip
+except ImportError:
+    pass  # we can simply use zip in python3
+
+
+class UniqueFastqBase(object):
+    def __init__(self,
+                 infiles,
+                 outfiles,
+                 write_gzip,
+                 buffer_size=32768,
+                 compresslevel=2,
+                 hash_module="smhasher"):
+        self.seen_hashes = set()
+        self.infiles = infiles
+        self.outfiles = outfiles
+        self.write_gzip = write_gzip
+        self.buffer_size = buffer_size
+        self.compresslevel = compresslevel
+        self.hash_module = self.import_hash_module(hash_module)
+        self.cur_fastq_str_r1 = ""
+        self.cur_fastq_str_r2 = ""
+        self.cur_uniq = False
+        self.fastq_cycle = cycle([self.header_one_action, self.seq_action, self.header_two_action, self.qual_action])
+        self.infiles = self.get_inputs()
+        self.outfiles = self.get_outputs()
+        self.process_files()
+        self.close_io()
+
+    def import_hash_module(self, hash_module):
+        if hash_module == "smhasher":
+            from smhasher import murmur3_x64_64
+            return murmur3_x64_64
+        if hash_module == "CityHash64":
+            from cityhash import CityHash64
+            return CityHash64
+        if hash_module == "hashxx":
+            from pyhashxx import hashxx
+            return hashxx
+
+    def get_input(self, infile):
+        if self._is_gzip(infile):
+            return io.BufferedReader(gzip.GzipFile(infile, 'rb'), buffer_size=self.buffer_size)
+        else:
+            return open(infile)
+
+    def get_inputs(self):
+        return [self.get_input(infile) for infile in self.infiles]
+
+    def get_outputs(self):
+        if self.write_gzip:
+            return [io.BufferedWriter(gzip.GzipFile(outfile, 'wb', compresslevel=self.compresslevel), buffer_size=self.buffer_size) for outfile in self.outfiles]
+        return [open(outfile, 'w') for outfile in self.outfiles]
+
+    def close_io(self):
+        [infile.close() for infile in self.infiles]
+        [outfile.close() for outfile in self.outfiles]
+
+    def _is_gzip(self, infile):
+        gzip_magic_byte = b"\x1f\x8b\x08"
+        with open(infile, 'rb') as input:
+            return gzip_magic_byte == input.read(len(gzip_magic_byte))
+
+    def process_files(self):
+        raise Exception('Not implemented')
+
+    def seq_action(self, lines):
+        cur_hash = self.hash_module("".join(lines))
+        if cur_hash in self.seen_hashes:
+            self.cur_uniq = False
+        else:
+            self.seen_hashes.add(cur_hash)
+            self.cur_uniq = True
+            self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]
+
+    def header_one_action(self, lines):
+        self.cur_uniq = False
+        self.cur_fastq_strs = lines
+
+    def header_two_action(self, lines):
+        self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]
+
+    def qual_action(self, lines):
+        if self.cur_uniq:
+            self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]
+            [outfile.write(string) for string, outfile in zip(self.cur_fastq_strs, self.outfiles)]
+
+
+class UniqueFastqPairsPy2(UniqueFastqBase):
+
+    def process_files(self):
+        for items in izip(self.fastq_cycle, *self.infiles):
+            fastq_item = items[0]
+            lines = items[1:]
+            fastq_item(lines)
+
+
+class UniqueFastqPairsPy3(UniqueFastqBase):
+
+    def process_files(self):
+        for items in zip(self.fastq_cycle, *self.infiles):
+            fastq_item = items[0]
+            lines = items[1:]
+            # The following might be slow, rework this to something smarter
+            # it it slows down too much.
+            fastq_item([l if isinstance(l, str) else l.decode() for l in lines])
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='Get unique reads from fastq files')
+    parser.add_argument('--r1_in', required=True, help='Read1 input fastq file')
+    parser.add_argument('--r2_in', required=False, default=None, help='Read2 input fastq file')
+    parser.add_argument('--r1_out', required=True, help='Read1 output fastq file')
+    parser.add_argument('--r2_out', required=False, help='Read2 output fastq file')
+    parser.add_argument('--write_gzip', action='store_true', help="Compress output in gzip format?")
+    parser.add_argument('--buffer_size', default=32768, type=int, help="Set buffer size for reading gzip files")
+    parser.add_argument('--compresslevel', default=2, type=int, choices=list(range(1, 10)), help="Set compression level (1: fastest, 9: highest compression)")
+    parser.add_argument('--algo', default='smhasher', choices=['CityHash64', 'hashxx', 'smhasher'], help='Select hash algorithm')
+    return parser.parse_args()
+
+
+def get_infiles(args):
+    if args.r2_in:
+        return [args.r1_in, args.r2_in]
+    else:
+        return [args.r1_in]
+
+
+def get_outfiles(args):
+    if args.r2_out:
+        return [args.r1_out, args.r2_out]
+    else:
+        return [args.r1_out]
+
+
+def get_unique_fastq_instance():
+    if sys.version_info.major == 2:
+        return UniqueFastqPairsPy2
+    elif sys.version_info.major == 3:
+        return UniqueFastqPairsPy3
+
+
+def main():
+    args = get_args()
+    UniqueFastqPairs = get_unique_fastq_instance()
+    UniqueFastqPairs(infiles=get_infiles(args),
+                     outfiles=get_outfiles(args),
+                     write_gzip=args.write_gzip,
+                     buffer_size=args.buffer_size,
+                     compresslevel=args.compresslevel,
+                     hash_module=args.algo)
+
+
+# Run the command line interface only when executed as a script, not on import.
+if __name__ == '__main__':
+    main()
b
diff -r 000000000000 -r f33e9e6a6c88 requirements.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/requirements.txt Wed Nov 23 07:49:05 2016 -0500
b
@@ -0,0 +1,1 @@
+smhasher
b
diff -r 000000000000 -r f33e9e6a6c88 setup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/setup.py Wed Nov 23 07:49:05 2016 -0500
[
@@ -0,0 +1,39 @@
+try:
+    from setuptools import setup
+except ImportError:
+    from distutils.core import setup
+
+requirements = ['smhasher']
+
+ENTRY_POINTS = '''
+        [console_scripts]
+        dedup_hash=dedup_hash.dedup_hash:main
+'''
+
+readme = open('README.rst').read()
+history = open('HISTORY.rst').read().replace('.. :changelog:', '')
+
+setup(
+    name='dedup_hash',
+    version='0.1.1',
+    packages=['dedup_hash'],
+    install_requires=requirements,
+    long_description=readme + '\n\n' + history,
+    entry_points=ENTRY_POINTS,
+    keywords='Bioinformatics',
+    classifiers=[
+        'Development Status :: 5 - Production/Stable',
+        'Intended Audience :: Developers',
+        'Environment :: Console',
+        'Operating System :: POSIX',
+        'Topic :: Scientific/Engineering :: Bio-Informatics',
+        'Natural Language :: English',
+        "Programming Language :: Python :: 2",
+        'Programming Language :: Python :: 2.7',
+    ],
+    url='https://github.com/mvdbeek/dedup_hash',
+    license='MIT',
+    author='Marius van den Beek',
+    author_email='m.vandenbeek@gmail.com',
+    description='Finds and discards exact duplicate reads in fastq files.'
+)
b
diff -r 000000000000 -r f33e9e6a6c88 test-data/r1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/r1.fastq Wed Nov 23 07:49:05 2016 -0500
b
@@ -0,0 +1,200 @@
+@2
+GGTAATGAAATAAAATGTACAT
++2
+5555;<;:;A@=>4-/=<<AAA
+@3
+TTATGCATAATTAAAATAATTATTCTCTG
++3
+:::8<=<<=EEEEE5869/2?@@;EEE?E
+@4
+ATCCCAGCCCACTCAGCCTGTTAAGTTCGCT
++4
+5555978;:E>E;E?9@@?AA:AA;=8=>?@
+@5
+ATGACCAGAAGATCAAAACAGAACTCCTT
++5
+7777===<<<<<>?EEEEEA9@:@@AAA?
+@6
+TGTATTGAATGGAAGTTTTGCTTGCAACAATCTTCCAACTATTCATTGTA
++6
+6856::7::>>4=4:74::8:=9,=9<<<;;@@76:;:;48>><:::<07
+@8
+GGGTAACTCTTATTAATTTTTGTATTTGAAAAAGAAAAAACAATCTG
++8
+15*/898991<<=:=>E?EEEEEE@>6><<1<):2222244444@@@
+@10
+ATATTGTGGAAAAGGCAATCCAT
++10
+8897967669=8<9A?>A?@AA?
+@13
+TTATGCAAACGATAGTTAACGAGAAAGAGATTA
++13
+777/:;;;7EEEEBAAA9A@AA77?<;<9@?<@
+@14
+TGAAAAGACCGAGTTAGGAAGTT
++14
+323376933A????@@9<@@A@?
+@15
+GTCAAAGTTAAACCCCATGAATAAAAGCATGCAGTCAACTAAAAGTTAAC
++15
+323275686=,<<:>>>>=::<924,,<:???>?7;;2;<;4=<94<:78
+@16
+GTTTCAGCGAGAAAAATTACCCAGAGGACTTCGAGTCATAATGACGTTTG
++16
+7776<;;;3@?@<;,,,<<@@A???@<24.5852::=+97660777;;7A
+@19
+ACTTGCATCGCCACCAAATTC
++19
+111189888?>???6<=?>??
+@25
+TTTTTACATGTTTTTCGGAACTTTAAGTTTTGAC
++25
+993587==9;><;;22?>>??@?994=<:?@<@@
+@27
+CAAGAGTTACAGCCCTTTTCCACAAGCCTGTCGCCTGTCCAA
++27
+777;55646;6=<=>>>8>@?;<@<6<=42(98306897A@?
+@28
+GGAGAAGGTACTTGTACCCTCACCAATAGAAGAATTTCATGTACCCACAT
++28
+7785:=:::@?@@?B<BBBEEEEE>;A>ABBB>6;@@@4<<==:@@AA9>
+@29
+ACTGAGGTCTAACGATTCTAGTTTAC
++29
+222256779<<4==;===<<>>>><@
+@30
+TTACTTCAGCCGTACCATACA
++30
+339352556?@A@;??>>?>?
+@33
+CAGGACTCCAACTTGTAACTATTC
++33
+++*+-93315<<=<<?>>?A<?@@
+@34
+GAGTCTAAACTGCTCTTCTTCGCTTT
++34
+/+./42032A;@?@>><><0787:>?
+@35
+AAGTACGTTTGCAGAAACTGTGTTGCACAACTGAAAAAACGCCGATCACT
++35
+5365<=<<=AA@A@?@4A@>>>>?A@A2@<=48=>>>763:*::??>>?A
+@37
+ATAGAGTTGTGCTGCATTTTTAAGAATTC
++37
+///-10163>><><@@<?@<9<?>>>AA?
+@40
+GTTGAAACAGTGAGAACTGTGAAA
++40
+5555;;:<7BBBBBBBB?B?@??@
+@42
+CGACCTTGAGTTTGATTTCTCTGACAATGAACGGAC
++42
+30.089888<9=<<>8>>7>A@@AE<EEE<@@@???
+@43
+GGCTCAAATGATGGGTATCTTGGATGTAT
++43
+7777::<<3A?@?AA?9@@?<<@??>@?@
+@44
+GTTAAATCGATCTCCGACTATGCCGTTTTGCAATATACTCTATGATCAAG
++44
+2222::<;;EEE?E>@9@???@AA@@?A>>?;49:=*:<@<@?999974=
+@2
+GGTAATGAAATAAAATGTACAT
++2
+5555;<;:;A@=>4-/=<<AAA
+@3
+TTATGCATAATTAAAATAATTATTCTCTG
++3
+:::8<=<<=EEEEE5869/2?@@;EEE?E
+@4
+ATCCCAGCCCACTCAGCCTGTTAAGTTCGCT
++4
+5555978;:E>E;E?9@@?AA:AA;=8=>?@
+@5
+ATGACCAGAAGATCAAAACAGAACTCCTT
++5
+7777===<<<<<>?EEEEEA9@:@@AAA?
+@6
+TGTATTGAATGGAAGTTTTGCTTGCAACAATCTTCCAACTATTCATTGTA
++6
+6856::7::>>4=4:74::8:=9,=9<<<;;@@76:;:;48>><:::<07
+@8
+GGGTAACTCTTATTAATTTTTGTATTTGAAAAAGAAAAAACAATCTG
++8
+15*/898991<<=:=>E?EEEEEE@>6><<1<):2222244444@@@
+@10
+ATATTGTGGAAAAGGCAATCCAT
++10
+8897967669=8<9A?>A?@AA?
+@13
+TTATGCAAACGATAGTTAACGAGAAAGAGATTA
++13
+777/:;;;7EEEEBAAA9A@AA77?<;<9@?<@
+@14
+TGAAAAGACCGAGTTAGGAAGTT
++14
+323376933A????@@9<@@A@?
+@15
+GTCAAAGTTAAACCCCATGAATAAAAGCATGCAGTCAACTAAAAGTTAAC
++15
+323275686=,<<:>>>>=::<924,,<:???>?7;;2;<;4=<94<:78
+@16
+GTTTCAGCGAGAAAAATTACCCAGAGGACTTCGAGTCATAATGACGTTTG
++16
+7776<;;;3@?@<;,,,<<@@A???@<24.5852::=+97660777;;7A
+@19
+ACTTGCATCGCCACCAAATTC
++19
+111189888?>???6<=?>??
+@25
+TTTTTACATGTTTTTCGGAACTTTAAGTTTTGAC
++25
+993587==9;><;;22?>>??@?994=<:?@<@@
+@27
+CAAGAGTTACAGCCCTTTTCCACAAGCCTGTCGCCTGTCCAA
++27
+777;55646;6=<=>>>8>@?;<@<6<=42(98306897A@?
+@28
+GGAGAAGGTACTTGTACCCTCACCAATAGAAGAATTTCATGTACCCACAT
++28
+7785:=:::@?@@?B<BBBEEEEE>;A>ABBB>6;@@@4<<==:@@AA9>
+@29
+ACTGAGGTCTAACGATTCTAGTTTAC
++29
+222256779<<4==;===<<>>>><@
+@30
+TTACTTCAGCCGTACCATACA
++30
+339352556?@A@;??>>?>?
+@33
+CAGGACTCCAACTTGTAACTATTC
++33
+++*+-93315<<=<<?>>?A<?@@
+@34
+GAGTCTAAACTGCTCTTCTTCGCTTT
++34
+/+./42032A;@?@>><><0787:>?
+@35
+AAGTACGTTTGCAGAAACTGTGTTGCACAACTGAAAAAACGCCGATCACT
++35
+5365<=<<=AA@A@?@4A@>>>>?A@A2@<=48=>>>763:*::??>>?A
+@37
+ATAGAGTTGTGCTGCATTTTTAAGAATTC
++37
+///-10163>><><@@<?@<9<?>>>AA?
+@40
+GTTGAAACAGTGAGAACTGTGAAA
++40
+5555;;:<7BBBBBBBB?B?@??@
+@42
+CGACCTTGAGTTTGATTTCTCTGACAATGAACGGAC
++42
+30.089888<9=<<>8>>7>A@@AE<EEE<@@@???
+@43
+GGCTCAAATGATGGGTATCTTGGATGTAT
++43
+7777::<<3A?@?AA?9@@?<<@??>@?@
+@44
+GTTAAATCGATCTCCGACTATGCCGTTTTGCAATATACTCTATGATCAAG
++44
+2222::<;;EEE?E>@9@???@AA@@?A>>?;49:=*:<@<@?999974=
b
diff -r 000000000000 -r f33e9e6a6c88 test-data/r1.fastq.gz
b
Binary file test-data/r1.fastq.gz has changed
b
diff -r 000000000000 -r f33e9e6a6c88 test-data/r1_dedup.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/r1_dedup.fastq Wed Nov 23 07:49:05 2016 -0500
b
@@ -0,0 +1,100 @@
+@2
+GGTAATGAAATAAAATGTACAT
++2
+5555;<;:;A@=>4-/=<<AAA
+@3
+TTATGCATAATTAAAATAATTATTCTCTG
++3
+:::8<=<<=EEEEE5869/2?@@;EEE?E
+@4
+ATCCCAGCCCACTCAGCCTGTTAAGTTCGCT
++4
+5555978;:E>E;E?9@@?AA:AA;=8=>?@
+@5
+ATGACCAGAAGATCAAAACAGAACTCCTT
++5
+7777===<<<<<>?EEEEEA9@:@@AAA?
+@6
+TGTATTGAATGGAAGTTTTGCTTGCAACAATCTTCCAACTATTCATTGTA
++6
+6856::7::>>4=4:74::8:=9,=9<<<;;@@76:;:;48>><:::<07
+@8
+GGGTAACTCTTATTAATTTTTGTATTTGAAAAAGAAAAAACAATCTG
++8
+15*/898991<<=:=>E?EEEEEE@>6><<1<):2222244444@@@
+@10
+ATATTGTGGAAAAGGCAATCCAT
++10
+8897967669=8<9A?>A?@AA?
+@13
+TTATGCAAACGATAGTTAACGAGAAAGAGATTA
++13
+777/:;;;7EEEEBAAA9A@AA77?<;<9@?<@
+@14
+TGAAAAGACCGAGTTAGGAAGTT
++14
+323376933A????@@9<@@A@?
+@15
+GTCAAAGTTAAACCCCATGAATAAAAGCATGCAGTCAACTAAAAGTTAAC
++15
+323275686=,<<:>>>>=::<924,,<:???>?7;;2;<;4=<94<:78
+@16
+GTTTCAGCGAGAAAAATTACCCAGAGGACTTCGAGTCATAATGACGTTTG
++16
+7776<;;;3@?@<;,,,<<@@A???@<24.5852::=+97660777;;7A
+@19
+ACTTGCATCGCCACCAAATTC
++19
+111189888?>???6<=?>??
+@25
+TTTTTACATGTTTTTCGGAACTTTAAGTTTTGAC
++25
+993587==9;><;;22?>>??@?994=<:?@<@@
+@27
+CAAGAGTTACAGCCCTTTTCCACAAGCCTGTCGCCTGTCCAA
++27
+777;55646;6=<=>>>8>@?;<@<6<=42(98306897A@?
+@28
+GGAGAAGGTACTTGTACCCTCACCAATAGAAGAATTTCATGTACCCACAT
++28
+7785:=:::@?@@?B<BBBEEEEE>;A>ABBB>6;@@@4<<==:@@AA9>
+@29
+ACTGAGGTCTAACGATTCTAGTTTAC
++29
+222256779<<4==;===<<>>>><@
+@30
+TTACTTCAGCCGTACCATACA
++30
+339352556?@A@;??>>?>?
+@33
+CAGGACTCCAACTTGTAACTATTC
++33
+++*+-93315<<=<<?>>?A<?@@
+@34
+GAGTCTAAACTGCTCTTCTTCGCTTT
++34
+/+./42032A;@?@>><><0787:>?
+@35
+AAGTACGTTTGCAGAAACTGTGTTGCACAACTGAAAAAACGCCGATCACT
++35
+5365<=<<=AA@A@?@4A@>>>>?A@A2@<=48=>>>763:*::??>>?A
+@37
+ATAGAGTTGTGCTGCATTTTTAAGAATTC
++37
+///-10163>><><@@<?@<9<?>>>AA?
+@40
+GTTGAAACAGTGAGAACTGTGAAA
++40
+5555;;:<7BBBBBBBB?B?@??@
+@42
+CGACCTTGAGTTTGATTTCTCTGACAATGAACGGAC
++42
+30.089888<9=<<>8>>7>A@@AE<EEE<@@@???
+@43
+GGCTCAAATGATGGGTATCTTGGATGTAT
++43
+7777::<<3A?@?AA?9@@?<<@??>@?@
+@44
+GTTAAATCGATCTCCGACTATGCCGTTTTGCAATATACTCTATGATCAAG
++44
+2222::<;;EEE?E>@9@???@AA@@?A>>?;49:=*:<@<@?999974=
b
diff -r 000000000000 -r f33e9e6a6c88 test-data/r1_dedup.fastq.gz
b
Binary file test-data/r1_dedup.fastq.gz has changed
b
diff -r 000000000000 -r f33e9e6a6c88 test-data/r2.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/r2.fastq Wed Nov 23 07:49:05 2016 -0500
b
@@ -0,0 +1,200 @@
+@17748
+GATTGGTTGTGAAACCGTTTGCGGAGATAGATTATGCCCATTGTGGGTTG
++17748
+GGFGGGFGGGGGGGGGGGGGGDFFCDDE=FEFFDEFGGGFGCFDGFBD:C
+@17749
+GGTTTAAGAGTTTACATGTTTTGGGTAGCGATTACAATAATAAGGTGTTT
++17749
+CDFFDFFEFEGGGFGGGGGGGGGGGGGFGDGGGGGGGEFGGGFGEEEEED
+@17750
+TTGCAACTTGTCAGATGACGTCTCAGCAGCAGCATCTAAATCTTCTTCAT
++17750
+GGGEGGFGGGGGGGEGGGGGGGGFFFFFGGFGGDGG=GGGGDDGEGGGGG
+@17751
+CGTTGCCGTCATTCATCACACCTATCCCGTGAACAGTATGACAGAGTGG
++17751
+GDGGGGGGGDAFDDFFBFFFFFEFAEE?EC?EEE?B?C?BDC?EBDBBD
+@17752
+GTATCGATAGTAACGACAATTTGATTTTCACGCTTCCTTGACAAAACAA
++17752
+GGGGGGGGGGGGGGGGGDGGGGGBGGGGFGCFFDGGGFGFEFDGGGFGC
+@17754
+GTTGCAAAAAGTAGGGCATTTGCTGAAAAACCACCAAAACTTGTCGATCT
++17754
+GGGGGGGGGFGFGGGGGGGFGGGGEGGGGGGGGGGGGFDGEGEGDFGGGG
+@17755
+GGTTTAATTACCACCGTGAAAGGTTTAGAAAGCTGACGTTTCGAGCGTTA
++17755
+GFGGGGGDGFGGGGGGGGGEFFFFGGFGGGGEGGDGGDGGFGGDGGGFG:
+@17756
+TGTTATGGAAAGAGATATGTTTGGAAAACACAATGCAAAGCACTTGTTGA
++17756
+GGGGGGGGFAA@?(:=87;GFFGEDGBGGGFDDDGDGGGFDFGGDEAF?A
+@17757
+TACCGTGCACTAGGGTTTTGACCCAATTTTTAGCAAAATGTTGCAAAAAG
++17757
+GGGGGGGFGGGGGEFFGGGEGGGFFE?FFGGFFFGFGGGGGGGG5?DDC;
+@17758
+TGACTCGTTCCCTGGGTAGGGAGAGAGGACACTGCAATCTCAAACATCGA
++17758
+GFEGGGGGGGGGFGEAADEEECECABEEBGGEFGBG?GDAEEEEEG?GD5
+@17759
+TTTGCACAAAAAGTTGCTTATCTCGAGGAGTTTGACAAGTTTCGGTGGTT
++17759
+GGGGGGGGGGGFGGGGGGGDGGGGDDFFEFGGGGGDDBFDGFDGGFGGFD
+@17760
+GAAAAACAAGAGCAGCAAAGGCAGAGGGAAGATGGGTCGGTGTAGTTGTC
++17760
+GFGGGGGCGGGGDDGGFGBGGDDFGDDBGBGGFGDECFGEEEBEEEEEEF
+@17761
+GCTAAGTTATCGTTTGCGTATTATAACGCGCTCTAGCCATGTCAGTTAAC
++17761
+@EED=E?D?B9EDDE5=BE=EAEE?E?BBDCB?DD5?DDA?5--.@@@6D
+@17762
+CAGTACATAACTATTAAAGAAATCCAACTCACAAATGCTTGGAAGGCGAG
++17762
+GGGGGGGGEGGGGGGGGGGGGFGGGFGFFGGGFGGGDFDGFGGGGFGGGG
+@17763
+TGGCAAACTTTTGACGGTCATAACAAGCAAGTGTGCAGGACGTTTCTAAG
++17763
+GGGGDGGFFFFGGGEGAGFFGGGGGGGFDGDEGBFD?DDGEEFFFAB?F?
+@17764
+TTCATCCACAGTGGACAGAAACAACATCATTCTTACTGAGTTGTTGTTTC
++17764
+GGGFGGGFGGGEGGFGAGGGGGGGGEGEFFGGGGGEGGGEGDGGBGGGD;
+@17765
+TCCTCTCAGATTTGGGTACTTTCAATTGCCGAAAACTGATTTTGAATACT
++17765
+?ACA5CC?CD=DDD=;;C@?A>A-AA?A?CAAA-DDDDDADD=A5:?5::
+@17766
+AGAAACGATTTAAAGCCATATAGCCCCCGAAACTCAGACTTTATCCTTCC
++17766
+GGGGGGGGGGGFEGGGGGGGGGGFGGGGGGGGGGFGFGGFGGFDFFBFFA
+@17767
+GATAAGATGACGTACTCTCCTAGATACGTCTTCGAATCTGAGACGGTCTC
++17767
+DFEF?FGGGGGGBGGGGGGDGGGF?=FFAGAGGFF-FG5ED?BDEC:ECA
+@17768
+TTGTCGATCTCCTCGAGATAAGCAACTTTTTCTGCAAAGTACCGTGCACT
++17768
+FGGGGGGGGGGGGGF=EAGFGGEGFDGFGG?GGFGGGGGDEEDFC:AEDA
+@17769
+ACTACACGACTGTCCCAGTCTGTAGTGGATCGTGCTAGTGTCTCATTT
++17769
+GGGGGBGGGFFFFFGGGGGGGGGGGGGGFGG?DE?C?CCBB:BAB@CC
+@17770
+AGAGTGTTTTCCTTGTCACGCCTTATTAAGATAAAAGTCTGAAACAGTTC
++17770
+FFDFGGGGGFFGGGDFGGGEGFGGGGGBGAGGGFGEGGGFDE?EFFDFD5
+@17771
+CCATTGGAGACATGTTTATGAACGAATGTGAACAGGTATTTATCATACAA
++17771
+GGGGGGGFFGGGFGGGGGGGGDGGFDDGGFDGGGGADGGFFDFFDE?EDD
+@17772
+TCGTCATCTGTGCATCTGTCCTCTAATAGATCATAGGCGAGAACTAATC
++17772
+CCC@DD?BD=@CCCADDB=DCDCDDDDC?DB:BBBD?A?CA?B5:=7@5
+@17773
+GTTGTTCACACTAGAGACATAAATTAAAACATCAAAACAAGGCTCTTTTT
++17773
+BD=B?DBB?DADCDDE:AEBDABBEEEE=?DDABEEEADC8:B5DDDADC
+@17748
+GATTGGTTGTGAAACCGTTTGCGGAGATAGATTATGCCCATTGTGGGTTG
++17748
+GGFGGGFGGGGGGGGGGGGGGDFFCDDE=FEFFDEFGGGFGCFDGFBD:C
+@17749
+GGTTTAAGAGTTTACATGTTTTGGGTAGCGATTACAATAATAAGGTGTTT
++17749
+CDFFDFFEFEGGGFGGGGGGGGGGGGGFGDGGGGGGGEFGGGFGEEEEED
+@17750
+TTGCAACTTGTCAGATGACGTCTCAGCAGCAGCATCTAAATCTTCTTCAT
++17750
+GGGEGGFGGGGGGGEGGGGGGGGFFFFFGGFGGDGG=GGGGDDGEGGGGG
+@17751
+CGTTGCCGTCATTCATCACACCTATCCCGTGAACAGTATGACAGAGTGG
++17751
+GDGGGGGGGDAFDDFFBFFFFFEFAEE?EC?EEE?B?C?BDC?EBDBBD
+@17752
+GTATCGATAGTAACGACAATTTGATTTTCACGCTTCCTTGACAAAACAA
++17752
+GGGGGGGGGGGGGGGGGDGGGGGBGGGGFGCFFDGGGFGFEFDGGGFGC
+@17754
+GTTGCAAAAAGTAGGGCATTTGCTGAAAAACCACCAAAACTTGTCGATCT
++17754
+GGGGGGGGGFGFGGGGGGGFGGGGEGGGGGGGGGGGGFDGEGEGDFGGGG
+@17755
+GGTTTAATTACCACCGTGAAAGGTTTAGAAAGCTGACGTTTCGAGCGTTA
++17755
+GFGGGGGDGFGGGGGGGGGEFFFFGGFGGGGEGGDGGDGGFGGDGGGFG:
+@17756
+TGTTATGGAAAGAGATATGTTTGGAAAACACAATGCAAAGCACTTGTTGA
++17756
+GGGGGGGGFAA@?(:=87;GFFGEDGBGGGFDDDGDGGGFDFGGDEAF?A
+@17757
+TACCGTGCACTAGGGTTTTGACCCAATTTTTAGCAAAATGTTGCAAAAAG
++17757
+GGGGGGGFGGGGGEFFGGGEGGGFFE?FFGGFFFGFGGGGGGGG5?DDC;
+@17758
+TGACTCGTTCCCTGGGTAGGGAGAGAGGACACTGCAATCTCAAACATCGA
++17758
+GFEGGGGGGGGGFGEAADEEECECABEEBGGEFGBG?GDAEEEEEG?GD5
+@17759
+TTTGCACAAAAAGTTGCTTATCTCGAGGAGTTTGACAAGTTTCGGTGGTT
++17759
+GGGGGGGGGGGFGGGGGGGDGGGGDDFFEFGGGGGDDBFDGFDGGFGGFD
+@17760
+GAAAAACAAGAGCAGCAAAGGCAGAGGGAAGATGGGTCGGTGTAGTTGTC
++17760
+GFGGGGGCGGGGDDGGFGBGGDDFGDDBGBGGFGDECFGEEEBEEEEEEF
+@17761
+GCTAAGTTATCGTTTGCGTATTATAACGCGCTCTAGCCATGTCAGTTAAC
++17761
+@EED=E?D?B9EDDE5=BE=EAEE?E?BBDCB?DD5?DDA?5--.@@@6D
+@17762
+CAGTACATAACTATTAAAGAAATCCAACTCACAAATGCTTGGAAGGCGAG
++17762
+GGGGGGGGEGGGGGGGGGGGGFGGGFGFFGGGFGGGDFDGFGGGGFGGGG
+@17763
+TGGCAAACTTTTGACGGTCATAACAAGCAAGTGTGCAGGACGTTTCTAAG
++17763
+GGGGDGGFFFFGGGEGAGFFGGGGGGGFDGDEGBFD?DDGEEFFFAB?F?
+@17764
+TTCATCCACAGTGGACAGAAACAACATCATTCTTACTGAGTTGTTGTTTC
++17764
+GGGFGGGFGGGEGGFGAGGGGGGGGEGEFFGGGGGEGGGEGDGGBGGGD;
+@17765
+TCCTCTCAGATTTGGGTACTTTCAATTGCCGAAAACTGATTTTGAATACT
++17765
+?ACA5CC?CD=DDD=;;C@?A>A-AA?A?CAAA-DDDDDADD=A5:?5::
+@17766
+AGAAACGATTTAAAGCCATATAGCCCCCGAAACTCAGACTTTATCCTTCC
++17766
+GGGGGGGGGGGFEGGGGGGGGGGFGGGGGGGGGGFGFGGFGGFDFFBFFA
+@17767
+GATAAGATGACGTACTCTCCTAGATACGTCTTCGAATCTGAGACGGTCTC
++17767
+DFEF?FGGGGGGBGGGGGGDGGGF?=FFAGAGGFF-FG5ED?BDEC:ECA
+@17768
+TTGTCGATCTCCTCGAGATAAGCAACTTTTTCTGCAAAGTACCGTGCACT
++17768
+FGGGGGGGGGGGGGF=EAGFGGEGFDGFGG?GGFGGGGGDEEDFC:AEDA
+@17769
+ACTACACGACTGTCCCAGTCTGTAGTGGATCGTGCTAGTGTCTCATTT
++17769
+GGGGGBGGGFFFFFGGGGGGGGGGGGGGFGG?DE?C?CCBB:BAB@CC
+@17770
+AGAGTGTTTTCCTTGTCACGCCTTATTAAGATAAAAGTCTGAAACAGTTC
++17770
+FFDFGGGGGFFGGGDFGGGEGFGGGGGBGAGGGFGEGGGFDE?EFFDFD5
+@17771
+CCATTGGAGACATGTTTATGAACGAATGTGAACAGGTATTTATCATACAA
++17771
+GGGGGGGFFGGGFGGGGGGGGDGGFDDGGFDGGGGADGGFFDFFDE?EDD
+@17772
+TCGTCATCTGTGCATCTGTCCTCTAATAGATCATAGGCGAGAACTAATC
++17772
+CCC@DD?BD=@CCCADDB=DCDCDDDDC?DB:BBBD?A?CA?B5:=7@5
+@17773
+GTTGTTCACACTAGAGACATAAATTAAAACATCAAAACAAGGCTCTTTTT
++17773
+BD=B?DBB?DADCDDE:AEBDABBEEEE=?DDABEEEADC8:B5DDDADC
b
diff -r 000000000000 -r f33e9e6a6c88 test-data/r2.fastq.gz
b
Binary file test-data/r2.fastq.gz has changed
b
diff -r 000000000000 -r f33e9e6a6c88 test-data/r2_dedup.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/r2_dedup.fastq Wed Nov 23 07:49:05 2016 -0500
b
@@ -0,0 +1,100 @@
+@17748
+GATTGGTTGTGAAACCGTTTGCGGAGATAGATTATGCCCATTGTGGGTTG
++17748
+GGFGGGFGGGGGGGGGGGGGGDFFCDDE=FEFFDEFGGGFGCFDGFBD:C
+@17749
+GGTTTAAGAGTTTACATGTTTTGGGTAGCGATTACAATAATAAGGTGTTT
++17749
+CDFFDFFEFEGGGFGGGGGGGGGGGGGFGDGGGGGGGEFGGGFGEEEEED
+@17750
+TTGCAACTTGTCAGATGACGTCTCAGCAGCAGCATCTAAATCTTCTTCAT
++17750
+GGGEGGFGGGGGGGEGGGGGGGGFFFFFGGFGGDGG=GGGGDDGEGGGGG
+@17751
+CGTTGCCGTCATTCATCACACCTATCCCGTGAACAGTATGACAGAGTGG
++17751
+GDGGGGGGGDAFDDFFBFFFFFEFAEE?EC?EEE?B?C?BDC?EBDBBD
+@17752
+GTATCGATAGTAACGACAATTTGATTTTCACGCTTCCTTGACAAAACAA
++17752
+GGGGGGGGGGGGGGGGGDGGGGGBGGGGFGCFFDGGGFGFEFDGGGFGC
+@17754
+GTTGCAAAAAGTAGGGCATTTGCTGAAAAACCACCAAAACTTGTCGATCT
++17754
+GGGGGGGGGFGFGGGGGGGFGGGGEGGGGGGGGGGGGFDGEGEGDFGGGG
+@17755
+GGTTTAATTACCACCGTGAAAGGTTTAGAAAGCTGACGTTTCGAGCGTTA
++17755
+GFGGGGGDGFGGGGGGGGGEFFFFGGFGGGGEGGDGGDGGFGGDGGGFG:
+@17756
+TGTTATGGAAAGAGATATGTTTGGAAAACACAATGCAAAGCACTTGTTGA
++17756
+GGGGGGGGFAA@?(:=87;GFFGEDGBGGGFDDDGDGGGFDFGGDEAF?A
+@17757
+TACCGTGCACTAGGGTTTTGACCCAATTTTTAGCAAAATGTTGCAAAAAG
++17757
+GGGGGGGFGGGGGEFFGGGEGGGFFE?FFGGFFFGFGGGGGGGG5?DDC;
+@17758
+TGACTCGTTCCCTGGGTAGGGAGAGAGGACACTGCAATCTCAAACATCGA
++17758
+GFEGGGGGGGGGFGEAADEEECECABEEBGGEFGBG?GDAEEEEEG?GD5
+@17759
+TTTGCACAAAAAGTTGCTTATCTCGAGGAGTTTGACAAGTTTCGGTGGTT
++17759
+GGGGGGGGGGGFGGGGGGGDGGGGDDFFEFGGGGGDDBFDGFDGGFGGFD
+@17760
+GAAAAACAAGAGCAGCAAAGGCAGAGGGAAGATGGGTCGGTGTAGTTGTC
++17760
+GFGGGGGCGGGGDDGGFGBGGDDFGDDBGBGGFGDECFGEEEBEEEEEEF
+@17761
+GCTAAGTTATCGTTTGCGTATTATAACGCGCTCTAGCCATGTCAGTTAAC
++17761
+@EED=E?D?B9EDDE5=BE=EAEE?E?BBDCB?DD5?DDA?5--.@@@6D
+@17762
+CAGTACATAACTATTAAAGAAATCCAACTCACAAATGCTTGGAAGGCGAG
++17762
+GGGGGGGGEGGGGGGGGGGGGFGGGFGFFGGGFGGGDFDGFGGGGFGGGG
+@17763
+TGGCAAACTTTTGACGGTCATAACAAGCAAGTGTGCAGGACGTTTCTAAG
++17763
+GGGGDGGFFFFGGGEGAGFFGGGGGGGFDGDEGBFD?DDGEEFFFAB?F?
+@17764
+TTCATCCACAGTGGACAGAAACAACATCATTCTTACTGAGTTGTTGTTTC
++17764
+GGGFGGGFGGGEGGFGAGGGGGGGGEGEFFGGGGGEGGGEGDGGBGGGD;
+@17765
+TCCTCTCAGATTTGGGTACTTTCAATTGCCGAAAACTGATTTTGAATACT
++17765
+?ACA5CC?CD=DDD=;;C@?A>A-AA?A?CAAA-DDDDDADD=A5:?5::
+@17766
+AGAAACGATTTAAAGCCATATAGCCCCCGAAACTCAGACTTTATCCTTCC
++17766
+GGGGGGGGGGGFEGGGGGGGGGGFGGGGGGGGGGFGFGGFGGFDFFBFFA
+@17767
+GATAAGATGACGTACTCTCCTAGATACGTCTTCGAATCTGAGACGGTCTC
++17767
+DFEF?FGGGGGGBGGGGGGDGGGF?=FFAGAGGFF-FG5ED?BDEC:ECA
+@17768
+TTGTCGATCTCCTCGAGATAAGCAACTTTTTCTGCAAAGTACCGTGCACT
++17768
+FGGGGGGGGGGGGGF=EAGFGGEGFDGFGG?GGFGGGGGDEEDFC:AEDA
+@17769
+ACTACACGACTGTCCCAGTCTGTAGTGGATCGTGCTAGTGTCTCATTT
++17769
+GGGGGBGGGFFFFFGGGGGGGGGGGGGGFGG?DE?C?CCBB:BAB@CC
+@17770
+AGAGTGTTTTCCTTGTCACGCCTTATTAAGATAAAAGTCTGAAACAGTTC
++17770
+FFDFGGGGGFFGGGDFGGGEGFGGGGGBGAGGGFGEGGGFDE?EFFDFD5
+@17771
+CCATTGGAGACATGTTTATGAACGAATGTGAACAGGTATTTATCATACAA
++17771
+GGGGGGGFFGGGFGGGGGGGGDGGFDDGGFDGGGGADGGFFDFFDE?EDD
+@17772
+TCGTCATCTGTGCATCTGTCCTCTAATAGATCATAGGCGAGAACTAATC
++17772
+CCC@DD?BD=@CCCADDB=DCDCDDDDC?DB:BBBD?A?CA?B5:=7@5
+@17773
+GTTGTTCACACTAGAGACATAAATTAAAACATCAAAACAAGGCTCTTTTT
++17773
+BD=B?DBB?DADCDDE:AEBDABBEEEE=?DDABEEEADC8:B5DDDADC
b
diff -r 000000000000 -r f33e9e6a6c88 test-data/r2_dedup.fastq.gz
b
Binary file test-data/r2_dedup.fastq.gz has changed
b
diff -r 000000000000 -r f33e9e6a6c88 test/test_dedup_hash.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_dedup_hash.py Wed Nov 23 07:49:05 2016 -0500
[
@@ -0,0 +1,63 @@
+import hashlib
+import inspect
+import os
+import subprocess
+import sys
+import tempfile
+
+
+# Resolve this test file's directory and its parent so that the paths below
+# do not depend on the current working directory.
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parent_dir = os.path.dirname(currentdir)
+# Put the 'dedup_hash/' directory next to this test's parent first on
+# sys.path so the import below is resolved from there.
+sys.path.insert(0, os.path.join(parent_dir, 'dedup_hash/'))
+import dedup_hash
+
+
+# Directory holding the FASTQ fixtures, followed by fixture file names that
+# are relative to it.
+TEST_DATA_DIR = os.path.join(parent_dir, 'test-data/')
+UNCOMPRESSED_IN = ['r1.fastq', 'r2.fastq']
+COMPRESSED_IN = ['r1.fastq.gz', 'r2.fastq.gz']
+# Presumably the expected (reference) outputs for the inputs above -- the
+# names match the '*_dedup.fastq' fixtures; confirm against the test data.
+UNCOMPRESSED_OUT = ['r1_dedup.fastq', 'r2_dedup.fastq']
+SINGLE_IN = ['r1.fastq']
+SINGLE_OUT = ['r1_dedup.fastq']
+
+
+
+def run(input):
+    args = prepare_args(input)
+    run_dedup(args)
+    compare_output(args)
+
+
+def compare_output(args):
+    ref_out1 = os.path.join(TEST_DATA_DIR, 'r1_dedup.fastq')
+    try:
+        assert md5(args['outfiles'][0]) == md5(ref_out1)
+    except AssertionError:
+        cmd = "diff -Nru %s %s" % (args['outfiles'][0], ref_out1)
+        subprocess.check_call(cmd.split(' '))
+    print('all good')
+
+
+def prepare_args(test_files):
+    infiles = [os.path.join(TEST_DATA_DIR, test_file) for test_file in test_files]
+    outfiles = [tempfile.NamedTemporaryFile(delete=False).name for test_file in test_files]  # Same number of output files as input files
+    kwargs = {'infiles': infiles,
+              'outfiles': outfiles,
+              'write_gzip': False}
+    return kwargs
+
+
+def run_dedup(kwargs):
+    fastq_pairs_instance = dedup_hash.get_unique_fastq_instance()
+    fastq_pairs_instance(**kwargs)
+
+def md5(fname):
+    hash_md5 = hashlib.md5()
+    with open(fname, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+
+if __name__ == '__main__':
+    run(UNCOMPRESSED_IN)
+    run(COMPRESSED_IN)
+    run(SINGLE_IN)
b
diff -r 000000000000 -r f33e9e6a6c88 tox.ini
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tox.ini Wed Nov 23 07:49:05 2016 -0500
[
@@ -0,0 +1,17 @@
+# Lint-only tox configuration: every environment below runs flake8.
+[tox]
+envlist = py{27,35}-lint
+# Custom key; referenced below as {[tox]source_dir}.
+source_dir = dedup_hash
+
+# NOTE(review): {[tox]source_dir} is passed to flake8 twice in the command
+# below; the second occurrence looks redundant -- confirm whether another
+# path was intended.  E501 is flake8's "line too long" check.
+[testenv:py27-lint]
+commands = flake8 {[tox]source_dir} {[tox]source_dir} --ignore E501
+skip_install = True
+deps =
+    flake8
+    flake8-import-order
+
+# Same settings as the py27-lint environment.
+[testenv:py35-lint]
+commands = flake8 {[tox]source_dir} {[tox]source_dir} --ignore E501
+skip_install = True
+deps =
+    flake8
+    flake8-import-order