# HG changeset patch # User mvdbeek # Date 1479905345 18000 # Node ID f33e9e6a6c88a01d0dc8c429642bc7e5acae51d2 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty diff -r 000000000000 -r f33e9e6a6c88 HISTORY.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/HISTORY.rst Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,20 @@ +.. :changelog: + +History +------- + +.. to_doc + +--------------------- +0.1.1 (2016-11-23) +--------------------- +* Make python2/3 compatible +* Use smhasher as default hasher and add options for cityhash and hashxx +* Testing enhancements + +* Initial version +--------------------- +0.1.0 (2016-11-16) +--------------------- + +* Initial version diff -r 000000000000 -r f33e9e6a6c88 LICENSE.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE.txt Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,8 @@ +The MIT License (MIT) +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff -r 000000000000 -r f33e9e6a6c88 Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Makefile Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,53 @@ +# Location of virtualenv used for development. +VENV?=.venv +BRANCH=curie2 +GALAXY_REPO=https://github.com/mvdbeek/galaxy.git +CONDA_PREFIX?=.conda +CONDA_PREFIX_PATH?=`readlink -e .conda` +# Source virtualenv to execute command (flake8, sphinx, twine, etc...) +IN_VENV=if [ -f $(VENV)/bin/activate ]; then . $(VENV)/bin/activate; fi; +PLANEMO=$(IN_VENV) planemo + +setup-venv: ## setup a development virutalenv in current directory + if [ ! -d $(VENV) ]; then virtualenv $(VENV); exit; fi; + $(IN_VENV) pip install -r requirements.txt; + $(IN_VENV) pip install planemo + +lint-35: setup-venv + $(IN_VENV) pip install tox && $(IN_VENV) tox -e py35-lint + +lint-27: setup-venv + $(IN_VENV) pip install tox && $(IN_VENV) tox -e py27-lint + +db: + if [ ! -d db_gx_rev_0127.sqlite ]; then wget https://github.com/jmchilton/galaxy-downloads/raw/master/db_gx_rev_0127.sqlite ; exit; fi; + +setup_galaxy_clone: + if [ ! -d .galaxy ]; then git clone --depth=50 --branch $(BRANCH) $(GALAXY_REPO) .galaxy; exit; fi; + +planemo-test: db setup-venv setup_galaxy_clone + if [ ! -d $(CONDA_PREFIX) ]; then $(PLANEMO) conda_init --conda_prefix $(CONDA_PREFIX);fi && \ + $(PLANEMO) conda_install --conda_prefix $(CONDA_PREFIX_PATH) . && \ + $(PLANEMO) test \ + --galaxy_database_seed db_gx_rev_0127.sqlite \ + --galaxy_root .galaxy \ + --galaxy_source $(GALAXY_REPO) \ + --galaxy_branch $(BRANCH) \ + --conda_dependency_resolution \ + --conda_prefix $(CONDA_PREFIX_PATH) + +planemo-serve: db setup-venv setup_galaxy_clone + $(PLANEMO) serve \ + --galaxy_database_seed db_gx_rev_0127.sqlite \ + --galaxy_root .galaxy \ + --galaxy_source $(GALAXY_REPO) \ + --galaxy_branch $(BRANCH) \ + --conda_auto_install \ + --conda_dependency_resolution \ + --conda_prefix $(CONDA_PREFIX_PATH) + +py-test: + $(IN_VENV) python test/test_dedup_hash.py + +clean: + rm -Rf *.sqlite* .venv .conda .galaxy dist/ *.egg-info tool_test_output.* .tox || true diff -r 000000000000 -r f33e9e6a6c88 README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,26 @@ +.. image:: https://travis-ci.org/mvdbeek/dedup_hash.svg?branch=master + :target: https://travis-ci.org/mvdbeek/dedup_hash + +dedup_hash +---------------------------- + + +This is a commandline utility to remove exact duplicate reads +from paired-end fastq files. Reads are assumed to be in 2 separate +files. Read sequences are then concatenated and a short hash is calculated on +the concatenated sequence. If the hash has been previsouly seen the read will +be dropped from the output file. This means that reads that have the same +start and end coordinate, but differ in lengths will not be removed (but those +will be "flattened" to at most 1 occurence). + +This algorithm is very simple and fast, and saves memory as compared to +reading the whole fastq file into memory, such as fastuniq does. + +Installation +------------ + +depdup_city relies on the cityhash python package, +which supports python-2.7 exclusively. + +``pip install dedup_hash`` + diff -r 000000000000 -r f33e9e6a6c88 dedup_hash.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dedup_hash.xml Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,114 @@ + + with fast and memory-efficient sequence hashes + + smhasher + + + + + + + + + + + + + + + + + + + + + + + + + readtype['single_or_paired'] == 'se' + + + + + + readtype['single_or_paired'] == 'pe_sep' + + + + + + readtype['single_or_paired'] == 'pe_sep' + + + + + + readtype['single_or_paired'] == 'pe_collection' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + doi:10.1371/journal.pone.0052249 + + diff -r 000000000000 -r f33e9e6a6c88 dedup_hash/__init__.py diff -r 000000000000 -r f33e9e6a6c88 dedup_hash/__init__.pyc Binary file dedup_hash/__init__.pyc has changed diff -r 000000000000 -r f33e9e6a6c88 dedup_hash/dedup_hash.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dedup_hash/dedup_hash.py Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,161 @@ +import argparse +import gzip +import io +import sys +from itertools import cycle +try: + from itertools import izip +except ImportError: + pass # we can simply use zip in python3 + + +class UniqueFastqBase(object): + def __init__(self, + infiles, + outfiles, + write_gzip, + buffer_size=32768, + compresslevel=2, + hash_module="smhasher"): + self.seen_hashes = set() + self.infiles = infiles + self.outfiles = outfiles + self.write_gzip = write_gzip + self.buffer_size = buffer_size + self.compresslevel = compresslevel + self.hash_module = self.import_hash_module(hash_module) + self.cur_fastq_str_r1 = "" + self.cur_fastq_str_r2 = "" + self.cur_uniq = False + self.fastq_cycle = cycle([self.header_one_action, self.seq_action, self.header_two_action, self.qual_action]) + self.infiles = self.get_inputs() + self.outfiles = self.get_outputs() + self.process_files() + self.close_io() + + def import_hash_module(self, hash_module): + if hash_module == "smhasher": + from smhasher import murmur3_x64_64 + return murmur3_x64_64 + if hash_module == "CityHash64": + from cityhash import CityHash64 + return CityHash64 + if hash_module == "hashxx": + from pyhashxx import hashxx + return hashxx + + def get_input(self, infile): + if self._is_gzip(infile): + return io.BufferedReader(gzip.GzipFile(infile, 'rb'), buffer_size=self.buffer_size) + else: + return open(infile) + + def get_inputs(self): + return [self.get_input(infile) for infile in self.infiles] + + def get_outputs(self): + if self.write_gzip: + return [io.BufferedWriter(gzip.GzipFile(outfile, 'wb', compresslevel=self.compresslevel), buffer_size=self.buffer_size) for outfile in self.outfiles] + return [open(outfile, 'w') for outfile in self.outfiles] + + def close_io(self): + [infile.close() for infile in self.infiles] + [outfile.close() for outfile in self.outfiles] + + def _is_gzip(self, infile): + gzip_magic_byte = b"\x1f\x8b\x08" + with open(infile, 'rb') as input: + return gzip_magic_byte == input.read(len(gzip_magic_byte)) + + def process_files(self): + raise Exception('Not implemented') + + def seq_action(self, lines): + cur_hash = self.hash_module("".join(lines)) + if cur_hash in self.seen_hashes: + self.cur_uniq = False + else: + self.seen_hashes.add(cur_hash) + self.cur_uniq = True + self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)] + + def header_one_action(self, lines): + self.cur_uniq = False + self.cur_fastq_strs = lines + + def header_two_action(self, lines): + self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)] + + def qual_action(self, lines): + if self.cur_uniq: + self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)] + [outfile.write(string) for string, outfile in zip(self.cur_fastq_strs, self.outfiles)] + + +class UniqueFastqPairsPy2(UniqueFastqBase): + + def process_files(self): + for items in izip(self.fastq_cycle, *self.infiles): + fastq_item = items[0] + lines = items[1:] + fastq_item(lines) + + +class UniqueFastqPairsPy3(UniqueFastqBase): + + def process_files(self): + for items in zip(self.fastq_cycle, *self.infiles): + fastq_item = items[0] + lines = items[1:] + # The following might be slow, rework this to something smarter + # it it slows down too much. + fastq_item([l if isinstance(l, str) else l.decode() for l in lines]) + + +def get_args(): + parser = argparse.ArgumentParser(description='Get unique reads from fastq files') + parser.add_argument('--r1_in', required=True, help='Read1 input fastq file') + parser.add_argument('--r2_in', required=False, default=None, help='Read2 input fastq file') + parser.add_argument('--r1_out', required=True, help='Read1 output fastq file') + parser.add_argument('--r2_out', required=False, help='Read2 output fastq file') + parser.add_argument('--write_gzip', action='store_true', help="Compress output in gzip format?") + parser.add_argument('--buffer_size', default=32768, type=int, help="Set buffer size for reading gzip files") + parser.add_argument('--compresslevel', default=2, type=int, choices=list(range(1, 10)), help="Set compression level (1: fastest, 9: highest compression)") + parser.add_argument('--algo', default='smhasher', choices=['CityHash64', 'hashxx', 'smhasher'], help='Select hash algorithm') + return parser.parse_args() + + +def get_infiles(args): + if args.r2_in: + return [args.r1_in, args.r2_in] + else: + return [args.r1_in] + + +def get_outfiles(args): + if args.r2_out: + return [args.r1_out, args.r2_out] + else: + return [args.r1_out] + + +def get_unique_fastq_instance(): + if sys.version_info.major == 2: + return UniqueFastqPairsPy2 + elif sys.version_info.major == 3: + return UniqueFastqPairsPy3 + + +def main(): + args = get_args() + UniqueFastqPairs = get_unique_fastq_instance() + UniqueFastqPairs(infiles=get_infiles(args), + outfiles=get_outfiles(args), + write_gzip=args.write_gzip, + buffer_size=args.buffer_size, + compresslevel=args.compresslevel, + hash_module=args.algo) + + +if __name__ == '__main__': + main() diff -r 000000000000 -r f33e9e6a6c88 requirements.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/requirements.txt Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,1 @@ +smhasher diff -r 000000000000 -r f33e9e6a6c88 setup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/setup.py Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,39 @@ +try: + from setuptools import setup +except ImportError: + from distutils.core import setup + +requirements = ['smhasher'] + +ENTRY_POINTS = ''' + [console_scripts] + dedup_hash=dedup_hash.dedup_hash:main +''' + +readme = open('README.rst').read() +history = open('HISTORY.rst').read().replace('.. :changelog:', '') + +setup( + name='dedup_hash', + version='0.1.1', + packages=['dedup_hash'], + install_requires=requirements, + long_description=readme + '\n\n' + history, + entry_points=ENTRY_POINTS, + keywords='Bioinformatics', + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'Environment :: Console', + 'Operating System :: POSIX', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Natural Language :: English', + "Programming Language :: Python :: 2", + 'Programming Language :: Python :: 2.7', + ], + url='https://github.com/mvdbeek/dedup_hash', + license='MIT', + author='Marius van den Beek', + author_email='m.vandenbeek@gmail.com', + description='Finds and discards exact duplicate reads in fastq files.' +) diff -r 000000000000 -r f33e9e6a6c88 test-data/r1.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/r1.fastq Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,200 @@ +@2 +GGTAATGAAATAAAATGTACAT ++2 +5555;<;:;A@=>4-/=<E;E?9@@?AA:AA;=8=>?@ +@5 +ATGACCAGAAGATCAAAACAGAACTCCTT ++5 +7777===<<<<<>?EEEEEA9@:@@AAA? +@6 +TGTATTGAATGGAAGTTTTGCTTGCAACAATCTTCCAACTATTCATTGTA ++6 +6856::7::>>4=4:74::8:=9,=9<<<;;@@76:;:;48>><:::<07 +@8 +GGGTAACTCTTATTAATTTTTGTATTTGAAAAAGAAAAAACAATCTG ++8 +15*/898991<<=:=>E?EEEEEE@>6><<1<):2222244444@@@ +@10 +ATATTGTGGAAAAGGCAATCCAT ++10 +8897967669=8<9A?>A?@AA? +@13 +TTATGCAAACGATAGTTAACGAGAAAGAGATTA ++13 +777/:;;;7EEEEBAAA9A@AA77?<;<9@?<@ +@14 +TGAAAAGACCGAGTTAGGAAGTT ++14 +323376933A????@@9<@@A@? +@15 +GTCAAAGTTAAACCCCATGAATAAAAGCATGCAGTCAACTAAAAGTTAAC ++15 +323275686=,<<:>>>>=::<924,,<:???>?7;;2;<;4=<94<:78 +@16 +GTTTCAGCGAGAAAAATTACCCAGAGGACTTCGAGTCATAATGACGTTTG ++16 +7776<;;;3@?@<;,,,<<@@A???@<24.5852::=+97660777;;7A +@19 +ACTTGCATCGCCACCAAATTC ++19 +111189888?>???6<=?>?? +@25 +TTTTTACATGTTTTTCGGAACTTTAAGTTTTGAC ++25 +993587==9;><;;22?>>??@?994=<:?@<@@ +@27 +CAAGAGTTACAGCCCTTTTCCACAAGCCTGTCGCCTGTCCAA ++27 +777;55646;6=<=>>>8>@?;<@<6<=42(98306897A@? +@28 +GGAGAAGGTACTTGTACCCTCACCAATAGAAGAATTTCATGTACCCACAT ++28 +7785:=:::@?@@?B;A>ABBB>6;@@@4<<==:@@AA9> +@29 +ACTGAGGTCTAACGATTCTAGTTTAC ++29 +222256779<<4==;===<<>>>><@ +@30 +TTACTTCAGCCGTACCATACA ++30 +339352556?@A@;??>>?>? +@33 +CAGGACTCCAACTTGTAACTATTC ++33 +++*+-93315<<=<>?A><><0787:>? +@35 +AAGTACGTTTGCAGAAACTGTGTTGCACAACTGAAAAAACGCCGATCACT ++35 +5365<=<<=AA@A@?@4A@>>>>?A@A2@<=48=>>>763:*::??>>?A +@37 +ATAGAGTTGTGCTGCATTTTTAAGAATTC ++37 +///-10163>><><@@>>AA? +@40 +GTTGAAACAGTGAGAACTGTGAAA ++40 +5555;;:<7BBBBBBBB?B?@??@ +@42 +CGACCTTGAGTTTGATTTCTCTGACAATGAACGGAC ++42 +30.089888<9=<<>8>>7>A@@AE@?@ +@44 +GTTAAATCGATCTCCGACTATGCCGTTTTGCAATATACTCTATGATCAAG ++44 +2222::<;;EEE?E>@9@???@AA@@?A>>?;49:=*:<@<@?999974= +@2 +GGTAATGAAATAAAATGTACAT ++2 +5555;<;:;A@=>4-/=<E;E?9@@?AA:AA;=8=>?@ +@5 +ATGACCAGAAGATCAAAACAGAACTCCTT ++5 +7777===<<<<<>?EEEEEA9@:@@AAA? +@6 +TGTATTGAATGGAAGTTTTGCTTGCAACAATCTTCCAACTATTCATTGTA ++6 +6856::7::>>4=4:74::8:=9,=9<<<;;@@76:;:;48>><:::<07 +@8 +GGGTAACTCTTATTAATTTTTGTATTTGAAAAAGAAAAAACAATCTG ++8 +15*/898991<<=:=>E?EEEEEE@>6><<1<):2222244444@@@ +@10 +ATATTGTGGAAAAGGCAATCCAT ++10 +8897967669=8<9A?>A?@AA? +@13 +TTATGCAAACGATAGTTAACGAGAAAGAGATTA ++13 +777/:;;;7EEEEBAAA9A@AA77?<;<9@?<@ +@14 +TGAAAAGACCGAGTTAGGAAGTT ++14 +323376933A????@@9<@@A@? +@15 +GTCAAAGTTAAACCCCATGAATAAAAGCATGCAGTCAACTAAAAGTTAAC ++15 +323275686=,<<:>>>>=::<924,,<:???>?7;;2;<;4=<94<:78 +@16 +GTTTCAGCGAGAAAAATTACCCAGAGGACTTCGAGTCATAATGACGTTTG ++16 +7776<;;;3@?@<;,,,<<@@A???@<24.5852::=+97660777;;7A +@19 +ACTTGCATCGCCACCAAATTC ++19 +111189888?>???6<=?>?? +@25 +TTTTTACATGTTTTTCGGAACTTTAAGTTTTGAC ++25 +993587==9;><;;22?>>??@?994=<:?@<@@ +@27 +CAAGAGTTACAGCCCTTTTCCACAAGCCTGTCGCCTGTCCAA ++27 +777;55646;6=<=>>>8>@?;<@<6<=42(98306897A@? +@28 +GGAGAAGGTACTTGTACCCTCACCAATAGAAGAATTTCATGTACCCACAT ++28 +7785:=:::@?@@?B;A>ABBB>6;@@@4<<==:@@AA9> +@29 +ACTGAGGTCTAACGATTCTAGTTTAC ++29 +222256779<<4==;===<<>>>><@ +@30 +TTACTTCAGCCGTACCATACA ++30 +339352556?@A@;??>>?>? +@33 +CAGGACTCCAACTTGTAACTATTC ++33 +++*+-93315<<=<>?A><><0787:>? +@35 +AAGTACGTTTGCAGAAACTGTGTTGCACAACTGAAAAAACGCCGATCACT ++35 +5365<=<<=AA@A@?@4A@>>>>?A@A2@<=48=>>>763:*::??>>?A +@37 +ATAGAGTTGTGCTGCATTTTTAAGAATTC ++37 +///-10163>><><@@>>AA? +@40 +GTTGAAACAGTGAGAACTGTGAAA ++40 +5555;;:<7BBBBBBBB?B?@??@ +@42 +CGACCTTGAGTTTGATTTCTCTGACAATGAACGGAC ++42 +30.089888<9=<<>8>>7>A@@AE@?@ +@44 +GTTAAATCGATCTCCGACTATGCCGTTTTGCAATATACTCTATGATCAAG ++44 +2222::<;;EEE?E>@9@???@AA@@?A>>?;49:=*:<@<@?999974= diff -r 000000000000 -r f33e9e6a6c88 test-data/r1.fastq.gz Binary file test-data/r1.fastq.gz has changed diff -r 000000000000 -r f33e9e6a6c88 test-data/r1_dedup.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/r1_dedup.fastq Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,100 @@ +@2 +GGTAATGAAATAAAATGTACAT ++2 +5555;<;:;A@=>4-/=<E;E?9@@?AA:AA;=8=>?@ +@5 +ATGACCAGAAGATCAAAACAGAACTCCTT ++5 +7777===<<<<<>?EEEEEA9@:@@AAA? +@6 +TGTATTGAATGGAAGTTTTGCTTGCAACAATCTTCCAACTATTCATTGTA ++6 +6856::7::>>4=4:74::8:=9,=9<<<;;@@76:;:;48>><:::<07 +@8 +GGGTAACTCTTATTAATTTTTGTATTTGAAAAAGAAAAAACAATCTG ++8 +15*/898991<<=:=>E?EEEEEE@>6><<1<):2222244444@@@ +@10 +ATATTGTGGAAAAGGCAATCCAT ++10 +8897967669=8<9A?>A?@AA? +@13 +TTATGCAAACGATAGTTAACGAGAAAGAGATTA ++13 +777/:;;;7EEEEBAAA9A@AA77?<;<9@?<@ +@14 +TGAAAAGACCGAGTTAGGAAGTT ++14 +323376933A????@@9<@@A@? +@15 +GTCAAAGTTAAACCCCATGAATAAAAGCATGCAGTCAACTAAAAGTTAAC ++15 +323275686=,<<:>>>>=::<924,,<:???>?7;;2;<;4=<94<:78 +@16 +GTTTCAGCGAGAAAAATTACCCAGAGGACTTCGAGTCATAATGACGTTTG ++16 +7776<;;;3@?@<;,,,<<@@A???@<24.5852::=+97660777;;7A +@19 +ACTTGCATCGCCACCAAATTC ++19 +111189888?>???6<=?>?? +@25 +TTTTTACATGTTTTTCGGAACTTTAAGTTTTGAC ++25 +993587==9;><;;22?>>??@?994=<:?@<@@ +@27 +CAAGAGTTACAGCCCTTTTCCACAAGCCTGTCGCCTGTCCAA ++27 +777;55646;6=<=>>>8>@?;<@<6<=42(98306897A@? +@28 +GGAGAAGGTACTTGTACCCTCACCAATAGAAGAATTTCATGTACCCACAT ++28 +7785:=:::@?@@?B;A>ABBB>6;@@@4<<==:@@AA9> +@29 +ACTGAGGTCTAACGATTCTAGTTTAC ++29 +222256779<<4==;===<<>>>><@ +@30 +TTACTTCAGCCGTACCATACA ++30 +339352556?@A@;??>>?>? +@33 +CAGGACTCCAACTTGTAACTATTC ++33 +++*+-93315<<=<>?A><><0787:>? +@35 +AAGTACGTTTGCAGAAACTGTGTTGCACAACTGAAAAAACGCCGATCACT ++35 +5365<=<<=AA@A@?@4A@>>>>?A@A2@<=48=>>>763:*::??>>?A +@37 +ATAGAGTTGTGCTGCATTTTTAAGAATTC ++37 +///-10163>><><@@>>AA? +@40 +GTTGAAACAGTGAGAACTGTGAAA ++40 +5555;;:<7BBBBBBBB?B?@??@ +@42 +CGACCTTGAGTTTGATTTCTCTGACAATGAACGGAC ++42 +30.089888<9=<<>8>>7>A@@AE@?@ +@44 +GTTAAATCGATCTCCGACTATGCCGTTTTGCAATATACTCTATGATCAAG ++44 +2222::<;;EEE?E>@9@???@AA@@?A>>?;49:=*:<@<@?999974= diff -r 000000000000 -r f33e9e6a6c88 test-data/r1_dedup.fastq.gz Binary file test-data/r1_dedup.fastq.gz has changed diff -r 000000000000 -r f33e9e6a6c88 test-data/r2.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/r2.fastq Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,200 @@ +@17748 +GATTGGTTGTGAAACCGTTTGCGGAGATAGATTATGCCCATTGTGGGTTG ++17748 +GGFGGGFGGGGGGGGGGGGGGDFFCDDE=FEFFDEFGGGFGCFDGFBD:C +@17749 +GGTTTAAGAGTTTACATGTTTTGGGTAGCGATTACAATAATAAGGTGTTT ++17749 +CDFFDFFEFEGGGFGGGGGGGGGGGGGFGDGGGGGGGEFGGGFGEEEEED +@17750 +TTGCAACTTGTCAGATGACGTCTCAGCAGCAGCATCTAAATCTTCTTCAT ++17750 +GGGEGGFGGGGGGGEGGGGGGGGFFFFFGGFGGDGG=GGGGDDGEGGGGG +@17751 +CGTTGCCGTCATTCATCACACCTATCCCGTGAACAGTATGACAGAGTGG ++17751 +GDGGGGGGGDAFDDFFBFFFFFEFAEE?EC?EEE?B?C?BDC?EBDBBD +@17752 +GTATCGATAGTAACGACAATTTGATTTTCACGCTTCCTTGACAAAACAA ++17752 +GGGGGGGGGGGGGGGGGDGGGGGBGGGGFGCFFDGGGFGFEFDGGGFGC +@17754 +GTTGCAAAAAGTAGGGCATTTGCTGAAAAACCACCAAAACTTGTCGATCT ++17754 +GGGGGGGGGFGFGGGGGGGFGGGGEGGGGGGGGGGGGFDGEGEGDFGGGG +@17755 +GGTTTAATTACCACCGTGAAAGGTTTAGAAAGCTGACGTTTCGAGCGTTA ++17755 +GFGGGGGDGFGGGGGGGGGEFFFFGGFGGGGEGGDGGDGGFGGDGGGFG: +@17756 +TGTTATGGAAAGAGATATGTTTGGAAAACACAATGCAAAGCACTTGTTGA ++17756 +GGGGGGGGFAA@?(:=87;GFFGEDGBGGGFDDDGDGGGFDFGGDEAF?A +@17757 +TACCGTGCACTAGGGTTTTGACCCAATTTTTAGCAAAATGTTGCAAAAAG ++17757 +GGGGGGGFGGGGGEFFGGGEGGGFFE?FFGGFFFGFGGGGGGGG5?DDC; +@17758 +TGACTCGTTCCCTGGGTAGGGAGAGAGGACACTGCAATCTCAAACATCGA ++17758 +GFEGGGGGGGGGFGEAADEEECECABEEBGGEFGBG?GDAEEEEEG?GD5 +@17759 +TTTGCACAAAAAGTTGCTTATCTCGAGGAGTTTGACAAGTTTCGGTGGTT ++17759 +GGGGGGGGGGGFGGGGGGGDGGGGDDFFEFGGGGGDDBFDGFDGGFGGFD +@17760 +GAAAAACAAGAGCAGCAAAGGCAGAGGGAAGATGGGTCGGTGTAGTTGTC ++17760 +GFGGGGGCGGGGDDGGFGBGGDDFGDDBGBGGFGDECFGEEEBEEEEEEF +@17761 +GCTAAGTTATCGTTTGCGTATTATAACGCGCTCTAGCCATGTCAGTTAAC ++17761 +@EED=E?D?B9EDDE5=BE=EAEE?E?BBDCB?DD5?DDA?5--.@@@6D +@17762 +CAGTACATAACTATTAAAGAAATCCAACTCACAAATGCTTGGAAGGCGAG ++17762 +GGGGGGGGEGGGGGGGGGGGGFGGGFGFFGGGFGGGDFDGFGGGGFGGGG +@17763 +TGGCAAACTTTTGACGGTCATAACAAGCAAGTGTGCAGGACGTTTCTAAG ++17763 +GGGGDGGFFFFGGGEGAGFFGGGGGGGFDGDEGBFD?DDGEEFFFAB?F? +@17764 +TTCATCCACAGTGGACAGAAACAACATCATTCTTACTGAGTTGTTGTTTC ++17764 +GGGFGGGFGGGEGGFGAGGGGGGGGEGEFFGGGGGEGGGEGDGGBGGGD; +@17765 +TCCTCTCAGATTTGGGTACTTTCAATTGCCGAAAACTGATTTTGAATACT ++17765 +?ACA5CC?CD=DDD=;;C@?A>A-AA?A?CAAA-DDDDDADD=A5:?5:: +@17766 +AGAAACGATTTAAAGCCATATAGCCCCCGAAACTCAGACTTTATCCTTCC ++17766 +GGGGGGGGGGGFEGGGGGGGGGGFGGGGGGGGGGFGFGGFGGFDFFBFFA +@17767 +GATAAGATGACGTACTCTCCTAGATACGTCTTCGAATCTGAGACGGTCTC ++17767 +DFEF?FGGGGGGBGGGGGGDGGGF?=FFAGAGGFF-FG5ED?BDEC:ECA +@17768 +TTGTCGATCTCCTCGAGATAAGCAACTTTTTCTGCAAAGTACCGTGCACT ++17768 +FGGGGGGGGGGGGGF=EAGFGGEGFDGFGG?GGFGGGGGDEEDFC:AEDA +@17769 +ACTACACGACTGTCCCAGTCTGTAGTGGATCGTGCTAGTGTCTCATTT ++17769 +GGGGGBGGGFFFFFGGGGGGGGGGGGGGFGG?DE?C?CCBB:BAB@CC +@17770 +AGAGTGTTTTCCTTGTCACGCCTTATTAAGATAAAAGTCTGAAACAGTTC ++17770 +FFDFGGGGGFFGGGDFGGGEGFGGGGGBGAGGGFGEGGGFDE?EFFDFD5 +@17771 +CCATTGGAGACATGTTTATGAACGAATGTGAACAGGTATTTATCATACAA ++17771 +GGGGGGGFFGGGFGGGGGGGGDGGFDDGGFDGGGGADGGFFDFFDE?EDD +@17772 +TCGTCATCTGTGCATCTGTCCTCTAATAGATCATAGGCGAGAACTAATC ++17772 +CCC@DD?BD=@CCCADDB=DCDCDDDDC?DB:BBBD?A?CA?B5:=7@5 +@17773 +GTTGTTCACACTAGAGACATAAATTAAAACATCAAAACAAGGCTCTTTTT ++17773 +BD=B?DBB?DADCDDE:AEBDABBEEEE=?DDABEEEADC8:B5DDDADC +@17748 +GATTGGTTGTGAAACCGTTTGCGGAGATAGATTATGCCCATTGTGGGTTG ++17748 +GGFGGGFGGGGGGGGGGGGGGDFFCDDE=FEFFDEFGGGFGCFDGFBD:C +@17749 +GGTTTAAGAGTTTACATGTTTTGGGTAGCGATTACAATAATAAGGTGTTT ++17749 +CDFFDFFEFEGGGFGGGGGGGGGGGGGFGDGGGGGGGEFGGGFGEEEEED +@17750 +TTGCAACTTGTCAGATGACGTCTCAGCAGCAGCATCTAAATCTTCTTCAT ++17750 +GGGEGGFGGGGGGGEGGGGGGGGFFFFFGGFGGDGG=GGGGDDGEGGGGG +@17751 +CGTTGCCGTCATTCATCACACCTATCCCGTGAACAGTATGACAGAGTGG ++17751 +GDGGGGGGGDAFDDFFBFFFFFEFAEE?EC?EEE?B?C?BDC?EBDBBD +@17752 +GTATCGATAGTAACGACAATTTGATTTTCACGCTTCCTTGACAAAACAA ++17752 +GGGGGGGGGGGGGGGGGDGGGGGBGGGGFGCFFDGGGFGFEFDGGGFGC +@17754 +GTTGCAAAAAGTAGGGCATTTGCTGAAAAACCACCAAAACTTGTCGATCT ++17754 +GGGGGGGGGFGFGGGGGGGFGGGGEGGGGGGGGGGGGFDGEGEGDFGGGG +@17755 +GGTTTAATTACCACCGTGAAAGGTTTAGAAAGCTGACGTTTCGAGCGTTA ++17755 +GFGGGGGDGFGGGGGGGGGEFFFFGGFGGGGEGGDGGDGGFGGDGGGFG: +@17756 +TGTTATGGAAAGAGATATGTTTGGAAAACACAATGCAAAGCACTTGTTGA ++17756 +GGGGGGGGFAA@?(:=87;GFFGEDGBGGGFDDDGDGGGFDFGGDEAF?A +@17757 +TACCGTGCACTAGGGTTTTGACCCAATTTTTAGCAAAATGTTGCAAAAAG ++17757 +GGGGGGGFGGGGGEFFGGGEGGGFFE?FFGGFFFGFGGGGGGGG5?DDC; +@17758 +TGACTCGTTCCCTGGGTAGGGAGAGAGGACACTGCAATCTCAAACATCGA ++17758 +GFEGGGGGGGGGFGEAADEEECECABEEBGGEFGBG?GDAEEEEEG?GD5 +@17759 +TTTGCACAAAAAGTTGCTTATCTCGAGGAGTTTGACAAGTTTCGGTGGTT ++17759 +GGGGGGGGGGGFGGGGGGGDGGGGDDFFEFGGGGGDDBFDGFDGGFGGFD +@17760 +GAAAAACAAGAGCAGCAAAGGCAGAGGGAAGATGGGTCGGTGTAGTTGTC ++17760 +GFGGGGGCGGGGDDGGFGBGGDDFGDDBGBGGFGDECFGEEEBEEEEEEF +@17761 +GCTAAGTTATCGTTTGCGTATTATAACGCGCTCTAGCCATGTCAGTTAAC ++17761 +@EED=E?D?B9EDDE5=BE=EAEE?E?BBDCB?DD5?DDA?5--.@@@6D +@17762 +CAGTACATAACTATTAAAGAAATCCAACTCACAAATGCTTGGAAGGCGAG ++17762 +GGGGGGGGEGGGGGGGGGGGGFGGGFGFFGGGFGGGDFDGFGGGGFGGGG +@17763 +TGGCAAACTTTTGACGGTCATAACAAGCAAGTGTGCAGGACGTTTCTAAG ++17763 +GGGGDGGFFFFGGGEGAGFFGGGGGGGFDGDEGBFD?DDGEEFFFAB?F? +@17764 +TTCATCCACAGTGGACAGAAACAACATCATTCTTACTGAGTTGTTGTTTC ++17764 +GGGFGGGFGGGEGGFGAGGGGGGGGEGEFFGGGGGEGGGEGDGGBGGGD; +@17765 +TCCTCTCAGATTTGGGTACTTTCAATTGCCGAAAACTGATTTTGAATACT ++17765 +?ACA5CC?CD=DDD=;;C@?A>A-AA?A?CAAA-DDDDDADD=A5:?5:: +@17766 +AGAAACGATTTAAAGCCATATAGCCCCCGAAACTCAGACTTTATCCTTCC ++17766 +GGGGGGGGGGGFEGGGGGGGGGGFGGGGGGGGGGFGFGGFGGFDFFBFFA +@17767 +GATAAGATGACGTACTCTCCTAGATACGTCTTCGAATCTGAGACGGTCTC ++17767 +DFEF?FGGGGGGBGGGGGGDGGGF?=FFAGAGGFF-FG5ED?BDEC:ECA +@17768 +TTGTCGATCTCCTCGAGATAAGCAACTTTTTCTGCAAAGTACCGTGCACT ++17768 +FGGGGGGGGGGGGGF=EAGFGGEGFDGFGG?GGFGGGGGDEEDFC:AEDA +@17769 +ACTACACGACTGTCCCAGTCTGTAGTGGATCGTGCTAGTGTCTCATTT ++17769 +GGGGGBGGGFFFFFGGGGGGGGGGGGGGFGG?DE?C?CCBB:BAB@CC +@17770 +AGAGTGTTTTCCTTGTCACGCCTTATTAAGATAAAAGTCTGAAACAGTTC ++17770 +FFDFGGGGGFFGGGDFGGGEGFGGGGGBGAGGGFGEGGGFDE?EFFDFD5 +@17771 +CCATTGGAGACATGTTTATGAACGAATGTGAACAGGTATTTATCATACAA ++17771 +GGGGGGGFFGGGFGGGGGGGGDGGFDDGGFDGGGGADGGFFDFFDE?EDD +@17772 +TCGTCATCTGTGCATCTGTCCTCTAATAGATCATAGGCGAGAACTAATC ++17772 +CCC@DD?BD=@CCCADDB=DCDCDDDDC?DB:BBBD?A?CA?B5:=7@5 +@17773 +GTTGTTCACACTAGAGACATAAATTAAAACATCAAAACAAGGCTCTTTTT ++17773 +BD=B?DBB?DADCDDE:AEBDABBEEEE=?DDABEEEADC8:B5DDDADC diff -r 000000000000 -r f33e9e6a6c88 test-data/r2.fastq.gz Binary file test-data/r2.fastq.gz has changed diff -r 000000000000 -r f33e9e6a6c88 test-data/r2_dedup.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/r2_dedup.fastq Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,100 @@ +@17748 +GATTGGTTGTGAAACCGTTTGCGGAGATAGATTATGCCCATTGTGGGTTG ++17748 +GGFGGGFGGGGGGGGGGGGGGDFFCDDE=FEFFDEFGGGFGCFDGFBD:C +@17749 +GGTTTAAGAGTTTACATGTTTTGGGTAGCGATTACAATAATAAGGTGTTT ++17749 +CDFFDFFEFEGGGFGGGGGGGGGGGGGFGDGGGGGGGEFGGGFGEEEEED +@17750 +TTGCAACTTGTCAGATGACGTCTCAGCAGCAGCATCTAAATCTTCTTCAT ++17750 +GGGEGGFGGGGGGGEGGGGGGGGFFFFFGGFGGDGG=GGGGDDGEGGGGG +@17751 +CGTTGCCGTCATTCATCACACCTATCCCGTGAACAGTATGACAGAGTGG ++17751 +GDGGGGGGGDAFDDFFBFFFFFEFAEE?EC?EEE?B?C?BDC?EBDBBD +@17752 +GTATCGATAGTAACGACAATTTGATTTTCACGCTTCCTTGACAAAACAA ++17752 +GGGGGGGGGGGGGGGGGDGGGGGBGGGGFGCFFDGGGFGFEFDGGGFGC +@17754 +GTTGCAAAAAGTAGGGCATTTGCTGAAAAACCACCAAAACTTGTCGATCT ++17754 +GGGGGGGGGFGFGGGGGGGFGGGGEGGGGGGGGGGGGFDGEGEGDFGGGG +@17755 +GGTTTAATTACCACCGTGAAAGGTTTAGAAAGCTGACGTTTCGAGCGTTA ++17755 +GFGGGGGDGFGGGGGGGGGEFFFFGGFGGGGEGGDGGDGGFGGDGGGFG: +@17756 +TGTTATGGAAAGAGATATGTTTGGAAAACACAATGCAAAGCACTTGTTGA ++17756 +GGGGGGGGFAA@?(:=87;GFFGEDGBGGGFDDDGDGGGFDFGGDEAF?A +@17757 +TACCGTGCACTAGGGTTTTGACCCAATTTTTAGCAAAATGTTGCAAAAAG ++17757 +GGGGGGGFGGGGGEFFGGGEGGGFFE?FFGGFFFGFGGGGGGGG5?DDC; +@17758 +TGACTCGTTCCCTGGGTAGGGAGAGAGGACACTGCAATCTCAAACATCGA ++17758 +GFEGGGGGGGGGFGEAADEEECECABEEBGGEFGBG?GDAEEEEEG?GD5 +@17759 +TTTGCACAAAAAGTTGCTTATCTCGAGGAGTTTGACAAGTTTCGGTGGTT ++17759 +GGGGGGGGGGGFGGGGGGGDGGGGDDFFEFGGGGGDDBFDGFDGGFGGFD +@17760 +GAAAAACAAGAGCAGCAAAGGCAGAGGGAAGATGGGTCGGTGTAGTTGTC ++17760 +GFGGGGGCGGGGDDGGFGBGGDDFGDDBGBGGFGDECFGEEEBEEEEEEF +@17761 +GCTAAGTTATCGTTTGCGTATTATAACGCGCTCTAGCCATGTCAGTTAAC ++17761 +@EED=E?D?B9EDDE5=BE=EAEE?E?BBDCB?DD5?DDA?5--.@@@6D +@17762 +CAGTACATAACTATTAAAGAAATCCAACTCACAAATGCTTGGAAGGCGAG ++17762 +GGGGGGGGEGGGGGGGGGGGGFGGGFGFFGGGFGGGDFDGFGGGGFGGGG +@17763 +TGGCAAACTTTTGACGGTCATAACAAGCAAGTGTGCAGGACGTTTCTAAG ++17763 +GGGGDGGFFFFGGGEGAGFFGGGGGGGFDGDEGBFD?DDGEEFFFAB?F? +@17764 +TTCATCCACAGTGGACAGAAACAACATCATTCTTACTGAGTTGTTGTTTC ++17764 +GGGFGGGFGGGEGGFGAGGGGGGGGEGEFFGGGGGEGGGEGDGGBGGGD; +@17765 +TCCTCTCAGATTTGGGTACTTTCAATTGCCGAAAACTGATTTTGAATACT ++17765 +?ACA5CC?CD=DDD=;;C@?A>A-AA?A?CAAA-DDDDDADD=A5:?5:: +@17766 +AGAAACGATTTAAAGCCATATAGCCCCCGAAACTCAGACTTTATCCTTCC ++17766 +GGGGGGGGGGGFEGGGGGGGGGGFGGGGGGGGGGFGFGGFGGFDFFBFFA +@17767 +GATAAGATGACGTACTCTCCTAGATACGTCTTCGAATCTGAGACGGTCTC ++17767 +DFEF?FGGGGGGBGGGGGGDGGGF?=FFAGAGGFF-FG5ED?BDEC:ECA +@17768 +TTGTCGATCTCCTCGAGATAAGCAACTTTTTCTGCAAAGTACCGTGCACT ++17768 +FGGGGGGGGGGGGGF=EAGFGGEGFDGFGG?GGFGGGGGDEEDFC:AEDA +@17769 +ACTACACGACTGTCCCAGTCTGTAGTGGATCGTGCTAGTGTCTCATTT ++17769 +GGGGGBGGGFFFFFGGGGGGGGGGGGGGFGG?DE?C?CCBB:BAB@CC +@17770 +AGAGTGTTTTCCTTGTCACGCCTTATTAAGATAAAAGTCTGAAACAGTTC ++17770 +FFDFGGGGGFFGGGDFGGGEGFGGGGGBGAGGGFGEGGGFDE?EFFDFD5 +@17771 +CCATTGGAGACATGTTTATGAACGAATGTGAACAGGTATTTATCATACAA ++17771 +GGGGGGGFFGGGFGGGGGGGGDGGFDDGGFDGGGGADGGFFDFFDE?EDD +@17772 +TCGTCATCTGTGCATCTGTCCTCTAATAGATCATAGGCGAGAACTAATC ++17772 +CCC@DD?BD=@CCCADDB=DCDCDDDDC?DB:BBBD?A?CA?B5:=7@5 +@17773 +GTTGTTCACACTAGAGACATAAATTAAAACATCAAAACAAGGCTCTTTTT ++17773 +BD=B?DBB?DADCDDE:AEBDABBEEEE=?DDABEEEADC8:B5DDDADC diff -r 000000000000 -r f33e9e6a6c88 test-data/r2_dedup.fastq.gz Binary file test-data/r2_dedup.fastq.gz has changed diff -r 000000000000 -r f33e9e6a6c88 test/test_dedup_hash.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_dedup_hash.py Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,63 @@ +import hashlib +import inspect +import os +import subprocess +import sys +import tempfile + + +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parent_dir = os.path.dirname(currentdir) +sys.path.insert(0, os.path.join(parent_dir, 'dedup_hash/')) +import dedup_hash + + +TEST_DATA_DIR = os.path.join(parent_dir, 'test-data/') +UNCOMPRESSED_IN = ['r1.fastq', 'r2.fastq'] +COMPRESSED_IN = ['r1.fastq.gz', 'r2.fastq.gz'] +UNCOMPRESSED_OUT = ['r1_dedup.fastq', 'r2_dedup.fastq'] +SINGLE_IN = ['r1.fastq'] +SINGLE_OUT = ['r1_dedup.fastq'] + + + +def run(input): + args = prepare_args(input) + run_dedup(args) + compare_output(args) + + +def compare_output(args): + ref_out1 = os.path.join(TEST_DATA_DIR, 'r1_dedup.fastq') + try: + assert md5(args['outfiles'][0]) == md5(ref_out1) + except AssertionError: + cmd = "diff -Nru %s %s" % (args['outfiles'][0], ref_out1) + subprocess.check_call(cmd.split(' ')) + print('all good') + + +def prepare_args(test_files): + infiles = [os.path.join(TEST_DATA_DIR, test_file) for test_file in test_files] + outfiles = [tempfile.NamedTemporaryFile(delete=False).name for test_file in test_files] # Same number of output files as input files + kwargs = {'infiles': infiles, + 'outfiles': outfiles, + 'write_gzip': False} + return kwargs + + +def run_dedup(kwargs): + fastq_pairs_instance = dedup_hash.get_unique_fastq_instance() + fastq_pairs_instance(**kwargs) + +def md5(fname): + hash_md5 = hashlib.md5() + with open(fname, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + +if __name__ == '__main__': + run(UNCOMPRESSED_IN) + run(COMPRESSED_IN) + run(SINGLE_IN) diff -r 000000000000 -r f33e9e6a6c88 tox.ini --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tox.ini Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,17 @@ +[tox] +envlist = py{27,35}-lint +source_dir = dedup_hash + +[testenv:py27-lint] +commands = flake8 {[tox]source_dir} {[tox]source_dir} --ignore E501 +skip_install = True +deps = + flake8 + flake8-import-order + +[testenv:py35-lint] +commands = flake8 {[tox]source_dir} {[tox]source_dir} --ignore E501 +skip_install = True +deps = + flake8 + flake8-import-order