Mercurial > repos > mvdbeek > dedup_hash
view dedup_hash/dedup_hash.py @ 0:f33e9e6a6c88 draft default tip
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
author | mvdbeek |
---|---|
date | Wed, 23 Nov 2016 07:49:05 -0500 |
parents | |
children |
line wrap: on
line source
import argparse import gzip import io import sys from itertools import cycle try: from itertools import izip except ImportError: pass # we can simply use zip in python3 class UniqueFastqBase(object): def __init__(self, infiles, outfiles, write_gzip, buffer_size=32768, compresslevel=2, hash_module="smhasher"): self.seen_hashes = set() self.infiles = infiles self.outfiles = outfiles self.write_gzip = write_gzip self.buffer_size = buffer_size self.compresslevel = compresslevel self.hash_module = self.import_hash_module(hash_module) self.cur_fastq_str_r1 = "" self.cur_fastq_str_r2 = "" self.cur_uniq = False self.fastq_cycle = cycle([self.header_one_action, self.seq_action, self.header_two_action, self.qual_action]) self.infiles = self.get_inputs() self.outfiles = self.get_outputs() self.process_files() self.close_io() def import_hash_module(self, hash_module): if hash_module == "smhasher": from smhasher import murmur3_x64_64 return murmur3_x64_64 if hash_module == "CityHash64": from cityhash import CityHash64 return CityHash64 if hash_module == "hashxx": from pyhashxx import hashxx return hashxx def get_input(self, infile): if self._is_gzip(infile): return io.BufferedReader(gzip.GzipFile(infile, 'rb'), buffer_size=self.buffer_size) else: return open(infile) def get_inputs(self): return [self.get_input(infile) for infile in self.infiles] def get_outputs(self): if self.write_gzip: return [io.BufferedWriter(gzip.GzipFile(outfile, 'wb', compresslevel=self.compresslevel), buffer_size=self.buffer_size) for outfile in self.outfiles] return [open(outfile, 'w') for outfile in self.outfiles] def close_io(self): [infile.close() for infile in self.infiles] [outfile.close() for outfile in self.outfiles] def _is_gzip(self, infile): gzip_magic_byte = b"\x1f\x8b\x08" with open(infile, 'rb') as input: return gzip_magic_byte == input.read(len(gzip_magic_byte)) def process_files(self): raise Exception('Not implemented') def seq_action(self, lines): cur_hash = self.hash_module("".join(lines)) if cur_hash in self.seen_hashes: self.cur_uniq = False else: self.seen_hashes.add(cur_hash) self.cur_uniq = True self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)] def header_one_action(self, lines): self.cur_uniq = False self.cur_fastq_strs = lines def header_two_action(self, lines): self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)] def qual_action(self, lines): if self.cur_uniq: self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)] [outfile.write(string) for string, outfile in zip(self.cur_fastq_strs, self.outfiles)] class UniqueFastqPairsPy2(UniqueFastqBase): def process_files(self): for items in izip(self.fastq_cycle, *self.infiles): fastq_item = items[0] lines = items[1:] fastq_item(lines) class UniqueFastqPairsPy3(UniqueFastqBase): def process_files(self): for items in zip(self.fastq_cycle, *self.infiles): fastq_item = items[0] lines = items[1:] # The following might be slow, rework this to something smarter # it it slows down too much. fastq_item([l if isinstance(l, str) else l.decode() for l in lines]) def get_args(): parser = argparse.ArgumentParser(description='Get unique reads from fastq files') parser.add_argument('--r1_in', required=True, help='Read1 input fastq file') parser.add_argument('--r2_in', required=False, default=None, help='Read2 input fastq file') parser.add_argument('--r1_out', required=True, help='Read1 output fastq file') parser.add_argument('--r2_out', required=False, help='Read2 output fastq file') parser.add_argument('--write_gzip', action='store_true', help="Compress output in gzip format?") parser.add_argument('--buffer_size', default=32768, type=int, help="Set buffer size for reading gzip files") parser.add_argument('--compresslevel', default=2, type=int, choices=list(range(1, 10)), help="Set compression level (1: fastest, 9: highest compression)") parser.add_argument('--algo', default='smhasher', choices=['CityHash64', 'hashxx', 'smhasher'], help='Select hash algorithm') return parser.parse_args() def get_infiles(args): if args.r2_in: return [args.r1_in, args.r2_in] else: return [args.r1_in] def get_outfiles(args): if args.r2_out: return [args.r1_out, args.r2_out] else: return [args.r1_out] def get_unique_fastq_instance(): if sys.version_info.major == 2: return UniqueFastqPairsPy2 elif sys.version_info.major == 3: return UniqueFastqPairsPy3 def main(): args = get_args() UniqueFastqPairs = get_unique_fastq_instance() UniqueFastqPairs(infiles=get_infiles(args), outfiles=get_outfiles(args), write_gzip=args.write_gzip, buffer_size=args.buffer_size, compresslevel=args.compresslevel, hash_module=args.algo) if __name__ == '__main__': main()