Mercurial > repos > mvdbeek > dedup_hash
comparison dedup_hash/dedup_hash.py @ 0:f33e9e6a6c88 draft default tip
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
author: mvdbeek
date: Wed, 23 Nov 2016 07:49:05 -0500
parents: (none)
children: (none)
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f33e9e6a6c88 |
---|---|
1 import argparse | |
2 import gzip | |
3 import io | |
4 import sys | |
5 from itertools import cycle | |
6 try: | |
7 from itertools import izip | |
8 except ImportError: | |
9 pass # we can simply use zip in python3 | |
10 | |
11 | |
class UniqueFastqBase(object):
    """Stream one or two FASTQ files and drop duplicate reads.

    A read (or read pair) is considered a duplicate when the hash of its
    sequence line(s) has been seen before.  The constructor does all the
    work: it opens the inputs/outputs, runs ``process_files`` (implemented
    by a subclass) and closes every handle before returning.

    :param infiles: list of input fastq paths (1 or 2 entries)
    :param outfiles: list of output fastq paths, parallel to ``infiles``
    :param write_gzip: if True, gzip-compress the outputs
    :param buffer_size: buffer size for the gzip reader/writer wrappers
    :param compresslevel: gzip compression level for the outputs (1-9)
    :param hash_module: name of the hash algorithm to use
        ("smhasher", "CityHash64" or "hashxx")
    """

    def __init__(self,
                 infiles,
                 outfiles,
                 write_gzip,
                 buffer_size=32768,
                 compresslevel=2,
                 hash_module="smhasher"):
        self.seen_hashes = set()  # hashes of sequence lines seen so far
        self.infiles = infiles
        self.outfiles = outfiles
        self.write_gzip = write_gzip
        self.buffer_size = buffer_size
        self.compresslevel = compresslevel
        self.hash_module = self.import_hash_module(hash_module)
        # NOTE(review): these two attributes are never read anywhere in this
        # file (the handlers use ``cur_fastq_strs`` instead) -- kept for
        # backward compatibility, candidates for removal.
        self.cur_fastq_str_r1 = ""
        self.cur_fastq_str_r2 = ""
        self.cur_uniq = False
        # A FASTQ record is exactly 4 lines; cycle through one handler
        # per line so each incoming line lands on the right action.
        self.fastq_cycle = cycle([self.header_one_action, self.seq_action, self.header_two_action, self.qual_action])
        self.infiles = self.get_inputs()
        self.outfiles = self.get_outputs()
        self.process_files()
        self.close_io()

    def import_hash_module(self, hash_module):
        """Return the selected (third-party) hash function.

        Imports lazily so only the chosen backend needs to be installed.
        Raises ValueError for an unknown name instead of silently
        returning None, which would only surface later as a TypeError.
        """
        if hash_module == "smhasher":
            from smhasher import murmur3_x64_64
            return murmur3_x64_64
        if hash_module == "CityHash64":
            from cityhash import CityHash64
            return CityHash64
        if hash_module == "hashxx":
            from pyhashxx import hashxx
            return hashxx
        raise ValueError("Unknown hash module: %r" % (hash_module,))

    def get_input(self, infile):
        """Open a single input file, transparently decompressing gzip."""
        if self._is_gzip(infile):
            return io.BufferedReader(gzip.GzipFile(infile, 'rb'), buffer_size=self.buffer_size)
        return open(infile)

    def get_inputs(self):
        """Open all input files (see :meth:`get_input`)."""
        return [self.get_input(infile) for infile in self.infiles]

    def get_outputs(self):
        """Open all output files, gzip-compressed when requested."""
        if self.write_gzip:
            return [io.BufferedWriter(gzip.GzipFile(outfile, 'wb', compresslevel=self.compresslevel), buffer_size=self.buffer_size) for outfile in self.outfiles]
        return [open(outfile, 'w') for outfile in self.outfiles]

    def close_io(self):
        """Close every input and output handle."""
        for infile in self.infiles:
            infile.close()
        for outfile in self.outfiles:
            outfile.close()

    def _is_gzip(self, infile):
        """Return True if ``infile`` starts with the gzip magic bytes."""
        gzip_magic_byte = b"\x1f\x8b\x08"
        with open(infile, 'rb') as input:
            return gzip_magic_byte == input.read(len(gzip_magic_byte))

    def process_files(self):
        """Drive the 4-line FASTQ state machine; subclasses implement this."""
        raise NotImplementedError('Not implemented')

    def seq_action(self, lines):
        """Handle line 2 (sequence): decide uniqueness via the hash set."""
        cur_hash = self.hash_module("".join(lines))
        if cur_hash in self.seen_hashes:
            self.cur_uniq = False
        else:
            self.seen_hashes.add(cur_hash)
            self.cur_uniq = True
        self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]

    def header_one_action(self, lines):
        """Handle line 1 (@header): start accumulating a fresh record."""
        self.cur_uniq = False
        self.cur_fastq_strs = lines

    def header_two_action(self, lines):
        """Handle line 3 (+separator): append it to the pending record."""
        self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]

    def qual_action(self, lines):
        """Handle line 4 (quality): flush the record if it was unique."""
        if self.cur_uniq:
            self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]
            for string, outfile in zip(self.cur_fastq_strs, self.outfiles):
                # NOTE(review): with write_gzip the outfiles are binary
                # writers; writing str here would fail on Python 3 --
                # confirm encoding is handled upstream before relying on
                # gzip output under Python 3.
                outfile.write(string)
93 | |
94 | |
class UniqueFastqPairsPy2(UniqueFastqBase):
    """Python 2 driver: walk all inputs in lockstep with izip."""

    def process_files(self):
        # Each izip tuple holds the stage handler first, followed by the
        # matching line from every input file.
        for items in izip(self.fastq_cycle, *self.infiles):
            handler = items[0]
            handler(items[1:])
102 | |
103 | |
class UniqueFastqPairsPy3(UniqueFastqBase):
    """Python 3 driver: zip the handler cycle with every input file."""

    def process_files(self):
        for items in zip(self.fastq_cycle, *self.infiles):
            handler, raw = items[0], items[1:]
            # gzip-backed inputs yield bytes while plain-text inputs yield
            # str; normalise everything to str before dispatching.
            # NOTE: the per-line isinstance check may be slow; rework this
            # to something smarter if it slows down too much.
            handler([l if isinstance(l, str) else l.decode() for l in raw])
113 | |
114 | |
def get_args():
    """Build the command-line interface and parse sys.argv."""
    parser = argparse.ArgumentParser(description='Get unique reads from fastq files')
    add = parser.add_argument  # shorthand; every option goes on one parser
    add('--r1_in', required=True, help='Read1 input fastq file')
    add('--r2_in', required=False, default=None, help='Read2 input fastq file')
    add('--r1_out', required=True, help='Read1 output fastq file')
    add('--r2_out', required=False, help='Read2 output fastq file')
    add('--write_gzip', action='store_true', help="Compress output in gzip format?")
    add('--buffer_size', default=32768, type=int, help="Set buffer size for reading gzip files")
    add('--compresslevel', default=2, type=int, choices=list(range(1, 10)), help="Set compression level (1: fastest, 9: highest compression)")
    add('--algo', default='smhasher', choices=['CityHash64', 'hashxx', 'smhasher'], help='Select hash algorithm')
    return parser.parse_args()
126 | |
127 | |
def get_infiles(args):
    """Return the input path list; read2 is included only when provided."""
    paths = [args.r1_in]
    if args.r2_in:
        paths.append(args.r2_in)
    return paths
133 | |
134 | |
def get_outfiles(args):
    """Return the output path list; read2 is included only when provided."""
    paths = [args.r1_out]
    if args.r2_out:
        paths.append(args.r2_out)
    return paths
140 | |
141 | |
def get_unique_fastq_instance():
    """Select the processing class matching the running interpreter.

    Despite the name this returns a *class*, not an instance; the caller
    instantiates it, which runs the whole pipeline.
    """
    major = sys.version_info.major
    if major == 2:
        return UniqueFastqPairsPy2
    if major == 3:
        return UniqueFastqPairsPy3
147 | |
148 | |
def main():
    """Command-line entry point: parse arguments and run deduplication."""
    args = get_args()
    dedup_cls = get_unique_fastq_instance()
    # Instantiating the class runs the full pipeline in its __init__.
    dedup_cls(infiles=get_infiles(args),
              outfiles=get_outfiles(args),
              write_gzip=args.write_gzip,
              buffer_size=args.buffer_size,
              compresslevel=args.compresslevel,
              hash_module=args.algo)
158 | |
159 | |
if __name__ == '__main__':
    # Script entry point: only run when executed directly, not on import.
    main()