comparison dedup_hash/dedup_hash.py @ 0:f33e9e6a6c88 draft default tip

planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
author mvdbeek
date Wed, 23 Nov 2016 07:49:05 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f33e9e6a6c88
1 import argparse
2 import gzip
3 import io
4 import sys
5 from itertools import cycle
6 try:
7 from itertools import izip
8 except ImportError:
9 pass # we can simply use zip in python3
10
11
class UniqueFastqBase(object):
    """Write each distinct FASTQ read (or group of mate reads) exactly once.

    The input files are read in lockstep, one line from each file per step.
    Every FASTQ record consists of four lines (header, sequence, second
    header, quality); ``fastq_cycle`` dispatches each line to the matching
    ``*_action`` method.  A record is considered a duplicate when the hash of
    its (concatenated) sequence line(s) has been seen before.

    Constructing an instance runs the whole job: it opens the files, calls
    ``process_files`` (which subclasses must implement) and closes the files
    again, also when processing fails.

    Parameters:
        infiles: list of input FASTQ paths (plain text or gzip-compressed).
        outfiles: list of output paths; outputs are paired with inputs by
            position.
        write_gzip: if true, outputs are written gzip-compressed.
        buffer_size: buffer size used for the buffered gzip readers/writers.
        compresslevel: gzip compression level used for the outputs.
        hash_module: name of the hash function to use, one of
            ``"smhasher"``, ``"CityHash64"`` or ``"hashxx"``.

    Raises:
        ValueError: if ``hash_module`` is not one of the supported names.
    """

    def __init__(self,
                 infiles,
                 outfiles,
                 write_gzip,
                 buffer_size=32768,
                 compresslevel=2,
                 hash_module="smhasher"):
        self.seen_hashes = set()
        self.infiles = infiles
        self.outfiles = outfiles
        self.write_gzip = write_gzip
        self.buffer_size = buffer_size
        self.compresslevel = compresslevel
        self.hash_module = self.import_hash_module(hash_module)
        # Unused by this class; kept so existing attribute access keeps working.
        self.cur_fastq_str_r1 = ""
        self.cur_fastq_str_r2 = ""
        # Partial record text (one string per input file) being assembled.
        self.cur_fastq_strs = []
        self.cur_uniq = False
        self.fastq_cycle = cycle([self.header_one_action, self.seq_action, self.header_two_action, self.qual_action])
        self.infiles = self.get_inputs()
        try:
            self.outfiles = self.get_outputs()
        except Exception:
            # self.outfiles still holds paths here, so only the inputs
            # need to be released.
            self._close_all(self.infiles)
            raise
        try:
            self.process_files()
        finally:
            # Release every handle even if processing raised.
            self.close_io()

    def import_hash_module(self, hash_module):
        """Return the hash callable registered under the name ``hash_module``.

        The backing package is imported lazily so that only the selected
        hash library has to be installed.

        Raises:
            ValueError: if ``hash_module`` is not a supported name.
        """
        if hash_module == "smhasher":
            from smhasher import murmur3_x64_64
            return murmur3_x64_64
        if hash_module == "CityHash64":
            from cityhash import CityHash64
            return CityHash64
        if hash_module == "hashxx":
            from pyhashxx import hashxx
            return hashxx
        # Fail early instead of returning None and crashing on the first read.
        raise ValueError("Unknown hash module: %r" % (hash_module,))

    def get_input(self, infile):
        """Open ``infile`` for reading, transparently handling gzip files."""
        if self._is_gzip(infile):
            return io.BufferedReader(gzip.GzipFile(infile, 'rb'), buffer_size=self.buffer_size)
        return open(infile)

    def get_inputs(self):
        """Open every path in ``self.infiles`` and return the file handles."""
        return self._open_all(self.get_input, self.infiles)

    def _get_output(self, outfile):
        """Open ``outfile`` for writing, gzip-compressed if requested."""
        if self.write_gzip:
            return io.BufferedWriter(gzip.GzipFile(outfile, 'wb', compresslevel=self.compresslevel), buffer_size=self.buffer_size)
        return open(outfile, 'w')

    def get_outputs(self):
        """Open every path in ``self.outfiles`` and return the file handles."""
        return self._open_all(self._get_output, self.outfiles)

    def _open_all(self, opener, paths):
        """Open all ``paths`` with ``opener``; close the opened ones on failure."""
        handles = []
        try:
            for path in paths:
                handles.append(opener(path))
        except Exception:
            self._close_all(handles)
            raise
        return handles

    def _close_all(self, handles):
        """Close every handle in ``handles``."""
        for handle in handles:
            handle.close()

    def close_io(self):
        """Close all input and output handles."""
        self._close_all(self.infiles)
        self._close_all(self.outfiles)

    def _is_gzip(self, infile):
        """Return True if ``infile`` starts with the gzip magic bytes."""
        gzip_magic_byte = b"\x1f\x8b\x08"
        with open(infile, 'rb') as handle:
            return gzip_magic_byte == handle.read(len(gzip_magic_byte))

    def process_files(self):
        """Drive the per-line actions over the inputs; subclasses implement this."""
        raise NotImplementedError('Not implemented')

    def seq_action(self, lines):
        """Handle the sequence line(s): decide uniqueness and extend the record."""
        cur_hash = self.hash_module("".join(lines))
        if cur_hash in self.seen_hashes:
            self.cur_uniq = False
        else:
            self.seen_hashes.add(cur_hash)
            self.cur_uniq = True
        self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]

    def header_one_action(self, lines):
        """Handle the first header line(s): start a new record."""
        self.cur_uniq = False
        self.cur_fastq_strs = lines

    def header_two_action(self, lines):
        """Handle the second header line(s): extend the record."""
        self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]

    def qual_action(self, lines):
        """Handle the quality line(s): finish the record and write it if unique."""
        if self.cur_uniq:
            self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]
            for record, outfile in zip(self.cur_fastq_strs, self.outfiles):
                # Gzip outputs are binary streams; text records (Python 3
                # str) must be encoded before they can be written.
                if self.write_gzip and not isinstance(record, bytes):
                    record = record.encode()
                outfile.write(record)
93
94
class UniqueFastqPairsPy2(UniqueFastqBase):
    """Python 2 driver: walks the inputs in lockstep using ``izip``."""

    def process_files(self):
        """Feed one line from every input file to the next per-line action."""
        for row in izip(self.fastq_cycle, *self.infiles):
            # First element is the action for this line of the record,
            # the remaining elements are one line from each input file.
            handler, lines = row[0], row[1:]
            handler(lines)
102
103
class UniqueFastqPairsPy3(UniqueFastqBase):
    """Python 3 driver: walks the inputs in lockstep using ``zip``."""

    def process_files(self):
        """Feed one line from every input file to the next per-line action.

        Lines that are not already ``str`` are decoded before being passed
        on, so the actions always receive text.
        """
        for row in zip(self.fastq_cycle, *self.infiles):
            handler = row[0]
            # Decoding line by line might be slow; rework this into
            # something smarter if it slows things down too much.
            decoded = []
            for line in row[1:]:
                decoded.append(line if isinstance(line, str) else line.decode())
            handler(decoded)
113
114
def get_args(argv=None):
    """Parse the command line options of the deduplication script.

    Parameters:
        argv: optional list of argument strings to parse.  When ``None``
            (the default) the arguments are taken from ``sys.argv``, which
            is the behaviour of calling ``get_args()`` without arguments.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(description='Get unique reads from fastq files')
    parser.add_argument('--r1_in', required=True, help='Read1 input fastq file')
    parser.add_argument('--r2_in', required=False, default=None, help='Read2 input fastq file')
    parser.add_argument('--r1_out', required=True, help='Read1 output fastq file')
    parser.add_argument('--r2_out', required=False, help='Read2 output fastq file')
    parser.add_argument('--write_gzip', action='store_true', help="Compress output in gzip format?")
    parser.add_argument('--buffer_size', default=32768, type=int, help="Set buffer size for reading gzip files")
    parser.add_argument('--compresslevel', default=2, type=int, choices=list(range(1, 10)), help="Set compression level (1: fastest, 9: highest compression)")
    parser.add_argument('--algo', default='smhasher', choices=['CityHash64', 'hashxx', 'smhasher'], help='Select hash algorithm')
    return parser.parse_args(argv)
126
127
def get_infiles(args):
    """Return the input paths as a list: read 1, plus read 2 when given."""
    paths = [args.r1_in]
    if args.r2_in:
        paths.append(args.r2_in)
    return paths
133
134
def get_outfiles(args):
    """Return the output paths as a list: read 1, plus read 2 when given."""
    paths = [args.r1_out]
    if args.r2_out:
        paths.append(args.r2_out)
    return paths
140
141
def get_unique_fastq_instance():
    """Return the driver class matching the running Python major version.

    Returns ``None`` when the major version is neither 2 nor 3.
    """
    drivers = {
        2: UniqueFastqPairsPy2,
        3: UniqueFastqPairsPy3,
    }
    return drivers.get(sys.version_info.major)
147
148
def main():
    """Parse the command line and run the deduplication.

    Instantiating the driver class performs the whole job.
    """
    args = get_args()
    driver_cls = get_unique_fastq_instance()
    options = dict(
        infiles=get_infiles(args),
        outfiles=get_outfiles(args),
        write_gzip=args.write_gzip,
        buffer_size=args.buffer_size,
        compresslevel=args.compresslevel,
        hash_module=args.algo,
    )
    driver_cls(**options)
158
159
# Run the command line interface only when executed as a script.
if __name__ == '__main__':
    main()