annotate dedup_hash/dedup_hash.py @ 0:f33e9e6a6c88 draft default tip

planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
author mvdbeek
date Wed, 23 Nov 2016 07:49:05 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
1 import argparse
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
2 import gzip
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
3 import io
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
4 import sys
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
5 from itertools import cycle
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
6 try:
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
7 from itertools import izip
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
8 except ImportError:
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
9 pass # we can simply use zip in python3
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
10
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
11
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
class UniqueFastqBase(object):
    """Write only FASTQ records whose sequence has not been seen before.

    Input files are read in lockstep, one line from each file at a time, and
    every line is routed through a repeating cycle of four handlers (first
    header, sequence, second header, quality).  Uniqueness is decided on the
    hash of the concatenated sequence lines of all inputs, so two different
    sequences whose hashes collide are treated as duplicates.

    Subclasses must implement ``process_files``.  Note that constructing an
    instance runs the whole job: files are opened, processed and closed
    inside ``__init__``.
    """

    def __init__(self,
                 infiles,
                 outfiles,
                 write_gzip,
                 buffer_size=32768,
                 compresslevel=2,
                 hash_module="smhasher"):
        """Open the files, process them and close them again.

        :param infiles: list of input FASTQ paths (plain or gzip-compressed).
        :param outfiles: list of output paths, paired by position with infiles.
        :param write_gzip: if true, outputs are gzip-compressed.
        :param buffer_size: buffer size for the gzip reader/writer wrappers.
        :param compresslevel: gzip compression level used for outputs.
        :param hash_module: "smhasher", "CityHash64" or "hashxx", or a
            callable that maps a string to a hashable value.
        """
        self.seen_hashes = set()
        self.infiles = infiles
        self.outfiles = outfiles
        self.write_gzip = write_gzip
        self.buffer_size = buffer_size
        self.compresslevel = compresslevel
        self.hash_module = self.import_hash_module(hash_module)
        # Not used by any method of this class; kept so that code reading
        # these attributes keeps working.
        self.cur_fastq_str_r1 = ""
        self.cur_fastq_str_r2 = ""
        # One accumulated record string per input file.
        self.cur_fastq_strs = []
        self.cur_uniq = False
        self.fastq_cycle = cycle([self.header_one_action, self.seq_action, self.header_two_action, self.qual_action])
        self.infiles = self.get_inputs()
        try:
            self.outfiles = self.get_outputs()
        except Exception:
            # self.outfiles still holds paths here, so only inputs are closed.
            for handle in self.infiles:
                handle.close()
            raise
        try:
            self.process_files()
        finally:
            # Close (and thereby flush) everything even if processing failed.
            self.close_io()

    def import_hash_module(self, hash_module):
        """Return the hash function selected by ``hash_module``.

        The third-party module is imported lazily so that only the selected
        backend has to be installed.  A callable is returned unchanged.

        :raises ValueError: if ``hash_module`` is not a known name.
        """
        if callable(hash_module):
            return hash_module
        if hash_module == "smhasher":
            from smhasher import murmur3_x64_64
            return murmur3_x64_64
        if hash_module == "CityHash64":
            from cityhash import CityHash64
            return CityHash64
        if hash_module == "hashxx":
            from pyhashxx import hashxx
            return hashxx
        # Fail here instead of returning None and crashing on the first read.
        raise ValueError("Unknown hash module: %r" % (hash_module,))

    def get_input(self, infile):
        """Open ``infile`` for reading, transparently handling gzip input."""
        if self._is_gzip(infile):
            return io.BufferedReader(gzip.GzipFile(infile, 'rb'), buffer_size=self.buffer_size)
        else:
            return open(infile)

    def get_inputs(self):
        """Return an open handle for every path in ``self.infiles``."""
        return [self.get_input(infile) for infile in self.infiles]

    def _open_gzip_output(self, outfile):
        """Open ``outfile`` as a buffered gzip writer that accepts ``str``."""
        binary = io.BufferedWriter(gzip.GzipFile(outfile, 'wb', compresslevel=self.compresslevel),
                                   buffer_size=self.buffer_size)
        if sys.version_info[0] >= 3:
            # Records are text on Python 3, so the binary gzip stream needs a
            # text layer.  newline='' writes line endings unchanged.
            return io.TextIOWrapper(binary, encoding='utf-8', newline='')
        return binary

    def get_outputs(self):
        """Return an open handle for every path in ``self.outfiles``."""
        if self.write_gzip:
            return [self._open_gzip_output(outfile) for outfile in self.outfiles]
        return [open(outfile, 'w') for outfile in self.outfiles]

    def close_io(self):
        """Close all input handles, then all output handles."""
        for infile in self.infiles:
            infile.close()
        for outfile in self.outfiles:
            outfile.close()

    def _is_gzip(self, infile):
        """Return True if ``infile`` starts with the gzip magic bytes."""
        gzip_magic_byte = b"\x1f\x8b\x08"
        with open(infile, 'rb') as handle:
            return gzip_magic_byte == handle.read(len(gzip_magic_byte))

    def process_files(self):
        """Drive the handler cycle over the inputs; implemented by subclasses."""
        raise NotImplementedError('Not implemented')

    def _append_lines(self, lines):
        """Append each line to the record accumulated for the same input."""
        self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]

    def seq_action(self, lines):
        """Handle sequence lines: decide whether the current record is new.

        The sequence lines of all inputs are joined before hashing, so for
        paired input the pair as a whole is the unit of uniqueness.
        """
        cur_hash = self.hash_module("".join(lines))
        if cur_hash in self.seen_hashes:
            self.cur_uniq = False
        else:
            self.seen_hashes.add(cur_hash)
            self.cur_uniq = True
        self._append_lines(lines)

    def header_one_action(self, lines):
        """Handle the first header lines: start a fresh record per input."""
        self.cur_uniq = False
        self.cur_fastq_strs = lines

    def header_two_action(self, lines):
        """Handle the second header lines: append them to the records."""
        self._append_lines(lines)

    def qual_action(self, lines):
        """Handle quality lines: finish the records and write them if unique."""
        if self.cur_uniq:
            self._append_lines(lines)
            for string, outfile in zip(self.cur_fastq_strs, self.outfiles):
                outfile.write(string)
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
93
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
94
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
class UniqueFastqPairsPy2(UniqueFastqBase):
    """Python 2 driver: iterate lazily over the inputs with ``izip``."""

    def process_files(self):
        """Feed each set of parallel input lines to the next handler."""
        for record in izip(self.fastq_cycle, *self.infiles):
            # record[0] is the handler for this line of the four-line cycle,
            # the remaining items are one line from each input file.
            handler, parallel_lines = record[0], record[1:]
            handler(parallel_lines)
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
102
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
103
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
class UniqueFastqPairsPy3(UniqueFastqBase):
    """Python 3 driver: built-in ``zip`` is lazy, lines are passed as text."""

    def process_files(self):
        """Feed each set of parallel input lines to the next handler."""
        for record in zip(self.fastq_cycle, *self.infiles):
            handler = record[0]
            # Gzip inputs yield bytes while plain inputs yield str, so the
            # lines are normalised to str here.  This per-line check may be
            # slow; rework it into something smarter if it costs too much.
            text_lines = []
            for line in record[1:]:
                if isinstance(line, str):
                    text_lines.append(line)
                else:
                    text_lines.append(line.decode())
            handler(text_lines)
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
113
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
114
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
def get_args(argv=None):
    """Parse the command-line options.

    :param argv: optional list of argument strings; when None, argparse
        reads the arguments from ``sys.argv``.
    :returns: the ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(description='Get unique reads from fastq files')
    parser.add_argument('--r1_in', required=True, help='Read1 input fastq file')
    parser.add_argument('--r2_in', required=False, default=None, help='Read2 input fastq file')
    parser.add_argument('--r1_out', required=True, help='Read1 output fastq file')
    parser.add_argument('--r2_out', required=False, help='Read2 output fastq file')
    parser.add_argument('--write_gzip', action='store_true', help="Compress output in gzip format?")
    parser.add_argument('--buffer_size', default=32768, type=int, help="Set buffer size for reading gzip files")
    parser.add_argument('--compresslevel', default=2, type=int, choices=list(range(1, 10)), help="Set compression level (1: fastest, 9: highest compression)")
    parser.add_argument('--algo', default='smhasher', choices=['CityHash64', 'hashxx', 'smhasher'], help='Select hash algorithm')
    return parser.parse_args(argv)
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
126
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
127
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
def get_infiles(args):
    """Return the input paths: read 1 always, read 2 only when it is set."""
    paths = [args.r1_in]
    if args.r2_in:
        paths.append(args.r2_in)
    return paths
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
133
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
134
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
def get_outfiles(args):
    """Return the output paths: read 1 always, read 2 only when it is set."""
    paths = [args.r1_out]
    if args.r2_out:
        paths.append(args.r2_out)
    return paths
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
140
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
141
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
def get_unique_fastq_instance():
    """Return the deduplication class for the running Python major version.

    Despite the name, this returns a class, not an instance.  The result is
    None for any major version other than 2 or 3.
    """
    by_major_version = {
        2: UniqueFastqPairsPy2,
        3: UniqueFastqPairsPy3,
    }
    return by_major_version.get(sys.version_info.major)
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
147
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
148
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
def main():
    """Command-line entry point: parse the options and run the deduplication."""
    args = get_args()
    dedup_class = get_unique_fastq_instance()
    options = {
        'infiles': get_infiles(args),
        'outfiles': get_outfiles(args),
        'write_gzip': args.write_gzip,
        'buffer_size': args.buffer_size,
        'compresslevel': args.compresslevel,
        'hash_module': args.algo,
    }
    # Instantiating the class performs the whole job.
    dedup_class(**options)
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
158
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
159
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
160 if __name__ == '__main__':
f33e9e6a6c88 planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff changeset
161 main()