Mercurial > repos > mvdbeek > dedup_hash
annotate dedup_hash/dedup_hash.py @ 0:f33e9e6a6c88 draft default tip
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
author | mvdbeek |
---|---|
date | Wed, 23 Nov 2016 07:49:05 -0500 |
parents | |
children |
rev | line source |
---|---|
0
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
1 import argparse |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
2 import gzip |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
3 import io |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
4 import sys |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
5 from itertools import cycle |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
6 try: |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
7 from itertools import izip |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
8 except ImportError: |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
9 pass # we can simply use zip in python3 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
10 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
11 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
class UniqueFastqBase(object):
    """Write the first occurrence of every read (or read pair) of FASTQ input.

    The input files are consumed in lockstep, four lines per record. The
    sequence lines of all inputs are joined and hashed; a record is written
    to the outputs only when that hash has not been seen before. All work,
    including opening and closing the files, is done by ``__init__``.
    Subclasses must implement ``process_files``.
    """

    def __init__(self,
                 infiles,
                 outfiles,
                 write_gzip,
                 buffer_size=32768,
                 compresslevel=2,
                 hash_module="smhasher"):
        """Open the files, deduplicate the reads and close the files again.

        :param infiles: list of input FASTQ paths (plain text or gzip).
        :param outfiles: list of output paths, paired with ``infiles`` by position.
        :param write_gzip: if true, the outputs are gzip-compressed.
        :param buffer_size: buffer size of the gzip readers and writers.
        :param compresslevel: gzip compression level used for the outputs.
        :param hash_module: "smhasher", "CityHash64" or "hashxx".
        :raises ValueError: if ``hash_module`` is not one of the known names.
        """
        self.seen_hashes = set()
        self.infiles = infiles
        self.outfiles = outfiles
        self.write_gzip = write_gzip
        self.buffer_size = buffer_size
        self.compresslevel = compresslevel
        self.hash_module = self.import_hash_module(hash_module)
        self.cur_fastq_str_r1 = ""
        self.cur_fastq_str_r2 = ""
        # One accumulated record string per input file.
        self.cur_fastq_strs = []
        self.cur_uniq = False
        # FASTQ records are four lines long; cycling through the handlers
        # dispatches each line to the action matching its position.
        self.fastq_cycle = cycle([self.header_one_action, self.seq_action, self.header_two_action, self.qual_action])
        # From here on infiles/outfiles hold open handles instead of paths.
        self.infiles = self.get_inputs()
        try:
            self.outfiles = self.get_outputs()
        except Exception:
            # Outputs could not be opened: do not leak the input handles.
            for infile in self.infiles:
                infile.close()
            raise
        try:
            self.process_files()
        finally:
            # Close (and thereby flush) everything even if processing fails.
            self.close_io()

    def import_hash_module(self, hash_module):
        """Import and return the hash function selected by ``hash_module``.

        The import is done lazily so that only the selected third-party
        package has to be installed.

        :raises ValueError: if ``hash_module`` is not a known name.
        """
        if hash_module == "smhasher":
            from smhasher import murmur3_x64_64
            return murmur3_x64_64
        if hash_module == "CityHash64":
            from cityhash import CityHash64
            return CityHash64
        if hash_module == "hashxx":
            from pyhashxx import hashxx
            return hashxx
        raise ValueError("Unknown hash module: %r" % (hash_module,))

    def get_input(self, infile):
        """Open ``infile`` for reading, decompressing it if it is gzipped."""
        if self._is_gzip(infile):
            return io.BufferedReader(gzip.GzipFile(infile, 'rb'), buffer_size=self.buffer_size)
        return open(infile)

    def get_inputs(self):
        """Return one open input handle per path in ``self.infiles``."""
        return [self.get_input(infile) for infile in self.infiles]

    def get_outputs(self):
        """Return one open output handle per path in ``self.outfiles``."""
        if self.write_gzip:
            return [self._open_gzip_output(outfile) for outfile in self.outfiles]
        return [open(outfile, 'w') for outfile in self.outfiles]

    def _open_gzip_output(self, outfile):
        """Open ``outfile`` as a buffered gzip writer that accepts ``str``."""
        writer = io.BufferedWriter(gzip.GzipFile(outfile, 'wb', compresslevel=self.compresslevel),
                                   buffer_size=self.buffer_size)
        if sys.version_info[0] >= 3:
            # On Python 3 the records are str while BufferedWriter only takes
            # bytes, so encode on the way out. newline="" writes the line
            # endings exactly as they were read.
            writer = io.TextIOWrapper(writer, encoding="utf-8", newline="")
        return writer

    def close_io(self):
        """Close every input and output handle."""
        for infile in self.infiles:
            infile.close()
        for outfile in self.outfiles:
            outfile.close()

    def _is_gzip(self, infile):
        """Return True if ``infile`` starts with the gzip signature."""
        # Two gzip magic bytes followed by the "deflate" compression method.
        gzip_magic_byte = b"\x1f\x8b\x08"
        with open(infile, 'rb') as handle:
            return gzip_magic_byte == handle.read(len(gzip_magic_byte))

    def process_files(self):
        """Drive the line handlers over the inputs; implemented by subclasses."""
        raise NotImplementedError('Not implemented')

    def _append_lines(self, lines):
        """Append each line to the accumulated record of its input file."""
        self.cur_fastq_strs = ["".join((prev, cur)) for prev, cur in zip(self.cur_fastq_strs, lines)]

    def seq_action(self, lines):
        """Hash the joined sequence lines and flag whether the record is new."""
        cur_hash = self.hash_module("".join(lines))
        if cur_hash in self.seen_hashes:
            self.cur_uniq = False
        else:
            self.seen_hashes.add(cur_hash)
            self.cur_uniq = True
            self._append_lines(lines)

    def header_one_action(self, lines):
        """Start a new record from the first header line of every input."""
        self.cur_uniq = False
        self.cur_fastq_strs = lines

    def header_two_action(self, lines):
        """Append the second header line of every input to its record."""
        self._append_lines(lines)

    def qual_action(self, lines):
        """Complete the record and write it out if its sequence was unique."""
        if self.cur_uniq:
            self._append_lines(lines)
            for string, outfile in zip(self.cur_fastq_strs, self.outfiles):
                outfile.write(string)
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
93 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
94 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
class UniqueFastqPairsPy2(UniqueFastqBase):
    """Python 2 driver: walks the inputs in lockstep with ``itertools.izip``."""

    def process_files(self):
        """Feed each group of parallel input lines to the next FASTQ line handler."""
        for record in izip(self.fastq_cycle, *self.infiles):
            # The first element is the handler for this line position, the
            # remaining ones are the current line of every input file.
            handler, lines = record[0], record[1:]
            handler(lines)
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
102 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
103 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
class UniqueFastqPairsPy3(UniqueFastqBase):
    """Python 3 driver: walks the inputs in lockstep with the builtin ``zip``."""

    def process_files(self):
        """Feed each group of parallel input lines to the next FASTQ line handler."""
        for record in zip(self.fastq_cycle, *self.infiles):
            handler = record[0]
            # Lines may arrive as bytes or as str; normalise them to str.
            # This per-line check might be slow; rework it into something
            # smarter if it slows things down too much.
            text_lines = []
            for raw in record[1:]:
                text_lines.append(raw if isinstance(raw, str) else raw.decode())
            handler(text_lines)
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
113 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
114 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def get_args(argv=None):
    """Parse the command-line options of the deduplication tool.

    :param argv: optional list of argument strings; when ``None`` the
        arguments are taken from ``sys.argv`` (argparse's default).
    :returns: the ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(description='Get unique reads from fastq files')
    parser.add_argument('--r1_in', required=True, help='Read1 input fastq file')
    parser.add_argument('--r2_in', required=False, default=None, help='Read2 input fastq file')
    parser.add_argument('--r1_out', required=True, help='Read1 output fastq file')
    parser.add_argument('--r2_out', required=False, help='Read2 output fastq file')
    parser.add_argument('--write_gzip', action='store_true', help="Compress output in gzip format?")
    parser.add_argument('--buffer_size', default=32768, type=int, help="Set buffer size for reading gzip files")
    parser.add_argument('--compresslevel', default=2, type=int, choices=list(range(1, 10)), help="Set compression level (1: fastest, 9: highest compression)")
    parser.add_argument('--algo', default='smhasher', choices=['CityHash64', 'hashxx', 'smhasher'], help='Select hash algorithm')
    return parser.parse_args(argv)
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
126 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
127 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def get_infiles(args):
    """Return the input paths: read 1 always, read 2 only when it was given."""
    paths = [args.r1_in]
    if args.r2_in:
        paths.append(args.r2_in)
    return paths
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
133 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
134 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def get_outfiles(args):
    """Return the output paths: read 1 always, read 2 only when it was given."""
    paths = [args.r1_out]
    if args.r2_out:
        paths.append(args.r2_out)
    return paths
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
140 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
141 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def get_unique_fastq_instance():
    """Return the deduplicator class matching the running Python version.

    Despite the name, the class itself is returned, not an instance.
    Anything newer than Python 2 gets the Python 3 implementation, so the
    function never falls through and returns ``None``.
    """
    if sys.version_info[0] < 3:
        return UniqueFastqPairsPy2
    return UniqueFastqPairsPy3
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
147 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
148 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def main():
    """Command-line entry point: parse the options and deduplicate the reads."""
    options = get_args()
    dedup_class = get_unique_fastq_instance()
    # Constructing the object performs the whole run (open, process, close).
    dedup_class(
        infiles=get_infiles(options),
        outfiles=get_outfiles(options),
        write_gzip=options.write_gzip,
        buffer_size=options.buffer_size,
        compresslevel=options.compresslevel,
        hash_module=options.algo,
    )
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
158 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
159 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()