Mercurial > repos > iuc > rcorrector
comparison FilterUncorrectabledPEfastq.py @ 0:9a0b65ad3c84 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/rcorrector commit eba2cb86379f6ee6899a1141226e191c63fadfa6
author | iuc |
---|---|
date | Thu, 13 Sep 2018 07:00:00 -0400 |
parents | |
children | 6703b98884a2 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9a0b65ad3c84 |
---|---|
1 """ | |
2 author: adam h freedman | |
3 afreedman405 at gmail.com | |
4 date: Fri Aug 26 10:55:18 EDT 2016 | |
5 This script takes as an input Rcorrector error corrected Illumina paired-reads | |
6 in fastq format and: | |
7 1. Removes any reads that Rcorrector identifies as containing an error, | |
8 but can't be corrected, typically low complexity sequences. For these, | |
9 the header contains 'unfixable'. | |
10 2. Strips the ' cor' from headers of reads that Rcorrector fixed, to avoid | |
11 issues created by certain header formats for downstream tools. | |
12 3. Writes a log with counts of (a) read pairs that were removed because one end | |
13 was unfixable, (b) corrected left and right reads, (c) total number of | |
14 read pairs containing at least one corrected read. | |
15 Currently, this script only handles paired-end data, and handles either unzipped | |
16 or gzipped files on the fly, so long as the gzipped files end with 'gz'. | |
17 """ | |
18 | |
19 # import sys | |
20 import argparse | |
21 import gzip | |
22 from itertools import izip_longest | |
23 # izip | |
24 from os.path import basename | |
25 | |
26 | |
def get_input_streams(r1file, r2file):
    """Open the paired R1/R2 fastq files, transparently handling gzip.

    A file is treated as gzipped when its name ends in 'gz' (matching the
    contract stated in the module docstring); otherwise it is opened as a
    plain text file.  Both mates are assumed to share the same compression.

    Returns a ``(r1handle, r2handle)`` tuple of open file objects.

    NOTE(review): gzip streams are opened in 'rb' mode and would therefore
    yield bytes under Python 3 while plain files yield str — consistent with
    the script's Python 2 target, but confirm before porting.
    """
    # endswith() is the idiomatic (and equivalent) form of the original
    # r1file[-2:] == 'gz' slice comparison.
    if r1file.endswith('gz'):
        r1handle = gzip.open(r1file, 'rb')
        r2handle = gzip.open(r2file, 'rb')
    else:
        r1handle = open(r1file, 'r')
        r2handle = open(r2file, 'r')
    return r1handle, r2handle
35 | |
36 | |
def grouper(iterable, n, fillvalue=None):
    """Collect items from *iterable* into fixed-length tuples of size *n*.

    The final tuple is padded with *fillvalue* when the iterable's length is
    not a multiple of *n*, e.g. grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx.
    Used here to pull fastq records (4 lines each) off a file stream.
    """
    # n references to the SAME iterator advance in lockstep, so each tuple
    # consumes the next n items.
    chunks = [iter(iterable)] * n
    return izip_longest(*chunks, fillvalue=fillvalue)
42 | |
43 | |
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="options for filtering and logging rCorrector fastq outputs")
    parser.add_argument('-1', '--left_reads', dest='leftreads', type=str, help='R1 fastq file')
    parser.add_argument('-2', '--right_reads', dest='rightreads', type=str, help='R2 fastq file')
    parser.add_argument('-o', '--out_prefix', dest='outprefix', type=str, help="prefix for filtered fastq output")
    opts = parser.parse_args()

    # Output files keep the input basenames (minus any '.gz'), with the
    # user-supplied prefix prepended.
    r1out = open(opts.outprefix + '_' + basename(opts.leftreads).replace('.gz', ''), 'w')
    r2out = open(opts.outprefix + '_' + basename(opts.rightreads).replace('.gz', ''), 'w')

    r1_cor_count = 0    # R1 reads Rcorrector marked as corrected
    r2_cor_count = 0    # R2 reads Rcorrector marked as corrected
    pair_cor_count = 0  # pairs with at least one corrected mate
    unfix_count = 0     # pairs dropped because either mate was unfixable

    r1_stream, r2_stream = get_input_streams(opts.leftreads, opts.rightreads)
    with r1_stream as f1, r2_stream as f2:
        # A fastq record is exactly 4 lines: header, sequence, '+', quality.
        R1 = grouper(f1, 4)
        R2 = grouper(f2, 4)
        counter = 0
        for entry in R1:
            counter += 1
            if counter % 100000 == 0:
                # Parenthesized single-argument print behaves identically
                # under Python 2 and Python 3.
                print("%s reads processed" % counter)
            head1, seq1, placeholder1, qual1 = [i.strip() for i in entry]
            # next() builtin (Python >= 2.6) instead of the Python-2-only
            # R2.next(); advances the mate stream in lockstep with R1.
            head2, seq2, placeholder2, qual2 = [j.strip() for j in next(R2)]
            if 'unfixable' in head1 or 'unfixable' in head2:
                # Drop the whole pair when either mate could not be corrected.
                unfix_count += 1
            else:
                if 'cor' in head1:
                    r1_cor_count += 1
                if 'cor' in head2:
                    r2_cor_count += 1
                if 'cor' in head1 or 'cor' in head2:
                    pair_cor_count += 1
                # Trim Rcorrector's kmer statistic ("l:...") and the
                # whitespace character preceding it from the header.  The
                # trim is now guarded: the original unconditional
                # split('l:')[0][:-1] chopped the last character off any
                # header that happened to lack the statistic.
                if 'l:' in head1:
                    head1 = head1.split('l:')[0][:-1]
                if 'l:' in head2:
                    head2 = head2.split('l:')[0][:-1]
                r1out.write('%s\n' % '\n'.join([head1, seq1, placeholder1, qual1]))
                r2out.write('%s\n' % '\n'.join([head2, seq2, placeholder2, qual2]))

    # Summary log.  Leak fixed: the original never closed this handle.
    unfix_log = open('rmunfixable.log', 'w')
    unfix_log.write('total PE reads:%s\nremoved PE reads:%s\nretained PE reads:%s\nR1 corrected:%s\nR2 corrected:%s\npairs corrected:%s\n' % (counter, unfix_count, counter - unfix_count, r1_cor_count, r2_cor_count, pair_cor_count))
    unfix_log.close()

    r1out.close()
    r2out.close()