Mercurial > repos > nick > sequence_content_trimmer
annotate trimmer.py @ 0:7f170cb06e2e draft
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
author | nick |
---|---|
date | Tue, 01 Dec 2015 21:33:27 -0500 |
parents | |
children | 464aee13e2df |
rev | line source |
---|---|
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
1 #!/usr/bin/env python |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
2 from __future__ import division |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
3 import sys |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
4 import argparse |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
5 import getreads |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
6 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
7 OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'} |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
8 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]" |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
9 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
10 N's.""" |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
11 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
12 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
13 def main(argv): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
14 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
15 parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
16 parser.set_defaults(**OPT_DEFAULTS) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
17 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
18 parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
19 default=sys.stdin, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
20 help='Input reads (mate 1). Omit to read from stdin.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
21 parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
22 help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out ' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
23 'entirely, the other will also be lost).') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
24 parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
25 default=sys.stdout, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
26 help='Output file for mate 1. WARNING: Will overwrite.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
27 parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
28 help='Output file for mate 2. WARNING: Will overwrite.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
29 parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
30 help='Input read format.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
31 parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
32 help='Output read format. Default: whatever the input format is.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
33 parser.add_argument('-b', '--filt-bases', |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
34 help='The bases to filter on. Case-insensitive. Default: %(default)s.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
35 parser.add_argument('-t', '--thres', type=float, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
36 help='The threshold. The read will be trimmed once the proportion of filter bases in the ' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
37 'window exceed this fraction (not a percentage). Default: %(default)s.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
38 parser.add_argument('-w', '--window', dest='win_len', type=int, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
39 help='Window size for trimming. Default: %(default)s.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
40 parser.add_argument('-i', '--invert', action='store_true', |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
41 help='Invert the filter bases: filter on bases NOT present in the --filt-bases.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
42 parser.add_argument('-m', '--min-length', type=int, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
43 help='Set a minimum read length. Reads which are trimmed below this length will be filtered ' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
44 'out (omitted entirely from the output). Read pairs will be preserved: both reads in a ' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
45 'pair must exceed this length to be kept. Set to 0 to only omit empty reads.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
46 parser.add_argument('--error', |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
47 help='Fail with this error message (useful for Galaxy tool).') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
48 parser.add_argument('-A', '--acgt', action='store_true', |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
49 help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
50 parser.add_argument('-I', '--iupac', action='store_true', |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
51 help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
52 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
53 args = parser.parse_args(argv[1:]) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
54 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
55 if args.error: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
56 fail('Error: '+args.error) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
57 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
58 # Catch invalid argument combinations. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
59 if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
60 fail('Error: If giving two input files (paired end), must specify both output files.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
61 # Determine filetypes, open input file parsers. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
62 filetype1 = get_filetype(args.infile1, args.filetype) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
63 file1_parser = iter(getreads.getparser(args.infile1, filetype=filetype1)) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
64 if args.infile2: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
65 paired = True |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
66 filetype2 = get_filetype(args.infile2, args.filetype) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
67 file2_parser = iter(getreads.getparser(args.infile2, filetype=filetype2)) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
68 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
69 filetype2 = None |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
70 file2_parser = None |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
71 paired = False |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
72 # Override output filetypes if it was specified on the command line. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
73 if args.out_filetype: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
74 filetype1 = args.out_filetype |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
75 filetype2 = args.out_filetype |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
76 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
77 # Determine the filter bases and whether to invert the selection. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
78 filt_bases = args.filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
79 invert = args.invert |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
80 if args.acgt: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
81 filt_bases = 'ACGT' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
82 invert = True |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
83 elif args.iupac: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
84 filt_bases = 'ACGTUWSMKRYBDHVN-' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
85 invert = True |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
86 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
87 # Do the actual trimming. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
88 try: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
89 trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
90 paired, args.win_len, args.thres, filt_bases, invert, args.min_length) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
91 finally: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
92 for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
93 if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
94 filehandle.close() |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
95 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
96 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
97 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
98 win_len, thres, filt_bases, invert, min_length): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
99 """Trim all the reads in the input file(s), writing to the output file(s).""" |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
100 read1 = None |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
101 read2 = None |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
102 while True: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
103 # Read in the reads. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
104 try: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
105 read1 = next(file1_parser) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
106 if paired: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
107 read2 = next(file2_parser) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
108 except StopIteration: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
109 break |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
110 # Do trimming. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
111 read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
112 if filetype1 == 'fastq': |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
113 # If the output filetype is FASTQ, trim the quality scores too. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
114 # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
115 # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
116 qual1 = read1.qual or 'z' * len(read1.seq) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
117 read1.qual = qual1[:len(read1.seq)] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
118 if paired: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
119 read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
120 if filetype2 == 'fastq': |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
121 qual2 = read2.qual or 'z' * len(read2.seq) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
122 read2.qual = qual2[:len(read2.seq)] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
123 # Output reads if they both pass the minimum length threshold (if any was given). |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
124 if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
125 write_read(outfile1, read1, filetype1) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
126 write_read(outfile2, read2, filetype2) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
127 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
128 # Output read if it passes the minimum length threshold (if any was given). |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
129 if min_length is None or len(read1.seq) >= min_length: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
130 write_read(outfile1, read1, filetype1) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
131 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
132 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
133 def get_filetype(infile, filetype_arg): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
134 if infile is sys.stdin: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
135 if filetype_arg: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
136 filetype = filetype_arg |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
137 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
138 fail('Error: You must specify the --format if reading from stdin.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
139 elif infile: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
140 if filetype_arg: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
141 filetype = filetype_arg |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
142 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
143 if infile.name.endswith('.fa') or infile.name.endswith('.fasta'): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
144 filetype = 'fasta' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
145 elif infile.name.endswith('.fq') or infile.name.endswith('.fastq'): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
146 filetype = 'fastq' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
147 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
148 fail('Error: Unrecognized file ending on "{}". Please specify the --format.'.format(infile)) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
149 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
150 fail('Error: infile is {}'.format(infile)) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
151 return filetype |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
152 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
153 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
154 def write_read(filehandle, read, filetype): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
155 if filetype == 'fasta': |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
156 filehandle.write('>{name}\n{seq}\n'.format(**vars(read))) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
157 elif filetype == 'fastq': |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
158 filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read))) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
159 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
160 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
161 def trim_read(seq, win_len, thres, filt_bases, invert): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
162 """Trim an individual read and return its trimmed sequence. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
163 This will track the frequency of bad bases in a window of length win_len, and trim once the |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
164 frequency goes below thres. The trim point will be just before the first (leftmost) bad base in |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
165 the window (the first window with a frequency below thres). The "bad" bases are the ones in |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
166 filt_bases if invert is False, or any base NOT in filt_bases if invert is True.""" |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
167 # Algorithm: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
168 # The window is a list which acts as a FIFO. As we scan from the left (3') end to the right (5') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
169 # end, we append new bases to the right end of the window and pop them from the left end. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
170 # Each base is only examined twice: when it enters the window and when it leaves it. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
171 # We keep a running total of the number of bad bases in bad_bases_count, incrementing it when bad |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
172 # bases enter the window and decrementing it when they leave. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
173 # We also track the location of bad bases in the window with bad_bases_coords so we can figure out |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
174 # where to cut if we have to trim. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
175 max_bad_bases = win_len * thres |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
176 window = [] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
177 bad_bases_count = 0 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
178 bad_bases_coords = [] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
179 for coord, base in enumerate(seq.upper()): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
180 # Shift window, adjust bad_bases_count and bad_bases_coords list. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
181 window.append(base) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
182 # Is the new base we're adding to the window a bad base? |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
183 if invert: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
184 bad_base = base not in filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
185 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
186 bad_base = base in filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
187 # If so, increment the total and add its coordinate to the window. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
188 if bad_base: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
189 bad_bases_count += 1 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
190 bad_bases_coords.append(coord) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
191 if len(window) > win_len: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
192 first_base = window.pop(0) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
193 # Is the base we're removing (the first base in the window) a bad base? |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
194 if invert: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
195 bad_base = first_base not in filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
196 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
197 bad_base = first_base in filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
198 # If so, decrement the total and remove its coordinate from the window. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
199 if bad_base: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
200 bad_bases_count -= 1 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
201 bad_bases_coords.pop(0) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
202 # print bad_bases_coords |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
203 # Are we over the threshold? |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
204 if bad_bases_count > max_bad_bases: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
205 break |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
206 # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
207 if bad_bases_count > max_bad_bases: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
208 first_bad_base = bad_bases_coords[0] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
209 return seq[0:first_bad_base] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
210 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
211 return seq |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
212 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
213 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
214 def fail(message): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
215 sys.stderr.write(message+"\n") |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
216 sys.exit(1) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
217 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
218 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
219 if __name__ == '__main__': |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
220 sys.exit(main(sys.argv)) |