Mercurial > repos > nick > sequence_content_trimmer
annotate trimmer.py @ 1:464aee13e2df draft default tip
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
author | nick |
---|---|
date | Fri, 27 May 2022 23:29:45 +0000 |
parents | 7f170cb06e2e |
children |
rev | line source |
---|---|
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
1 #!/usr/bin/env python3 |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
2 import sys |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
3 import argparse |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
4 import collections |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
5 import getreads |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
6 |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
7 QUANT_ORDER = 5 |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
8 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]" |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
9 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
10 N's.""" |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
11 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
12 |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
13 def make_argparser(): |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
14 parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
15 parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
16 default=sys.stdin, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
17 help='Input reads (mate 1). Omit to read from stdin.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
18 parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
19 help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out ' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
20 'entirely, the other will also be lost).') |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
21 parser.add_argument('outfile1', metavar='output_1.fq', nargs='?', type=argparse.FileType('w'), |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
22 default=sys.stdout, |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
23 help='Output file for mate 1. WARNING: Will overwrite.') |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
24 parser.add_argument('outfile2', metavar='output_2.fq', nargs='?', type=argparse.FileType('w'), |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
25 help='Output file for mate 2. WARNING: Will overwrite.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
26 parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
27 help='Input read format.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
28 parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'), |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
29 help='Output read format. Default: whatever the input format is.') |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
30 parser.add_argument('-b', '--filt-bases', default='N', |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
31 help='The bases to filter on. Case-insensitive. Default: %(default)s.') |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
32 parser.add_argument('-t', '--thres', type=float, default=0.5, |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
33 help='The threshold. The read will be trimmed once the proportion of filter bases in the ' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
34 'window exceed this fraction (not a percentage). Default: %(default)s.') |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
35 parser.add_argument('-w', '--window', dest='win_len', type=int, default=1, |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
36 help='Window size for trimming. Default: %(default)s.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
37 parser.add_argument('-i', '--invert', action='store_true', |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
38 help='Invert the filter bases: filter on bases NOT present in the --filt-bases.') |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
39 parser.add_argument('-m', '--min-length', type=int, default=1, |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
40 help='Set a minimum read length. Reads which are trimmed below this length will be filtered ' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
41 'out (omitted entirely from the output). Read pairs will be preserved: both reads in a ' |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
42 'pair must exceed this length to be kept. Set to 1 to only omit empty reads. ' |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
43 'Default: %(default)s.') |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
44 parser.add_argument('-A', '--acgt', action='store_true', |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
45 help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
46 parser.add_argument('-I', '--iupac', action='store_true', |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
47 help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").') |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
48 parser.add_argument('-q', '--quiet', action='store_true', |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
49 help='Don\'t print trimming stats on completion.') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
50 parser.add_argument('-T', '--tsv', dest='stats_format', default='human', |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
51 action='store_const', const='tsv') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
52 return parser |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
53 |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
54 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
55 def main(argv): |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
56 parser = make_argparser() |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
57 args = parser.parse_args(argv[1:]) |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
58 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
59 # Catch invalid argument combinations. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
60 if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
61 fail('Error: If giving two input files (paired end), must specify both output files.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
62 # Determine filetypes, open input file parsers. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
63 filetype1 = get_filetype(args.infile1, args.filetype) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
64 file1_parser = iter(getreads.getparser(args.infile1, filetype=filetype1)) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
65 if args.infile2: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
66 paired = True |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
67 filetype2 = get_filetype(args.infile2, args.filetype) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
68 file2_parser = iter(getreads.getparser(args.infile2, filetype=filetype2)) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
69 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
70 filetype2 = None |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
71 file2_parser = None |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
72 paired = False |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
73 # Override output filetypes if it was specified on the command line. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
74 if args.out_filetype: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
75 filetype1 = args.out_filetype |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
76 filetype2 = args.out_filetype |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
77 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
78 # Determine the filter bases and whether to invert the selection. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
79 filt_bases = args.filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
80 invert = args.invert |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
81 if args.acgt: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
82 filt_bases = 'ACGT' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
83 invert = True |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
84 elif args.iupac: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
85 filt_bases = 'ACGTUWSMKRYBDHVN-' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
86 invert = True |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
87 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
88 # Do the actual trimming. |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
89 filters = {'win_len':args.win_len, 'thres':args.thres, 'filt_bases':filt_bases, 'invert':invert, |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
90 'min_len':args.min_length} |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
91 try: |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
92 stats = trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
93 filetype1, filetype2, paired, filters) |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
94 finally: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
95 for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
96 if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
97 filehandle.close() |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
98 |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
99 if not args.quiet: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
100 print_stats(stats, args.stats_format) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
101 |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
102 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
103 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired, |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
104 filters): |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
105 """Trim all the reads in the input file(s), writing to the output file(s).""" |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
106 min_len = filters['min_len'] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
107 trims1 = collections.Counter() |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
108 trims2 = collections.Counter() |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
109 omitted1 = collections.Counter() |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
110 omitted2 = collections.Counter() |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
111 read1 = None |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
112 read2 = None |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
113 while True: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
114 # Read in the reads. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
115 try: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
116 read1 = next(file1_parser) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
117 if paired: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
118 read2 = next(file2_parser) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
119 except StopIteration: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
120 break |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
121 # Do trimming. |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
122 read1, trim_len1 = trim_read(read1, filters, filetype1) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
123 trims1[trim_len1] += 1 |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
124 if paired: |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
125 read2, trim_len2 = trim_read(read2, filters, filetype2) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
126 trims2[trim_len2] += 1 |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
127 # Output reads if they both pass the minimum length threshold (if any was given). |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
128 if min_len is None or (len(read1.seq) >= min_len and len(read2.seq) >= min_len): |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
129 write_read(outfile1, read1, filetype1) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
130 write_read(outfile2, read2, filetype2) |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
131 else: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
132 if len(read1.seq) < min_len: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
133 omitted1[trim_len1] += 1 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
134 if len(read2.seq) < min_len: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
135 omitted2[trim_len2] += 1 |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
136 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
137 # Output read if it passes the minimum length threshold (if any was given). |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
138 if min_len is None or len(read1.seq) >= min_len: |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
139 write_read(outfile1, read1, filetype1) |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
140 else: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
141 omitted1[trim_len1] += 1 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
142 # Compile stats. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
143 stats = {} |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
144 stats['reads'] = sum(trims1.values()) + sum(trims2.values()) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
145 stats['trimmed'] = stats['reads'] - trims1[0] - trims2[0] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
146 stats['omitted'] = sum(omitted1.values()) + sum(omitted2.values()) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
147 if paired: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
148 stats['trimmed1'] = stats['reads']//2 - trims1[0] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
149 stats['trimmed2'] = stats['reads']//2 - trims2[0] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
150 stats['omitted1'] = sum(omitted1.values()) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
151 stats['omitted2'] = sum(omitted2.values()) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
152 # Quintiles for trim lengths. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
153 stats['quants'] = {'order':QUANT_ORDER} |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
154 if paired: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
155 stats['quants']['trim1'] = get_counter_quantiles(trims1, order=QUANT_ORDER) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
156 stats['quants']['trim2'] = get_counter_quantiles(trims2, order=QUANT_ORDER) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
157 stats['quants']['trim'] = get_counter_quantiles(trims1 + trims2, order=QUANT_ORDER) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
158 stats['quants']['omitted_trim1'] = get_counter_quantiles(omitted1, order=QUANT_ORDER) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
159 stats['quants']['omitted_trim2'] = get_counter_quantiles(omitted2, order=QUANT_ORDER) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
160 stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1 + omitted2, order=QUANT_ORDER) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
161 else: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
162 stats['quants']['trim'] = get_counter_quantiles(trims1) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
163 stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
164 return stats |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
165 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
166 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
167 def get_filetype(infile, filetype_arg): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
168 if infile is sys.stdin: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
169 if filetype_arg: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
170 filetype = filetype_arg |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
171 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
172 fail('Error: You must specify the --format if reading from stdin.') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
173 elif infile: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
174 if filetype_arg: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
175 filetype = filetype_arg |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
176 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
177 if infile.name.endswith('.fa') or infile.name.endswith('.fasta'): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
178 filetype = 'fasta' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
179 elif infile.name.endswith('.fq') or infile.name.endswith('.fastq'): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
180 filetype = 'fastq' |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
181 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
182 fail('Error: Unrecognized file ending on "{}". Please specify the --format.'.format(infile)) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
183 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
184 fail('Error: infile is {}'.format(infile)) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
185 return filetype |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
186 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
187 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
188 def write_read(filehandle, read, filetype): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
189 if filetype == 'fasta': |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
190 filehandle.write('>{name}\n{seq}\n'.format(**vars(read))) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
191 elif filetype == 'fastq': |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
192 filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read))) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
193 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
194 |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
195 def trim_read(read, filters, filetype): |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
196 trimmed_seq = trim_seq(read.seq, **filters) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
197 trim_len = len(read.seq) - len(trimmed_seq) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
198 read.seq = trimmed_seq |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
199 if filetype == 'fastq': |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
200 # If the output filetype is FASTQ, trim the quality scores too. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
201 # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
202 # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
203 qual = read.qual or 'z' * len(read.seq) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
204 read.qual = qual[:len(read.seq)] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
205 return read, trim_len |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
206 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
207 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
208 def trim_seq(seq, win_len=1, thres=1.0, filt_bases='N', invert=False, **kwargs): |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
209 """Trim an individual read and return its trimmed sequence. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
210 This will track the frequency of bad bases in a window of length win_len, and trim once the |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
211 frequency goes below thres. The trim point will be just before the first (leftmost) bad base in |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
212 the window (the first window with a frequency below thres). The "bad" bases are the ones in |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
213 filt_bases if invert is False, or any base NOT in filt_bases if invert is True.""" |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
214 # Algorithm: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
215 # The window is a list which acts as a FIFO. As we scan from the left (3') end to the right (5') |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
216 # end, we append new bases to the right end of the window and pop them from the left end. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
217 # Each base is only examined twice: when it enters the window and when it leaves it. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
218 # We keep a running total of the number of bad bases in bad_bases_count, incrementing it when bad |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
219 # bases enter the window and decrementing it when they leave. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
220 # We also track the location of bad bases in the window with bad_bases_coords so we can figure out |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
221 # where to cut if we have to trim. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
222 max_bad_bases = win_len * thres |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
223 window = [] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
224 bad_bases_count = 0 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
225 bad_bases_coords = [] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
226 for coord, base in enumerate(seq.upper()): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
227 # Shift window, adjust bad_bases_count and bad_bases_coords list. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
228 window.append(base) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
229 # Is the new base we're adding to the window a bad base? |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
230 if invert: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
231 bad_base = base not in filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
232 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
233 bad_base = base in filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
234 # If so, increment the total and add its coordinate to the window. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
235 if bad_base: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
236 bad_bases_count += 1 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
237 bad_bases_coords.append(coord) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
238 if len(window) > win_len: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
239 first_base = window.pop(0) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
240 # Is the base we're removing (the first base in the window) a bad base? |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
241 if invert: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
242 bad_base = first_base not in filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
243 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
244 bad_base = first_base in filt_bases |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
245 # If so, decrement the total and remove its coordinate from the window. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
246 if bad_base: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
247 bad_bases_count -= 1 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
248 bad_bases_coords.pop(0) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
249 # Are we over the threshold? |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
250 if bad_bases_count > max_bad_bases: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
251 break |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
252 # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
253 if bad_bases_count > max_bad_bases: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
254 first_bad_base = bad_bases_coords[0] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
255 return seq[0:first_bad_base] |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
256 else: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
257 return seq |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
258 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
259 |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
260 def get_counter_quantiles(counter, order=5): |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
261 """Return an arbitrary set of quantiles (including min and max values). |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
262 `counter` is a collections.Counter. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
263 `order` is which quantile to perform (4 = quartiles, 5 = quintiles). |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
264 Warning: This expects a counter which has counted at least `order` elements. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
265 If it receives a counter with fewer elements, it will simply return `list(counter.elements())`. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
266 This will have fewer than the usual order+1 elements, and may not fit normal expectations of |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
267 what "quantiles" should be.""" |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
268 quantiles = [] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
269 total = sum(counter.values()) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
270 if total <= order: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
271 return list(counter.elements()) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
272 span_size = total / order |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
273 # Sort the items and go through them, looking for the one at the break points. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
274 items = list(sorted(counter.items(), key=lambda i: i[0])) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
275 quantiles.append(items[0][0]) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
276 total_seen = 0 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
277 current_span = 1 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
278 cut_point = int(round(current_span*span_size)) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
279 for item, count in items: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
280 total_seen += count |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
281 if total_seen >= cut_point: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
282 quantiles.append(item) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
283 current_span += 1 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
284 cut_point = int(round(current_span*span_size)) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
285 return quantiles |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
286 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
287 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
288 def print_stats(stats, format='human'): |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
289 if format == 'human': |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
290 lines = get_stats_lines_human(stats) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
291 elif format == 'tsv': |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
292 lines = get_stats_lines_tsv(stats) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
293 else: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
294 fail('Error: Unrecognized format {!r}'.format(format)) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
295 sys.stderr.write('\n'.join(lines).format(**stats)+'\n') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
296 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
297 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
298 def get_stats_lines_human(stats): |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
299 # Single-stat lines: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
300 lines = [ |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
301 'Total reads in input:\t{reads}', |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
302 'Reads trimmed:\t{trimmed}' |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
303 ] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
304 if 'trimmed1' in stats and 'trimmed2' in stats: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
305 lines.append(' For mate 1:\t{trimmed1}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
306 lines.append(' For mate 2:\t{trimmed2}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
307 lines.append('Reads filtered out:\t{omitted}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
308 if 'omitted1' in stats and 'omitted2' in stats: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
309 lines.append(' For mate 1:\t{omitted1}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
310 lines.append(' For mate 2:\t{omitted2}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
311 # Quantile lines: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
312 quantile_lines = [ |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
313 ('Bases trimmed quintiles', 'trim'), |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
314 (' For mate 1', 'trim1'), |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
315 (' For mate 2', 'trim2'), |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
316 ('Bases trimmed quintiles from filtered reads', 'omitted_trim'), |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
317 (' For mate 1', 'omitted_trim1'), |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
318 (' For mate 2', 'omitted_trim2') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
319 ] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
320 for desc, stat_name in quantile_lines: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
321 if stat_name in stats['quants']: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
322 quants_values = stats['quants'][stat_name] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
323 if quants_values: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
324 quants_str = ', '.join(map(str, quants_values)) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
325 else: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
326 quants_str = 'N/A' |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
327 line = desc+':\t'+quants_str |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
328 lines.append(line) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
329 return lines |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
330 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
331 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
332 def get_stats_lines_tsv(stats): |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
333 lines = ['{reads}'] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
334 if 'trimmed1' in stats and 'trimmed2' in stats: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
335 lines.append('{trimmed}\t{trimmed1}\t{trimmed2}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
336 else: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
337 lines.append('{trimmed}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
338 if 'omitted1' in stats and 'omitted2' in stats: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
339 lines.append('{omitted}\t{omitted1}\t{omitted2}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
340 else: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
341 lines.append('{omitted}') |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
342 for stat_name in ('trim', 'trim1', 'trim2', 'omitted_trim', 'omitted_trim1', 'omitted_trim2'): |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
343 if stat_name in stats['quants']: |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
344 quants_values = stats['quants'][stat_name] |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
345 lines.append('\t'.join(map(str, quants_values))) |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
346 return lines |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
347 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
348 |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
349 def fail(message): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
350 sys.stderr.write(message+"\n") |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
351 sys.exit(1) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
352 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
353 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
354 if __name__ == '__main__': |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
355 sys.exit(main(sys.argv)) |