annotate trimmer.py @ 1:464aee13e2df draft default tip

"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
author nick
date Fri, 27 May 2022 23:29:45 +0000
parents 7f170cb06e2e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
1 #!/usr/bin/env python3
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
2 import sys
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
3 import argparse
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
4 import collections
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
5 import getreads
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
6
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
7 QUANT_ORDER = 5
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
8 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]"
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
9 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
10 N's."""
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
11
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
12
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
13 def make_argparser():
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
14 parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
15 parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'),
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
16 default=sys.stdin,
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
17 help='Input reads (mate 1). Omit to read from stdin.')
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
18 parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'),
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
19 help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out '
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
20 'entirely, the other will also be lost).')
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
21 parser.add_argument('outfile1', metavar='output_1.fq', nargs='?', type=argparse.FileType('w'),
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
22 default=sys.stdout,
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
23 help='Output file for mate 1. WARNING: Will overwrite.')
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
24 parser.add_argument('outfile2', metavar='output_2.fq', nargs='?', type=argparse.FileType('w'),
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
25 help='Output file for mate 2. WARNING: Will overwrite.')
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
26 parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'),
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
27 help='Input read format.')
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
28 parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'),
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
29 help='Output read format. Default: whatever the input format is.')
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
30 parser.add_argument('-b', '--filt-bases', default='N',
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
31 help='The bases to filter on. Case-insensitive. Default: %(default)s.')
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
32 parser.add_argument('-t', '--thres', type=float, default=0.5,
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
33 help='The threshold. The read will be trimmed once the proportion of filter bases in the '
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
34 'window exceed this fraction (not a percentage). Default: %(default)s.')
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
35 parser.add_argument('-w', '--window', dest='win_len', type=int, default=1,
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
36 help='Window size for trimming. Default: %(default)s.')
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
37 parser.add_argument('-i', '--invert', action='store_true',
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
38 help='Invert the filter bases: filter on bases NOT present in the --filt-bases.')
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
39 parser.add_argument('-m', '--min-length', type=int, default=1,
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
40 help='Set a minimum read length. Reads which are trimmed below this length will be filtered '
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
41 'out (omitted entirely from the output). Read pairs will be preserved: both reads in a '
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
42 'pair must exceed this length to be kept. Set to 1 to only omit empty reads. '
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
43 'Default: %(default)s.')
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
44 parser.add_argument('-A', '--acgt', action='store_true',
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
45 help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").')
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
46 parser.add_argument('-I', '--iupac', action='store_true',
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
47 help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").')
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
48 parser.add_argument('-q', '--quiet', action='store_true',
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
49 help='Don\'t print trimming stats on completion.')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
50 parser.add_argument('-T', '--tsv', dest='stats_format', default='human',
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
51 action='store_const', const='tsv')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
52 return parser
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
53
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
54
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
55 def main(argv):
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
56 parser = make_argparser()
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
57 args = parser.parse_args(argv[1:])
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
58
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
59 # Catch invalid argument combinations.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
60 if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
61 fail('Error: If giving two input files (paired end), must specify both output files.')
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
62 # Determine filetypes, open input file parsers.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
63 filetype1 = get_filetype(args.infile1, args.filetype)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
64 file1_parser = iter(getreads.getparser(args.infile1, filetype=filetype1))
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
65 if args.infile2:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
66 paired = True
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
67 filetype2 = get_filetype(args.infile2, args.filetype)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
68 file2_parser = iter(getreads.getparser(args.infile2, filetype=filetype2))
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
69 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
70 filetype2 = None
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
71 file2_parser = None
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
72 paired = False
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
73 # Override output filetypes if it was specified on the command line.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
74 if args.out_filetype:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
75 filetype1 = args.out_filetype
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
76 filetype2 = args.out_filetype
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
77
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
78 # Determine the filter bases and whether to invert the selection.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
79 filt_bases = args.filt_bases
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
80 invert = args.invert
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
81 if args.acgt:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
82 filt_bases = 'ACGT'
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
83 invert = True
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
84 elif args.iupac:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
85 filt_bases = 'ACGTUWSMKRYBDHVN-'
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
86 invert = True
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
87
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
88 # Do the actual trimming.
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
89 filters = {'win_len':args.win_len, 'thres':args.thres, 'filt_bases':filt_bases, 'invert':invert,
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
90 'min_len':args.min_length}
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
91 try:
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
92 stats = trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2,
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
93 filetype1, filetype2, paired, filters)
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
94 finally:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
95 for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
96 if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
97 filehandle.close()
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
98
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
99 if not args.quiet:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
100 print_stats(stats, args.stats_format)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
101
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
102
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
103 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired,
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
104 filters):
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
105 """Trim all the reads in the input file(s), writing to the output file(s)."""
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
106 min_len = filters['min_len']
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
107 trims1 = collections.Counter()
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
108 trims2 = collections.Counter()
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
109 omitted1 = collections.Counter()
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
110 omitted2 = collections.Counter()
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
111 read1 = None
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
112 read2 = None
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
113 while True:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
114 # Read in the reads.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
115 try:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
116 read1 = next(file1_parser)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
117 if paired:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
118 read2 = next(file2_parser)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
119 except StopIteration:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
120 break
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
121 # Do trimming.
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
122 read1, trim_len1 = trim_read(read1, filters, filetype1)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
123 trims1[trim_len1] += 1
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
124 if paired:
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
125 read2, trim_len2 = trim_read(read2, filters, filetype2)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
126 trims2[trim_len2] += 1
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
127 # Output reads if they both pass the minimum length threshold (if any was given).
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
128 if min_len is None or (len(read1.seq) >= min_len and len(read2.seq) >= min_len):
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
129 write_read(outfile1, read1, filetype1)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
130 write_read(outfile2, read2, filetype2)
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
131 else:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
132 if len(read1.seq) < min_len:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
133 omitted1[trim_len1] += 1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
134 if len(read2.seq) < min_len:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
135 omitted2[trim_len2] += 1
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
136 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
137 # Output read if it passes the minimum length threshold (if any was given).
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
138 if min_len is None or len(read1.seq) >= min_len:
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
139 write_read(outfile1, read1, filetype1)
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
140 else:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
141 omitted1[trim_len1] += 1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
142 # Compile stats.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
143 stats = {}
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
144 stats['reads'] = sum(trims1.values()) + sum(trims2.values())
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
145 stats['trimmed'] = stats['reads'] - trims1[0] - trims2[0]
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
146 stats['omitted'] = sum(omitted1.values()) + sum(omitted2.values())
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
147 if paired:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
148 stats['trimmed1'] = stats['reads']//2 - trims1[0]
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
149 stats['trimmed2'] = stats['reads']//2 - trims2[0]
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
150 stats['omitted1'] = sum(omitted1.values())
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
151 stats['omitted2'] = sum(omitted2.values())
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
152 # Quintiles for trim lengths.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
153 stats['quants'] = {'order':QUANT_ORDER}
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
154 if paired:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
155 stats['quants']['trim1'] = get_counter_quantiles(trims1, order=QUANT_ORDER)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
156 stats['quants']['trim2'] = get_counter_quantiles(trims2, order=QUANT_ORDER)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
157 stats['quants']['trim'] = get_counter_quantiles(trims1 + trims2, order=QUANT_ORDER)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
158 stats['quants']['omitted_trim1'] = get_counter_quantiles(omitted1, order=QUANT_ORDER)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
159 stats['quants']['omitted_trim2'] = get_counter_quantiles(omitted2, order=QUANT_ORDER)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
160 stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1 + omitted2, order=QUANT_ORDER)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
161 else:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
162 stats['quants']['trim'] = get_counter_quantiles(trims1)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
163 stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
164 return stats
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
165
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
166
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
167 def get_filetype(infile, filetype_arg):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
168 if infile is sys.stdin:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
169 if filetype_arg:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
170 filetype = filetype_arg
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
171 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
172 fail('Error: You must specify the --format if reading from stdin.')
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
173 elif infile:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
174 if filetype_arg:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
175 filetype = filetype_arg
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
176 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
177 if infile.name.endswith('.fa') or infile.name.endswith('.fasta'):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
178 filetype = 'fasta'
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
179 elif infile.name.endswith('.fq') or infile.name.endswith('.fastq'):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
180 filetype = 'fastq'
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
181 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
182 fail('Error: Unrecognized file ending on "{}". Please specify the --format.'.format(infile))
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
183 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
184 fail('Error: infile is {}'.format(infile))
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
185 return filetype
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
186
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
187
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
188 def write_read(filehandle, read, filetype):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
189 if filetype == 'fasta':
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
190 filehandle.write('>{name}\n{seq}\n'.format(**vars(read)))
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
191 elif filetype == 'fastq':
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
192 filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read)))
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
193
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
194
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
195 def trim_read(read, filters, filetype):
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
196 trimmed_seq = trim_seq(read.seq, **filters)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
197 trim_len = len(read.seq) - len(trimmed_seq)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
198 read.seq = trimmed_seq
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
199 if filetype == 'fastq':
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
200 # If the output filetype is FASTQ, trim the quality scores too.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
201 # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
202 # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
203 qual = read.qual or 'z' * len(read.seq)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
204 read.qual = qual[:len(read.seq)]
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
205 return read, trim_len
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
206
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
207
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
208 def trim_seq(seq, win_len=1, thres=1.0, filt_bases='N', invert=False, **kwargs):
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
209 """Trim an individual read and return its trimmed sequence.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
210 This will track the frequency of bad bases in a window of length win_len, and trim once the
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
211 frequency goes below thres. The trim point will be just before the first (leftmost) bad base in
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
212 the window (the first window with a frequency below thres). The "bad" bases are the ones in
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
213 filt_bases if invert is False, or any base NOT in filt_bases if invert is True."""
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
214 # Algorithm:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
215 # The window is a list which acts as a FIFO. As we scan from the left (3') end to the right (5')
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
216 # end, we append new bases to the right end of the window and pop them from the left end.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
217 # Each base is only examined twice: when it enters the window and when it leaves it.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
218 # We keep a running total of the number of bad bases in bad_bases_count, incrementing it when bad
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
219 # bases enter the window and decrementing it when they leave.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
220 # We also track the location of bad bases in the window with bad_bases_coords so we can figure out
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
221 # where to cut if we have to trim.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
222 max_bad_bases = win_len * thres
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
223 window = []
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
224 bad_bases_count = 0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
225 bad_bases_coords = []
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
226 for coord, base in enumerate(seq.upper()):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
227 # Shift window, adjust bad_bases_count and bad_bases_coords list.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
228 window.append(base)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
229 # Is the new base we're adding to the window a bad base?
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
230 if invert:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
231 bad_base = base not in filt_bases
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
232 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
233 bad_base = base in filt_bases
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
234 # If so, increment the total and add its coordinate to the window.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
235 if bad_base:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
236 bad_bases_count += 1
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
237 bad_bases_coords.append(coord)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
238 if len(window) > win_len:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
239 first_base = window.pop(0)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
240 # Is the base we're removing (the first base in the window) a bad base?
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
241 if invert:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
242 bad_base = first_base not in filt_bases
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
243 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
244 bad_base = first_base in filt_bases
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
245 # If so, decrement the total and remove its coordinate from the window.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
246 if bad_base:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
247 bad_bases_count -= 1
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
248 bad_bases_coords.pop(0)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
249 # Are we over the threshold?
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
250 if bad_bases_count > max_bad_bases:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
251 break
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
252 # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
253 if bad_bases_count > max_bad_bases:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
254 first_bad_base = bad_bases_coords[0]
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
255 return seq[0:first_bad_base]
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
256 else:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
257 return seq
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
258
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
259
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
260 def get_counter_quantiles(counter, order=5):
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
261 """Return an arbitrary set of quantiles (including min and max values).
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
262 `counter` is a collections.Counter.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
263 `order` is which quantile to perform (4 = quartiles, 5 = quintiles).
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
264 Warning: This expects a counter which has counted at least `order` elements.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
265 If it receives a counter with fewer elements, it will simply return `list(counter.elements())`.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
266 This will have fewer than the usual order+1 elements, and may not fit normal expectations of
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
267 what "quantiles" should be."""
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
268 quantiles = []
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
269 total = sum(counter.values())
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
270 if total <= order:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
271 return list(counter.elements())
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
272 span_size = total / order
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
273 # Sort the items and go through them, looking for the one at the break points.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
274 items = list(sorted(counter.items(), key=lambda i: i[0]))
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
275 quantiles.append(items[0][0])
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
276 total_seen = 0
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
277 current_span = 1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
278 cut_point = int(round(current_span*span_size))
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
279 for item, count in items:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
280 total_seen += count
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
281 if total_seen >= cut_point:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
282 quantiles.append(item)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
283 current_span += 1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
284 cut_point = int(round(current_span*span_size))
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
285 return quantiles
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
286
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
287
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
288 def print_stats(stats, format='human'):
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
289 if format == 'human':
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
290 lines = get_stats_lines_human(stats)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
291 elif format == 'tsv':
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
292 lines = get_stats_lines_tsv(stats)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
293 else:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
294 fail('Error: Unrecognized format {!r}'.format(format))
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
295 sys.stderr.write('\n'.join(lines).format(**stats)+'\n')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
296
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
297
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
298 def get_stats_lines_human(stats):
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
299 # Single-stat lines:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
300 lines = [
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
301 'Total reads in input:\t{reads}',
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
302 'Reads trimmed:\t{trimmed}'
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
303 ]
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
304 if 'trimmed1' in stats and 'trimmed2' in stats:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
305 lines.append(' For mate 1:\t{trimmed1}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
306 lines.append(' For mate 2:\t{trimmed2}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
307 lines.append('Reads filtered out:\t{omitted}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
308 if 'omitted1' in stats and 'omitted2' in stats:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
309 lines.append(' For mate 1:\t{omitted1}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
310 lines.append(' For mate 2:\t{omitted2}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
311 # Quantile lines:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
312 quantile_lines = [
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
313 ('Bases trimmed quintiles', 'trim'),
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
314 (' For mate 1', 'trim1'),
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
315 (' For mate 2', 'trim2'),
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
316 ('Bases trimmed quintiles from filtered reads', 'omitted_trim'),
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
317 (' For mate 1', 'omitted_trim1'),
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
318 (' For mate 2', 'omitted_trim2')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
319 ]
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
320 for desc, stat_name in quantile_lines:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
321 if stat_name in stats['quants']:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
322 quants_values = stats['quants'][stat_name]
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
323 if quants_values:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
324 quants_str = ', '.join(map(str, quants_values))
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
325 else:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
326 quants_str = 'N/A'
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
327 line = desc+':\t'+quants_str
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
328 lines.append(line)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
329 return lines
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
330
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
331
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
332 def get_stats_lines_tsv(stats):
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
333 lines = ['{reads}']
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
334 if 'trimmed1' in stats and 'trimmed2' in stats:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
335 lines.append('{trimmed}\t{trimmed1}\t{trimmed2}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
336 else:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
337 lines.append('{trimmed}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
338 if 'omitted1' in stats and 'omitted2' in stats:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
339 lines.append('{omitted}\t{omitted1}\t{omitted2}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
340 else:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
341 lines.append('{omitted}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
342 for stat_name in ('trim', 'trim1', 'trim2', 'omitted_trim', 'omitted_trim1', 'omitted_trim2'):
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
343 if stat_name in stats['quants']:
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
344 quants_values = stats['quants'][stat_name]
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
345 lines.append('\t'.join(map(str, quants_values)))
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
346 return lines
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
347
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
348
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
349 def fail(message):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
350 sys.stderr.write(message+"\n")
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
351 sys.exit(1)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
352
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
353
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
354 if __name__ == '__main__':
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
355 sys.exit(main(sys.argv))