Mercurial > repos > nick > sequence_content_trimmer
comparison trimmer.py @ 1:464aee13e2df draft default tip
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
| author | nick |
|---|---|
| date | Fri, 27 May 2022 23:29:45 +0000 |
| parents | 7f170cb06e2e |
| children |
comparison
equal
deleted
inserted
replaced
| 0:7f170cb06e2e | 1:464aee13e2df |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python3 |
| 2 from __future__ import division | |
| 3 import sys | 2 import sys |
| 4 import argparse | 3 import argparse |
| 4 import collections | |
| 5 import getreads | 5 import getreads |
| 6 | 6 |
| 7 OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'} | 7 QUANT_ORDER = 5 |
| 8 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]" | 8 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]" |
| 9 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of | 9 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of |
| 10 N's.""" | 10 N's.""" |
| 11 | 11 |
| 12 | 12 |
| 13 def main(argv): | 13 def make_argparser(): |
| 14 | |
| 15 parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE) | 14 parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE) |
| 16 parser.set_defaults(**OPT_DEFAULTS) | |
| 17 | |
| 18 parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'), | 15 parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'), |
| 19 default=sys.stdin, | 16 default=sys.stdin, |
| 20 help='Input reads (mate 1). Omit to read from stdin.') | 17 help='Input reads (mate 1). Omit to read from stdin.') |
| 21 parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'), | 18 parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'), |
| 22 help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out ' | 19 help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out ' |
| 23 'entirely, the other will also be lost).') | 20 'entirely, the other will also be lost).') |
| 24 parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'), | 21 parser.add_argument('outfile1', metavar='output_1.fq', nargs='?', type=argparse.FileType('w'), |
| 25 default=sys.stdout, | 22 default=sys.stdout, |
| 26 help='Output file for mate 1. WARNING: Will overwrite.') | 23 help='Output file for mate 1. WARNING: Will overwrite.') |
| 27 parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'), | 24 parser.add_argument('outfile2', metavar='output_2.fq', nargs='?', type=argparse.FileType('w'), |
| 28 help='Output file for mate 2. WARNING: Will overwrite.') | 25 help='Output file for mate 2. WARNING: Will overwrite.') |
| 29 parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'), | 26 parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'), |
| 30 help='Input read format.') | 27 help='Input read format.') |
| 31 parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'), | 28 parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'), |
| 32 help='Output read format. Default: whatever the input format is.') | 29 help='Output read format. Default: whatever the input format is.') |
| 33 parser.add_argument('-b', '--filt-bases', | 30 parser.add_argument('-b', '--filt-bases', default='N', |
| 34 help='The bases to filter on. Case-insensitive. Default: %(default)s.') | 31 help='The bases to filter on. Case-insensitive. Default: %(default)s.') |
| 35 parser.add_argument('-t', '--thres', type=float, | 32 parser.add_argument('-t', '--thres', type=float, default=0.5, |
| 36 help='The threshold. The read will be trimmed once the proportion of filter bases in the ' | 33 help='The threshold. The read will be trimmed once the proportion of filter bases in the ' |
| 37 'window exceed this fraction (not a percentage). Default: %(default)s.') | 34 'window exceed this fraction (not a percentage). Default: %(default)s.') |
| 38 parser.add_argument('-w', '--window', dest='win_len', type=int, | 35 parser.add_argument('-w', '--window', dest='win_len', type=int, default=1, |
| 39 help='Window size for trimming. Default: %(default)s.') | 36 help='Window size for trimming. Default: %(default)s.') |
| 40 parser.add_argument('-i', '--invert', action='store_true', | 37 parser.add_argument('-i', '--invert', action='store_true', |
| 41 help='Invert the filter bases: filter on bases NOT present in the --filt-bases.') | 38 help='Invert the filter bases: filter on bases NOT present in the --filt-bases.') |
| 42 parser.add_argument('-m', '--min-length', type=int, | 39 parser.add_argument('-m', '--min-length', type=int, default=1, |
| 43 help='Set a minimum read length. Reads which are trimmed below this length will be filtered ' | 40 help='Set a minimum read length. Reads which are trimmed below this length will be filtered ' |
| 44 'out (omitted entirely from the output). Read pairs will be preserved: both reads in a ' | 41 'out (omitted entirely from the output). Read pairs will be preserved: both reads in a ' |
| 45 'pair must exceed this length to be kept. Set to 0 to only omit empty reads.') | 42 'pair must exceed this length to be kept. Set to 1 to only omit empty reads. ' |
| 46 parser.add_argument('--error', | 43 'Default: %(default)s.') |
| 47 help='Fail with this error message (useful for Galaxy tool).') | |
| 48 parser.add_argument('-A', '--acgt', action='store_true', | 44 parser.add_argument('-A', '--acgt', action='store_true', |
| 49 help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").') | 45 help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").') |
| 50 parser.add_argument('-I', '--iupac', action='store_true', | 46 parser.add_argument('-I', '--iupac', action='store_true', |
| 51 help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").') | 47 help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").') |
| 52 | 48 parser.add_argument('-q', '--quiet', action='store_true', |
| 49 help='Don\'t print trimming stats on completion.') | |
| 50 parser.add_argument('-T', '--tsv', dest='stats_format', default='human', | |
| 51 action='store_const', const='tsv') | |
| 52 return parser | |
| 53 | |
| 54 | |
| 55 def main(argv): | |
| 56 parser = make_argparser() | |
| 53 args = parser.parse_args(argv[1:]) | 57 args = parser.parse_args(argv[1:]) |
| 54 | |
| 55 if args.error: | |
| 56 fail('Error: '+args.error) | |
| 57 | 58 |
| 58 # Catch invalid argument combinations. | 59 # Catch invalid argument combinations. |
| 59 if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2): | 60 if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2): |
| 60 fail('Error: If giving two input files (paired end), must specify both output files.') | 61 fail('Error: If giving two input files (paired end), must specify both output files.') |
| 61 # Determine filetypes, open input file parsers. | 62 # Determine filetypes, open input file parsers. |
| 83 elif args.iupac: | 84 elif args.iupac: |
| 84 filt_bases = 'ACGTUWSMKRYBDHVN-' | 85 filt_bases = 'ACGTUWSMKRYBDHVN-' |
| 85 invert = True | 86 invert = True |
| 86 | 87 |
| 87 # Do the actual trimming. | 88 # Do the actual trimming. |
| 89 filters = {'win_len':args.win_len, 'thres':args.thres, 'filt_bases':filt_bases, 'invert':invert, | |
| 90 'min_len':args.min_length} | |
| 88 try: | 91 try: |
| 89 trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2, | 92 stats = trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, |
| 90 paired, args.win_len, args.thres, filt_bases, invert, args.min_length) | 93 filetype1, filetype2, paired, filters) |
| 91 finally: | 94 finally: |
| 92 for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2): | 95 for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2): |
| 93 if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout: | 96 if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout: |
| 94 filehandle.close() | 97 filehandle.close() |
| 95 | 98 |
| 99 if not args.quiet: | |
| 100 print_stats(stats, args.stats_format) | |
| 101 | |
| 96 | 102 |
| 97 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired, | 103 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired, |
| 98 win_len, thres, filt_bases, invert, min_length): | 104 filters): |
| 99 """Trim all the reads in the input file(s), writing to the output file(s).""" | 105 """Trim all the reads in the input file(s), writing to the output file(s).""" |
| 106 min_len = filters['min_len'] | |
| 107 trims1 = collections.Counter() | |
| 108 trims2 = collections.Counter() | |
| 109 omitted1 = collections.Counter() | |
| 110 omitted2 = collections.Counter() | |
| 100 read1 = None | 111 read1 = None |
| 101 read2 = None | 112 read2 = None |
| 102 while True: | 113 while True: |
| 103 # Read in the reads. | 114 # Read in the reads. |
| 104 try: | 115 try: |
| 106 if paired: | 117 if paired: |
| 107 read2 = next(file2_parser) | 118 read2 = next(file2_parser) |
| 108 except StopIteration: | 119 except StopIteration: |
| 109 break | 120 break |
| 110 # Do trimming. | 121 # Do trimming. |
| 111 read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert) | 122 read1, trim_len1 = trim_read(read1, filters, filetype1) |
| 112 if filetype1 == 'fastq': | 123 trims1[trim_len1] += 1 |
| 113 # If the output filetype is FASTQ, trim the quality scores too. | |
| 114 # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead. | |
| 115 # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score. | |
| 116 qual1 = read1.qual or 'z' * len(read1.seq) | |
| 117 read1.qual = qual1[:len(read1.seq)] | |
| 118 if paired: | 124 if paired: |
| 119 read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert) | 125 read2, trim_len2 = trim_read(read2, filters, filetype2) |
| 120 if filetype2 == 'fastq': | 126 trims2[trim_len2] += 1 |
| 121 qual2 = read2.qual or 'z' * len(read2.seq) | |
| 122 read2.qual = qual2[:len(read2.seq)] | |
| 123 # Output reads if they both pass the minimum length threshold (if any was given). | 127 # Output reads if they both pass the minimum length threshold (if any was given). |
| 124 if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length): | 128 if min_len is None or (len(read1.seq) >= min_len and len(read2.seq) >= min_len): |
| 125 write_read(outfile1, read1, filetype1) | 129 write_read(outfile1, read1, filetype1) |
| 126 write_read(outfile2, read2, filetype2) | 130 write_read(outfile2, read2, filetype2) |
| 131 else: | |
| 132 if len(read1.seq) < min_len: | |
| 133 omitted1[trim_len1] += 1 | |
| 134 if len(read2.seq) < min_len: | |
| 135 omitted2[trim_len2] += 1 | |
| 127 else: | 136 else: |
| 128 # Output read if it passes the minimum length threshold (if any was given). | 137 # Output read if it passes the minimum length threshold (if any was given). |
| 129 if min_length is None or len(read1.seq) >= min_length: | 138 if min_len is None or len(read1.seq) >= min_len: |
| 130 write_read(outfile1, read1, filetype1) | 139 write_read(outfile1, read1, filetype1) |
| 140 else: | |
| 141 omitted1[trim_len1] += 1 | |
| 142 # Compile stats. | |
| 143 stats = {} | |
| 144 stats['reads'] = sum(trims1.values()) + sum(trims2.values()) | |
| 145 stats['trimmed'] = stats['reads'] - trims1[0] - trims2[0] | |
| 146 stats['omitted'] = sum(omitted1.values()) + sum(omitted2.values()) | |
| 147 if paired: | |
| 148 stats['trimmed1'] = stats['reads']//2 - trims1[0] | |
| 149 stats['trimmed2'] = stats['reads']//2 - trims2[0] | |
| 150 stats['omitted1'] = sum(omitted1.values()) | |
| 151 stats['omitted2'] = sum(omitted2.values()) | |
| 152 # Quintiles for trim lengths. | |
| 153 stats['quants'] = {'order':QUANT_ORDER} | |
| 154 if paired: | |
| 155 stats['quants']['trim1'] = get_counter_quantiles(trims1, order=QUANT_ORDER) | |
| 156 stats['quants']['trim2'] = get_counter_quantiles(trims2, order=QUANT_ORDER) | |
| 157 stats['quants']['trim'] = get_counter_quantiles(trims1 + trims2, order=QUANT_ORDER) | |
| 158 stats['quants']['omitted_trim1'] = get_counter_quantiles(omitted1, order=QUANT_ORDER) | |
| 159 stats['quants']['omitted_trim2'] = get_counter_quantiles(omitted2, order=QUANT_ORDER) | |
| 160 stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1 + omitted2, order=QUANT_ORDER) | |
| 161 else: | |
| 162 stats['quants']['trim'] = get_counter_quantiles(trims1) | |
| 163 stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1) | |
| 164 return stats | |
| 131 | 165 |
| 132 | 166 |
| 133 def get_filetype(infile, filetype_arg): | 167 def get_filetype(infile, filetype_arg): |
| 134 if infile is sys.stdin: | 168 if infile is sys.stdin: |
| 135 if filetype_arg: | 169 if filetype_arg: |
| 156 filehandle.write('>{name}\n{seq}\n'.format(**vars(read))) | 190 filehandle.write('>{name}\n{seq}\n'.format(**vars(read))) |
| 157 elif filetype == 'fastq': | 191 elif filetype == 'fastq': |
| 158 filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read))) | 192 filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read))) |
| 159 | 193 |
| 160 | 194 |
| 161 def trim_read(seq, win_len, thres, filt_bases, invert): | 195 def trim_read(read, filters, filetype): |
| 196 trimmed_seq = trim_seq(read.seq, **filters) | |
| 197 trim_len = len(read.seq) - len(trimmed_seq) | |
| 198 read.seq = trimmed_seq | |
| 199 if filetype == 'fastq': | |
| 200 # If the output filetype is FASTQ, trim the quality scores too. | |
| 201 # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead. | |
| 202 # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score. | |
| 203 qual = read.qual or 'z' * len(read.seq) | |
| 204 read.qual = qual[:len(read.seq)] | |
| 205 return read, trim_len | |
| 206 | |
| 207 | |
| 208 def trim_seq(seq, win_len=1, thres=1.0, filt_bases='N', invert=False, **kwargs): | |
| 162 """Trim an individual read and return its trimmed sequence. | 209 """Trim an individual read and return its trimmed sequence. |
| 163 This will track the frequency of bad bases in a window of length win_len, and trim once the | 210 This will track the frequency of bad bases in a window of length win_len, and trim once the |
| 164 frequency goes below thres. The trim point will be just before the first (leftmost) bad base in | 211 frequency goes below thres. The trim point will be just before the first (leftmost) bad base in |
| 165 the window (the first window with a frequency below thres). The "bad" bases are the ones in | 212 the window (the first window with a frequency below thres). The "bad" bases are the ones in |
| 166 filt_bases if invert is False, or any base NOT in filt_bases if invert is True.""" | 213 filt_bases if invert is False, or any base NOT in filt_bases if invert is True.""" |
| 197 bad_base = first_base in filt_bases | 244 bad_base = first_base in filt_bases |
| 198 # If so, decrement the total and remove its coordinate from the window. | 245 # If so, decrement the total and remove its coordinate from the window. |
| 199 if bad_base: | 246 if bad_base: |
| 200 bad_bases_count -= 1 | 247 bad_bases_count -= 1 |
| 201 bad_bases_coords.pop(0) | 248 bad_bases_coords.pop(0) |
| 202 # print bad_bases_coords | |
| 203 # Are we over the threshold? | 249 # Are we over the threshold? |
| 204 if bad_bases_count > max_bad_bases: | 250 if bad_bases_count > max_bad_bases: |
| 205 break | 251 break |
| 206 # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window. | 252 # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window. |
| 207 if bad_bases_count > max_bad_bases: | 253 if bad_bases_count > max_bad_bases: |
| 209 return seq[0:first_bad_base] | 255 return seq[0:first_bad_base] |
| 210 else: | 256 else: |
| 211 return seq | 257 return seq |
| 212 | 258 |
| 213 | 259 |
| 260 def get_counter_quantiles(counter, order=5): | |
| 261 """Return an arbitrary set of quantiles (including min and max values). | |
| 262 `counter` is a collections.Counter. | |
| 263 `order` is which quantile to perform (4 = quartiles, 5 = quintiles). | |
| 264 Warning: This expects a counter which has counted at least `order` elements. | |
| 265 If it receives a counter with fewer elements, it will simply return `list(counter.elements())`. | |
| 266 This will have fewer than the usual order+1 elements, and may not fit normal expectations of | |
| 267 what "quantiles" should be.""" | |
| 268 quantiles = [] | |
| 269 total = sum(counter.values()) | |
| 270 if total <= order: | |
| 271 return list(counter.elements()) | |
| 272 span_size = total / order | |
| 273 # Sort the items and go through them, looking for the one at the break points. | |
| 274 items = list(sorted(counter.items(), key=lambda i: i[0])) | |
| 275 quantiles.append(items[0][0]) | |
| 276 total_seen = 0 | |
| 277 current_span = 1 | |
| 278 cut_point = int(round(current_span*span_size)) | |
| 279 for item, count in items: | |
| 280 total_seen += count | |
| 281 if total_seen >= cut_point: | |
| 282 quantiles.append(item) | |
| 283 current_span += 1 | |
| 284 cut_point = int(round(current_span*span_size)) | |
| 285 return quantiles | |
| 286 | |
| 287 | |
| 288 def print_stats(stats, format='human'): | |
| 289 if format == 'human': | |
| 290 lines = get_stats_lines_human(stats) | |
| 291 elif format == 'tsv': | |
| 292 lines = get_stats_lines_tsv(stats) | |
| 293 else: | |
| 294 fail('Error: Unrecognized format {!r}'.format(format)) | |
| 295 sys.stderr.write('\n'.join(lines).format(**stats)+'\n') | |
| 296 | |
| 297 | |
| 298 def get_stats_lines_human(stats): | |
| 299 # Single-stat lines: | |
| 300 lines = [ | |
| 301 'Total reads in input:\t{reads}', | |
| 302 'Reads trimmed:\t{trimmed}' | |
| 303 ] | |
| 304 if 'trimmed1' in stats and 'trimmed2' in stats: | |
| 305 lines.append(' For mate 1:\t{trimmed1}') | |
| 306 lines.append(' For mate 2:\t{trimmed2}') | |
| 307 lines.append('Reads filtered out:\t{omitted}') | |
| 308 if 'omitted1' in stats and 'omitted2' in stats: | |
| 309 lines.append(' For mate 1:\t{omitted1}') | |
| 310 lines.append(' For mate 2:\t{omitted2}') | |
| 311 # Quantile lines: | |
| 312 quantile_lines = [ | |
| 313 ('Bases trimmed quintiles', 'trim'), | |
| 314 (' For mate 1', 'trim1'), | |
| 315 (' For mate 2', 'trim2'), | |
| 316 ('Bases trimmed quintiles from filtered reads', 'omitted_trim'), | |
| 317 (' For mate 1', 'omitted_trim1'), | |
| 318 (' For mate 2', 'omitted_trim2') | |
| 319 ] | |
| 320 for desc, stat_name in quantile_lines: | |
| 321 if stat_name in stats['quants']: | |
| 322 quants_values = stats['quants'][stat_name] | |
| 323 if quants_values: | |
| 324 quants_str = ', '.join(map(str, quants_values)) | |
| 325 else: | |
| 326 quants_str = 'N/A' | |
| 327 line = desc+':\t'+quants_str | |
| 328 lines.append(line) | |
| 329 return lines | |
| 330 | |
| 331 | |
| 332 def get_stats_lines_tsv(stats): | |
| 333 lines = ['{reads}'] | |
| 334 if 'trimmed1' in stats and 'trimmed2' in stats: | |
| 335 lines.append('{trimmed}\t{trimmed1}\t{trimmed2}') | |
| 336 else: | |
| 337 lines.append('{trimmed}') | |
| 338 if 'omitted1' in stats and 'omitted2' in stats: | |
| 339 lines.append('{omitted}\t{omitted1}\t{omitted2}') | |
| 340 else: | |
| 341 lines.append('{omitted}') | |
| 342 for stat_name in ('trim', 'trim1', 'trim2', 'omitted_trim', 'omitted_trim1', 'omitted_trim2'): | |
| 343 if stat_name in stats['quants']: | |
| 344 quants_values = stats['quants'][stat_name] | |
| 345 lines.append('\t'.join(map(str, quants_values))) | |
| 346 return lines | |
| 347 | |
| 348 | |
| 214 def fail(message): | 349 def fail(message): |
| 215 sys.stderr.write(message+"\n") | 350 sys.stderr.write(message+"\n") |
| 216 sys.exit(1) | 351 sys.exit(1) |
| 217 | 352 |
| 218 | 353 |
