sequence_content_trimmer: diff of trimmer.py @ 1:464aee13e2df (draft, default, tip)
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
field | value
---|---
author | nick
date | Fri, 27 May 2022 23:29:45 +0000
parents | 7f170cb06e2e
children |
--- a/trimmer.py	Tue Dec 01 21:33:27 2015 -0500
+++ b/trimmer.py	Fri May 27 23:29:45 2022 +0000
@@ -1,59 +1,60 @@
-#!/usr/bin/env python
-from __future__ import division
+#!/usr/bin/env python3
 import sys
 import argparse
+import collections
 import getreads
-OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'}
+QUANT_ORDER = 5
 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]"
 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of N's."""
-def main(argv):
-
+def make_argparser():
   parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
-  parser.set_defaults(**OPT_DEFAULTS)
-
   parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'),
     default=sys.stdin, help='Input reads (mate 1). Omit to read from stdin.')
   parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'),
     help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out '
         'entirely, the other will also be lost).')
-  parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'),
+  parser.add_argument('outfile1', metavar='output_1.fq', nargs='?', type=argparse.FileType('w'),
     default=sys.stdout, help='Output file for mate 1. WARNING: Will overwrite.')
-  parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'),
+  parser.add_argument('outfile2', metavar='output_2.fq', nargs='?', type=argparse.FileType('w'),
    help='Output file for mate 2. WARNING: Will overwrite.')
   parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'),
    help='Input read format.')
   parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'),
    help='Output read format. Default: whatever the input format is.')
-  parser.add_argument('-b', '--filt-bases',
+  parser.add_argument('-b', '--filt-bases', default='N',
    help='The bases to filter on. Case-insensitive. Default: %(default)s.')
-  parser.add_argument('-t', '--thres', type=float,
+  parser.add_argument('-t', '--thres', type=float, default=0.5,
    help='The threshold. The read will be trimmed once the proportion of filter bases in the '
        'window exceed this fraction (not a percentage). Default: %(default)s.')
-  parser.add_argument('-w', '--window', dest='win_len', type=int,
+  parser.add_argument('-w', '--window', dest='win_len', type=int, default=1,
    help='Window size for trimming. Default: %(default)s.')
   parser.add_argument('-i', '--invert', action='store_true',
    help='Invert the filter bases: filter on bases NOT present in the --filt-bases.')
-  parser.add_argument('-m', '--min-length', type=int,
+  parser.add_argument('-m', '--min-length', type=int, default=1,
    help='Set a minimum read length. Reads which are trimmed below this length will be filtered '
        'out (omitted entirely from the output). Read pairs will be preserved: both reads in a '
-       'pair must exceed this length to be kept. Set to 0 to only omit empty reads.')
-  parser.add_argument('--error',
-    help='Fail with this error message (useful for Galaxy tool).')
+       'pair must exceed this length to be kept. Set to 1 to only omit empty reads. '
+       'Default: %(default)s.')
   parser.add_argument('-A', '--acgt', action='store_true',
    help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").')
   parser.add_argument('-I', '--iupac', action='store_true',
    help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").')
-
-  args = parser.parse_args(argv[1:])
+  parser.add_argument('-q', '--quiet', action='store_true',
+    help='Don\'t print trimming stats on completion.')
+  parser.add_argument('-T', '--tsv', dest='stats_format', default='human',
+    action='store_const', const='tsv')
+  return parser
-  if args.error:
-    fail('Error: '+args.error)
+
+
+def main(argv):
+  parser = make_argparser()
+  args = parser.parse_args(argv[1:])
   # Catch invalid argument combinations.
   if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2):
@@ -85,18 +86,28 @@
     invert = True
   # Do the actual trimming.
+  filters = {'win_len':args.win_len, 'thres':args.thres, 'filt_bases':filt_bases, 'invert':invert,
+             'min_len':args.min_length}
   try:
-    trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2,
-               paired, args.win_len, args.thres, filt_bases, invert, args.min_length)
+    stats = trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2,
+                       filetype1, filetype2, paired, filters)
   finally:
     for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2):
       if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout:
         filehandle.close()
+  if not args.quiet:
+    print_stats(stats, args.stats_format)
+
 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired,
-               win_len, thres, filt_bases, invert, min_length):
+               filters):
   """Trim all the reads in the input file(s), writing to the output file(s)."""
+  min_len = filters['min_len']
+  trims1 = collections.Counter()
+  trims2 = collections.Counter()
+  omitted1 = collections.Counter()
+  omitted2 = collections.Counter()
   read1 = None
   read2 = None
   while True:
@@ -108,26 +119,49 @@
       except StopIteration:
         break
     # Do trimming.
-    read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert)
-    if filetype1 == 'fastq':
-      # If the output filetype is FASTQ, trim the quality scores too.
-      # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
-      # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
-      qual1 = read1.qual or 'z' * len(read1.seq)
-      read1.qual = qual1[:len(read1.seq)]
+    read1, trim_len1 = trim_read(read1, filters, filetype1)
+    trims1[trim_len1] += 1
     if paired:
-      read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert)
-      if filetype2 == 'fastq':
-        qual2 = read2.qual or 'z' * len(read2.seq)
-        read2.qual = qual2[:len(read2.seq)]
+      read2, trim_len2 = trim_read(read2, filters, filetype2)
+      trims2[trim_len2] += 1
      # Output reads if they both pass the minimum length threshold (if any was given).
-      if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length):
+      if min_len is None or (len(read1.seq) >= min_len and len(read2.seq) >= min_len):
        write_read(outfile1, read1, filetype1)
        write_read(outfile2, read2, filetype2)
+      else:
+        if len(read1.seq) < min_len:
+          omitted1[trim_len1] += 1
+        if len(read2.seq) < min_len:
+          omitted2[trim_len2] += 1
    else:
      # Output read if it passes the minimum length threshold (if any was given).
-      if min_length is None or len(read1.seq) >= min_length:
+      if min_len is None or len(read1.seq) >= min_len:
        write_read(outfile1, read1, filetype1)
+      else:
+        omitted1[trim_len1] += 1
+  # Compile stats.
+  stats = {}
+  stats['reads'] = sum(trims1.values()) + sum(trims2.values())
+  stats['trimmed'] = stats['reads'] - trims1[0] - trims2[0]
+  stats['omitted'] = sum(omitted1.values()) + sum(omitted2.values())
+  if paired:
+    stats['trimmed1'] = stats['reads']//2 - trims1[0]
+    stats['trimmed2'] = stats['reads']//2 - trims2[0]
+    stats['omitted1'] = sum(omitted1.values())
+    stats['omitted2'] = sum(omitted2.values())
+  # Quintiles for trim lengths.
+  stats['quants'] = {'order':QUANT_ORDER}
+  if paired:
+    stats['quants']['trim1'] = get_counter_quantiles(trims1, order=QUANT_ORDER)
+    stats['quants']['trim2'] = get_counter_quantiles(trims2, order=QUANT_ORDER)
+    stats['quants']['trim'] = get_counter_quantiles(trims1 + trims2, order=QUANT_ORDER)
+    stats['quants']['omitted_trim1'] = get_counter_quantiles(omitted1, order=QUANT_ORDER)
+    stats['quants']['omitted_trim2'] = get_counter_quantiles(omitted2, order=QUANT_ORDER)
+    stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1 + omitted2, order=QUANT_ORDER)
+  else:
+    stats['quants']['trim'] = get_counter_quantiles(trims1)
+    stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1)
+  return stats
 def get_filetype(infile, filetype_arg):
@@ -158,7 +192,20 @@
     filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read)))
-def trim_read(seq, win_len, thres, filt_bases, invert):
+def trim_read(read, filters, filetype):
+  trimmed_seq = trim_seq(read.seq, **filters)
+  trim_len = len(read.seq) - len(trimmed_seq)
+  read.seq = trimmed_seq
+  if filetype == 'fastq':
+    # If the output filetype is FASTQ, trim the quality scores too.
+    # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
+    # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
+    qual = read.qual or 'z' * len(read.seq)
+    read.qual = qual[:len(read.seq)]
+  return read, trim_len
+
+
+def trim_seq(seq, win_len=1, thres=1.0, filt_bases='N', invert=False, **kwargs):
   """Trim an individual read and return its trimmed sequence.
   This will track the frequency of bad bases in a window of length win_len, and trim once the
   frequency goes below thres. The trim point will be just before the first (leftmost) bad base in
@@ -199,7 +246,6 @@
       if bad_base:
         bad_bases_count -= 1
         bad_bases_coords.pop(0)
-        # print bad_bases_coords
     # Are we over the threshold?
     if bad_bases_count > max_bad_bases:
       break
@@ -211,6 +257,95 @@
   return seq
+def get_counter_quantiles(counter, order=5):
+  """Return an arbitrary set of quantiles (including min and max values).
+  `counter` is a collections.Counter.
+  `order` is which quantile to perform (4 = quartiles, 5 = quintiles).
+  Warning: This expects a counter which has counted at least `order` elements.
+  If it receives a counter with fewer elements, it will simply return `list(counter.elements())`.
+  This will have fewer than the usual order+1 elements, and may not fit normal expectations of
+  what "quantiles" should be."""
+  quantiles = []
+  total = sum(counter.values())
+  if total <= order:
+    return list(counter.elements())
+  span_size = total / order
+  # Sort the items and go through them, looking for the one at the break points.
+  items = list(sorted(counter.items(), key=lambda i: i[0]))
+  quantiles.append(items[0][0])
+  total_seen = 0
+  current_span = 1
+  cut_point = int(round(current_span*span_size))
+  for item, count in items:
+    total_seen += count
+    if total_seen >= cut_point:
+      quantiles.append(item)
+      current_span += 1
+      cut_point = int(round(current_span*span_size))
+  return quantiles
+
+
+def print_stats(stats, format='human'):
+  if format == 'human':
+    lines = get_stats_lines_human(stats)
+  elif format == 'tsv':
+    lines = get_stats_lines_tsv(stats)
+  else:
+    fail('Error: Unrecognized format {!r}'.format(format))
+  sys.stderr.write('\n'.join(lines).format(**stats)+'\n')
+
+
+def get_stats_lines_human(stats):
+  # Single-stat lines:
+  lines = [
+    'Total reads in input:\t{reads}',
+    'Reads trimmed:\t{trimmed}'
+  ]
+  if 'trimmed1' in stats and 'trimmed2' in stats:
+    lines.append(' For mate 1:\t{trimmed1}')
+    lines.append(' For mate 2:\t{trimmed2}')
+  lines.append('Reads filtered out:\t{omitted}')
+  if 'omitted1' in stats and 'omitted2' in stats:
+    lines.append(' For mate 1:\t{omitted1}')
+    lines.append(' For mate 2:\t{omitted2}')
+  # Quantile lines:
+  quantile_lines = [
+    ('Bases trimmed quintiles', 'trim'),
+    (' For mate 1', 'trim1'),
+    (' For mate 2', 'trim2'),
+    ('Bases trimmed quintiles from filtered reads', 'omitted_trim'),
+    (' For mate 1', 'omitted_trim1'),
+    (' For mate 2', 'omitted_trim2')
+  ]
+  for desc, stat_name in quantile_lines:
+    if stat_name in stats['quants']:
+      quants_values = stats['quants'][stat_name]
+      if quants_values:
+        quants_str = ', '.join(map(str, quants_values))
+      else:
+        quants_str = 'N/A'
+      line = desc+':\t'+quants_str
+      lines.append(line)
+  return lines
+
+
+def get_stats_lines_tsv(stats):
+  lines = ['{reads}']
+  if 'trimmed1' in stats and 'trimmed2' in stats:
+    lines.append('{trimmed}\t{trimmed1}\t{trimmed2}')
+  else:
+    lines.append('{trimmed}')
+  if 'omitted1' in stats and 'omitted2' in stats:
+    lines.append('{omitted}\t{omitted1}\t{omitted2}')
+  else:
+    lines.append('{omitted}')
+  for stat_name in ('trim', 'trim1', 'trim2', 'omitted_trim', 'omitted_trim1', 'omitted_trim2'):
+    if stat_name in stats['quants']:
+      quants_values = stats['quants'][stat_name]
+      lines.append('\t'.join(map(str, quants_values)))
+  return lines
+
+
 def fail(message):
   sys.stderr.write(message+"\n")
   sys.exit(1)
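
For readers skimming the diff, here is a minimal, hypothetical sketch of how the refactored trimming path added in this revision can be exercised directly. It assumes trimmer.py from this changeset (and its getreads dependency) is importable; the sequence and filter values are invented for illustration. Because trim_seq() ends its signature with **kwargs, the same filters dict that main() builds (including the min_len key trim_seq() ignores) can be passed straight through, which is what trim_read() does. The command-line form of a paired-end run, per the USAGE string, is: trimmer.py [options] input_1.fq input_2.fq output_1.fq output_2.fq

# Hypothetical usage sketch -- not part of the changeset.
import trimmer  # assumes trimmer.py (this revision) and getreads are on the import path

# The same keys main() packs into its `filters` dict; trim_seq() ignores
# 'min_len' because its signature ends in **kwargs.
filters = {'win_len': 5, 'thres': 0.5, 'filt_bases': 'N', 'invert': False, 'min_len': 10}

seq = 'ACGTGATTACAGGGNCANNTNNNNN'  # made-up read with an N-rich end
trimmed = trimmer.trim_seq(seq, **filters)
trim_len = len(seq) - len(trimmed)  # the per-read number that trim_reads() now tallies
print('kept {} bases, trimmed {}'.format(len(trimmed), trim_len))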
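
A second hypothetical sketch covers the new stats path: trim_reads() now tallies per-read trim lengths in collections.Counter objects and summarizes each distribution with get_counter_quantiles(), whose docstring says the result includes the min and max values; get_stats_lines_human() then joins those values with ', '. Only the trimmer.get_counter_quantiles and trimmer.QUANT_ORDER names come from the diff; the counter contents below are made up.

# Hypothetical usage sketch -- not part of the changeset.
import collections
import trimmer  # assumes trimmer.py (this revision) and getreads are importable

# Made-up distribution of mate-1 trim lengths: {bases trimmed: number of reads}.
trims1 = collections.Counter({0: 40, 2: 10, 5: 6, 12: 4})

# QUANT_ORDER is 5 in this revision, so this reports quintile break points of the distribution.
quints = trimmer.get_counter_quantiles(trims1, order=trimmer.QUANT_ORDER)
print('Bases trimmed quintiles:', ', '.join(map(str, quints)))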