Previous changeset 0:7f170cb06e2e (2015-12-01) |
Commit message:
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty" |
modified:
getreads.py trimmer.py trimmer.xml |
b |
diff -r 7f170cb06e2e -r 464aee13e2df getreads.py --- a/getreads.py Tue Dec 01 21:33:27 2015 -0500 +++ b/getreads.py Fri May 27 23:29:45 2022 +0000 |
[ |
b'@@ -1,3 +1,9 @@\n+#!/usr/bin/env python3\n+import argparse\n+import logging\n+import os\n+import sys\n+import types\n """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and\n sequence.\n All format parsers follow this API:\n@@ -12,18 +18,42 @@\n qual: The quality scores (unless the format is FASTA).\n """\n \n+# Available formats.\n+FORMATS = (\'fasta\', \'fastq\', \'sam\', \'tsv\', \'lines\')\n \n-def getparser(filehandle, filetype=\'fasta\'):\n+QUAL_OFFSETS = {\'sanger\':33, \'solexa\':64}\n+\n+\n+def getparser(input, filetype, qual_format=\'sanger\', name_col=1, seq_col=2, qual_col=3):\n+ # Detect whether the input is an open file or a path.\n+ # Return the appropriate reader.\n if filetype == \'fasta\':\n- return FastaReader(filehandle)\n+ return FastaReader(input)\n elif filetype == \'fastq\':\n- return FastqReader(filehandle)\n+ return FastqReader(input, qual_format=qual_format)\n elif filetype == \'sam\':\n- return SamReader(filehandle)\n+ return SamReader(input, qual_format=qual_format)\n elif filetype == \'tsv\':\n- return TsvReader(filehandle)\n+ return TsvReader(input, qual_format=qual_format,\n+ name_col=name_col, seq_col=seq_col, qual_col=qual_col)\n+ elif filetype == \'lines\':\n+ return LineReader(input)\n else:\n- raise ValueError(\'Illegal argument: filetype=\\\'\'+filetype+\'\\\'\')\n+ raise ValueError(\'Unrecognized format: {!r}\'.format(filetype))\n+\n+\n+def detect_input_type(obj):\n+ """Is this an open filehandle, or is it a file path (string)?"""\n+ try:\n+ os.path.isfile(obj)\n+ return \'path\'\n+ except TypeError:\n+ if isinstance(obj, types.GeneratorType):\n+ return \'generator\'\n+ elif hasattr(obj, \'read\') and hasattr(obj, \'close\'):\n+ return \'file\'\n+ else:\n+ return None\n \n \n class FormatError(Exception):\n@@ -33,19 +63,79 @@\n \n \n class Read(object):\n- def __init__(self, name=\'\', seq=\'\', id_=\'\', qual=\'\'):\n+ def __init__(self, name=\'\', seq=\'\', id_=\'\', qual=\'\', qual_format=\'sanger\'):\n self.name = name\n self.seq = seq\n- self.id = id_\n self.qual = qual\n+ if id_ or not self.name:\n+ self.id = id_\n+ elif self.name:\n+ self.id = self.name.split()[0]\n+ self.offset = QUAL_OFFSETS[qual_format]\n+ @property\n+ def scores(self):\n+ if self.qual is None:\n+ return None\n+ scores = []\n+ for qual_char in self.qual:\n+ scores.append(ord(qual_char) - self.offset)\n+ return scores\n+ def to_fasta(self):\n+ return f\'>{self.name}\\n{self.seq}\'\n+ def to_fastq(self):\n+ return f\'@{self.name}\\n{self.seq}\\n+\\n{self.qual}\'\n+ def __str__(self):\n+ if self.qual:\n+ return self.to_fastq()\n+ else:\n+ return self.to_fasta()\n+ def __repr__(self):\n+ kwarg_strs = []\n+ for kwarg in \'name\', \'seq\', \'id_\', \'qual\':\n+ attr = kwarg.rstrip(\'_\')\n+ raw_value = getattr(self, attr)\n+ if raw_value is not None and len(raw_value) >= 200:\n+ value = raw_value[:199]+\'\xe2\x80\xa6\'\n+ else:\n+ value = raw_value\n+ if value:\n+ kwarg_strs.append(f\'{kwarg}={value!r}\')\n+ return type(self).__name__+\'(\'+\', \'.join(kwarg_strs)+\')\'\n \n \n class Reader(object):\n """Base class for all other parsers."""\n- def __init__(self, filehandle):\n- self.filehandle = filehandle\n+ def __init__(self, input, **kwargs):\n+ self.input = input\n+ self.input_type = detect_input_type(input)\n+ if self.input_type not in (\'path\', \'file\', \'generator\'):\n+ raise ValueError(\'Input object {!r} not a file, string, or generator.\'.format(input))\n+ for key, value in kwargs.items():\n+ setattr(self, key, value)\n def __iter__(self):\n return self.parser()\n+ def bases(self):\n+ for read in self.parser():\n+ for base in read.seq:\n+ yield base\n+ def get_input_iterator(self):\n+ if self.input_type == \'path\':\n+ return open(self.input)\n+ else:\n+ return self.input\n+\n+\n+class LineReader(Reader):\n+ """A parser for the simplest format: Only the sequence, one line per read."""\n+ def p'..b' continue\n- if state == \'header\':\n- if not line.startswith(\'@\'):\n- raise FormatError(\'line state = "header" but line does not start with "@"\')\n- if read.seq:\n- yield read\n- read = Read()\n- read.name = line[1:] # remove \'@\'\n- if read.name:\n- read.id = read.name.split()[0]\n- state = \'sequence\'\n- elif state == \'sequence\':\n- if line.startswith(\'+\'):\n- state = \'plus\'\n- else:\n- read.seq += line\n- elif state == \'plus\' or state == \'quality\':\n- state = \'quality\'\n- togo = len(read.seq) - len(read.qual)\n- read.qual += line[:togo]\n- # The end of the quality lines is when we have a quality string as long as the sequence.\n- if len(read.qual) >= len(read.seq):\n- state = \'header\'\n+ input_iterator = self.get_input_iterator()\n+ try:\n+ read = None\n+ line_num = 0\n+ state = \'header\'\n+ while True:\n+ try:\n+ line_raw = next(input_iterator)\n+ except StopIteration:\n+ if read is not None:\n+ yield read\n+ return\n+ line_num += 1\n+ line = line_raw.rstrip(\'\\r\\n\')\n+ if state == \'header\':\n+ if not line.startswith(\'@\'):\n+ if line:\n+ raise FormatError(\'line state = "header" but line does not start with "@":\\n\'+line)\n+ else:\n+ # Allow empty lines.\n+ continue\n+ if read is not None:\n+ yield read\n+ read = Read(qual_format=self.qual_format)\n+ read.name = line[1:] # remove \'@\'\n+ if read.name:\n+ read.id = read.name.split()[0]\n+ state = \'sequence\'\n+ elif state == \'sequence\':\n+ if line.startswith(\'+\'):\n+ state = \'plus\'\n+ else:\n+ read.seq += line\n+ elif state == \'plus\' or state == \'quality\':\n+ if line.startswith(\'@\') and state == \'quality\':\n+ logging.warning(\'Looking for more quality scores but line starts with "@". This might \'\n+ \'be a header line and there were fewer quality scores than bases: {}\'\n+ .format(line[:69]))\n+ state = \'quality\'\n+ togo = len(read.seq) - len(read.qual)\n+ read.qual += line[:togo]\n+ # The end of the quality lines is when we have a quality string as long as the sequence.\n+ if len(read.qual) >= len(read.seq):\n+ state = \'header\'\n+ finally:\n+ if self.input_type == \'path\':\n+ input_iterator.close()\n+\n+\n+DESCRIPTION = \'Test parser by parsing an input file and printing its contents.\'\n+\n+\n+def make_argparser():\n+ parser = argparse.ArgumentParser(description=DESCRIPTION)\n+ parser.add_argument(\'infile\', nargs=\'?\', type=argparse.FileType(\'r\'), default=sys.stdin,\n+ help=\'Input reads.\')\n+ parser.add_argument(\'-f\', \'--format\', choices=(\'fasta\', \'fastq\', \'sam\', \'tsv\', \'lines\'),\n+ help=\'Input read format. Will be detected from the filename, if given.\')\n+ return parser\n+\n+\n+def main(argv):\n+ parser = make_argparser()\n+ args = parser.parse_args(argv[1:])\n+ if args.format:\n+ format = args.format\n+ elif args.infile is sys.stdin:\n+ fail(\'Error: Must give a --format if reading from stdin.\')\n+ else:\n+ ext = os.path.splitext(args.infile.name)[1]\n+ if ext == \'.fq\':\n+ format = \'fastq\'\n+ elif ext == \'.fa\':\n+ format = \'fasta\'\n+ elif ext == \'.txt\':\n+ format = \'lines\'\n+ else:\n+ format = ext[1:]\n+ print(\'Reading input as format {!r}.\'.format(format))\n+ for i, read in enumerate(getparser(args.infile, filetype=format)):\n+ print(\'Read {} id/name: {!r}/{!r}\'.format(i+1, read.id, read.name))\n+ print(\'Read {} seq: {!r}\'.format(i+1, read.seq))\n+ print(\'Read {} qual: {!r}\'.format(i+1, read.qual))\n+\n+\n+def fail(message):\n+ sys.stderr.write(message+"\\n")\n+ sys.exit(1)\n+\n+\n+if __name__ == \'__main__\':\n+ sys.exit(main(sys.argv))\n' |
b |
diff -r 7f170cb06e2e -r 464aee13e2df trimmer.py --- a/trimmer.py Tue Dec 01 21:33:27 2015 -0500 +++ b/trimmer.py Fri May 27 23:29:45 2022 +0000 |
[ |
b'@@ -1,59 +1,60 @@\n-#!/usr/bin/env python\n-from __future__ import division\n+#!/usr/bin/env python3\n import sys\n import argparse\n+import collections\n import getreads\n \n-OPT_DEFAULTS = {\'win_len\':1, \'thres\':1.0, \'filt_bases\':\'N\'}\n+QUANT_ORDER = 5\n USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]"\n DESCRIPTION = """Trim the 5\' ends of reads by sequence content, e.g. by GC content or presence of\n N\'s."""\n \n \n-def main(argv):\n-\n+def make_argparser():\n parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)\n- parser.set_defaults(**OPT_DEFAULTS)\n-\n parser.add_argument(\'infile1\', metavar=\'reads_1.fq\', nargs=\'?\', type=argparse.FileType(\'r\'),\n default=sys.stdin,\n help=\'Input reads (mate 1). Omit to read from stdin.\')\n parser.add_argument(\'infile2\', metavar=\'reads_2.fq\', nargs=\'?\', type=argparse.FileType(\'r\'),\n help=\'Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out \'\n \'entirely, the other will also be lost).\')\n- parser.add_argument(\'outfile1\', metavar=\'reads.filt_1.fq\', nargs=\'?\', type=argparse.FileType(\'w\'),\n+ parser.add_argument(\'outfile1\', metavar=\'output_1.fq\', nargs=\'?\', type=argparse.FileType(\'w\'),\n default=sys.stdout,\n help=\'Output file for mate 1. WARNING: Will overwrite.\')\n- parser.add_argument(\'outfile2\', metavar=\'reads.filt_2.fq\', nargs=\'?\', type=argparse.FileType(\'w\'),\n+ parser.add_argument(\'outfile2\', metavar=\'output_2.fq\', nargs=\'?\', type=argparse.FileType(\'w\'),\n help=\'Output file for mate 2. WARNING: Will overwrite.\')\n parser.add_argument(\'-f\', \'--format\', dest=\'filetype\', choices=(\'fasta\', \'fastq\'),\n help=\'Input read format.\')\n parser.add_argument(\'-F\', \'--out-format\', dest=\'out_filetype\', choices=(\'fasta\', \'fastq\'),\n help=\'Output read format. Default: whatever the input format is.\')\n- parser.add_argument(\'-b\', \'--filt-bases\',\n+ parser.add_argument(\'-b\', \'--filt-bases\', default=\'N\',\n help=\'The bases to filter on. Case-insensitive. Default: %(default)s.\')\n- parser.add_argument(\'-t\', \'--thres\', type=float,\n+ parser.add_argument(\'-t\', \'--thres\', type=float, default=0.5,\n help=\'The threshold. The read will be trimmed once the proportion of filter bases in the \'\n \'window exceed this fraction (not a percentage). Default: %(default)s.\')\n- parser.add_argument(\'-w\', \'--window\', dest=\'win_len\', type=int,\n+ parser.add_argument(\'-w\', \'--window\', dest=\'win_len\', type=int, default=1,\n help=\'Window size for trimming. Default: %(default)s.\')\n parser.add_argument(\'-i\', \'--invert\', action=\'store_true\',\n help=\'Invert the filter bases: filter on bases NOT present in the --filt-bases.\')\n- parser.add_argument(\'-m\', \'--min-length\', type=int,\n+ parser.add_argument(\'-m\', \'--min-length\', type=int, default=1,\n help=\'Set a minimum read length. Reads which are trimmed below this length will be filtered \'\n \'out (omitted entirely from the output). Read pairs will be preserved: both reads in a \'\n- \'pair must exceed this length to be kept. Set to 0 to only omit empty reads.\')\n- parser.add_argument(\'--error\',\n- help=\'Fail with this error message (useful for Galaxy tool).\')\n+ \'pair must exceed this length to be kept. Set to 1 to only omit empty reads. \'\n+ \'Default: %(default)s.\')\n parser.add_argument(\'-A\', \'--acgt\', action=\'store_true\',\n help=\'Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").\')\n parser.add_argument(\'-I\', \'--iupac\', action=\'store_true\',\n help=\'Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").\')\n-\n- args = parser.parse_args(argv[1:])\n+ parser.add_argument(\'-q\', \'--quiet\', action=\'store_true\',\n+ help=\'Don\\\'t print trimming stats on completion.\')\n+ parser.add_argument(\'-T\', \'--tsv\', dest=\'stats_format\', default=\'human\',\n+ action=\'store_const\', const=\'tsv\')\n+ return parser\n \n- if args.error:\n- fail(\'Error: \'+args.error)\n+\n+def main(arg'..b'eal score.\n+ qual = read.qual or \'z\' * len(read.seq)\n+ read.qual = qual[:len(read.seq)]\n+ return read, trim_len\n+\n+\n+def trim_seq(seq, win_len=1, thres=1.0, filt_bases=\'N\', invert=False, **kwargs):\n """Trim an individual read and return its trimmed sequence.\n This will track the frequency of bad bases in a window of length win_len, and trim once the\n frequency goes below thres. The trim point will be just before the first (leftmost) bad base in\n@@ -199,7 +246,6 @@\n if bad_base:\n bad_bases_count -= 1\n bad_bases_coords.pop(0)\n- # print bad_bases_coords\n # Are we over the threshold?\n if bad_bases_count > max_bad_bases:\n break\n@@ -211,6 +257,95 @@\n return seq\n \n \n+def get_counter_quantiles(counter, order=5):\n+ """Return an arbitrary set of quantiles (including min and max values).\n+ `counter` is a collections.Counter.\n+ `order` is which quantile to perform (4 = quartiles, 5 = quintiles).\n+ Warning: This expects a counter which has counted at least `order` elements.\n+ If it receives a counter with fewer elements, it will simply return `list(counter.elements())`.\n+ This will have fewer than the usual order+1 elements, and may not fit normal expectations of\n+ what "quantiles" should be."""\n+ quantiles = []\n+ total = sum(counter.values())\n+ if total <= order:\n+ return list(counter.elements())\n+ span_size = total / order\n+ # Sort the items and go through them, looking for the one at the break points.\n+ items = list(sorted(counter.items(), key=lambda i: i[0]))\n+ quantiles.append(items[0][0])\n+ total_seen = 0\n+ current_span = 1\n+ cut_point = int(round(current_span*span_size))\n+ for item, count in items:\n+ total_seen += count\n+ if total_seen >= cut_point:\n+ quantiles.append(item)\n+ current_span += 1\n+ cut_point = int(round(current_span*span_size))\n+ return quantiles\n+\n+\n+def print_stats(stats, format=\'human\'):\n+ if format == \'human\':\n+ lines = get_stats_lines_human(stats)\n+ elif format == \'tsv\':\n+ lines = get_stats_lines_tsv(stats)\n+ else:\n+ fail(\'Error: Unrecognized format {!r}\'.format(format))\n+ sys.stderr.write(\'\\n\'.join(lines).format(**stats)+\'\\n\')\n+\n+\n+def get_stats_lines_human(stats):\n+ # Single-stat lines:\n+ lines = [\n+ \'Total reads in input:\\t{reads}\',\n+ \'Reads trimmed:\\t{trimmed}\'\n+ ]\n+ if \'trimmed1\' in stats and \'trimmed2\' in stats:\n+ lines.append(\' For mate 1:\\t{trimmed1}\')\n+ lines.append(\' For mate 2:\\t{trimmed2}\')\n+ lines.append(\'Reads filtered out:\\t{omitted}\')\n+ if \'omitted1\' in stats and \'omitted2\' in stats:\n+ lines.append(\' For mate 1:\\t{omitted1}\')\n+ lines.append(\' For mate 2:\\t{omitted2}\')\n+ # Quantile lines:\n+ quantile_lines = [\n+ (\'Bases trimmed quintiles\', \'trim\'),\n+ (\' For mate 1\', \'trim1\'),\n+ (\' For mate 2\', \'trim2\'),\n+ (\'Bases trimmed quintiles from filtered reads\', \'omitted_trim\'),\n+ (\' For mate 1\', \'omitted_trim1\'),\n+ (\' For mate 2\', \'omitted_trim2\')\n+ ]\n+ for desc, stat_name in quantile_lines:\n+ if stat_name in stats[\'quants\']:\n+ quants_values = stats[\'quants\'][stat_name]\n+ if quants_values:\n+ quants_str = \', \'.join(map(str, quants_values))\n+ else:\n+ quants_str = \'N/A\'\n+ line = desc+\':\\t\'+quants_str\n+ lines.append(line)\n+ return lines\n+\n+\n+def get_stats_lines_tsv(stats):\n+ lines = [\'{reads}\']\n+ if \'trimmed1\' in stats and \'trimmed2\' in stats:\n+ lines.append(\'{trimmed}\\t{trimmed1}\\t{trimmed2}\')\n+ else:\n+ lines.append(\'{trimmed}\')\n+ if \'omitted1\' in stats and \'omitted2\' in stats:\n+ lines.append(\'{omitted}\\t{omitted1}\\t{omitted2}\')\n+ else:\n+ lines.append(\'{omitted}\')\n+ for stat_name in (\'trim\', \'trim1\', \'trim2\', \'omitted_trim\', \'omitted_trim1\', \'omitted_trim2\'):\n+ if stat_name in stats[\'quants\']:\n+ quants_values = stats[\'quants\'][stat_name]\n+ lines.append(\'\\t\'.join(map(str, quants_values)))\n+ return lines\n+\n+\n def fail(message):\n sys.stderr.write(message+"\\n")\n sys.exit(1)\n' |
b |
diff -r 7f170cb06e2e -r 464aee13e2df trimmer.xml --- a/trimmer.xml Tue Dec 01 21:33:27 2015 -0500 +++ b/trimmer.xml Fri May 27 23:29:45 2022 +0000 |
[ |
@@ -1,27 +1,31 @@ -<tool id="sequence_content_trimmer" version="0.1" name="Sequence Content Trimmer"> +<?xml version="1.0"?> +<tool id="sequence_content_trimmer" version="0.2.3" name="Sequence Content Trimmer"> <description>trim reads based on certain bases</description> - <command interpreter="python"> - trimmer.py $input1 - #if $paired.is_paired: - $input2 $output1 $output2 - #if ('fasta' in $input1.extension and 'fastq' in $input2.extension) or ('fastq' in $input1.extension and 'fasta' in $input2.extension) - --error 'Both input files must be either fastq or fasta (no mixing the two).' + <command detect_errors="exit_code"><![CDATA[ + #if $paired.is_paired and (('fasta' in $input1.extension and 'fastq' in $input2.extension) or \ + ('fastq' in $input1.extension and 'fasta' in $input2.extension)) + echo 'Both input files must be either fastq or fasta (no mixing the two).' >&2 + #else + python '$__tool_directory__/trimmer.py' '$input1' + #if $paired.is_paired: + '$input2' '$output1' '$output2' + #end if + #if $input1.extension in ('fastq', 'fastqsanger', 'fastqillumina', 'fastqsolexa') + -f fastq + #elif $input1.extension == 'fasta' + -f fasta + #else + -f '$input1.extension' + #end if + -b '$bases' -t '$thres' -w '$win_len' $invert + #if $min_len.has_min_len: + -m '$min_len.value' + #end if + #if not $paired.is_paired: + > '$output1' #end if #end if - #if $input1.extension == 'fastq' or $input1.extension == 'fastqsanger' or $input1.extension == 'fastqillumina' or $input1.extension == 'fastqsolexa' - -f fastq - #elif $input1.extension == 'fasta' - -f fasta - #else - -f $input1.extension - #end if - -b $bases -t $thres -w $win_len $invert - #if $min_len.has_min_len: - -m $min_len.value - #end if - #if not $paired.is_paired: - > $output1 - #end if + ]]> </command> <inputs> <conditional name="paired"> @@ -49,8 +53,8 @@ </conditional> </inputs> <outputs> - <data name="output1" format_source="input1"/> - <data name="output2" format_source="input2"> + <data name="output1" format_source="input1" label="$tool.name on $on_string"/> + <data name="output2" format_source="input2" label="$tool.name on $on_string (mate 2)"> <filter>paired['is_paired']</filter> </data> </outputs> @@ -68,7 +72,7 @@ **How it works** -This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, when it will trim once non-filter bases exceed the threshold). +This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, in which case it will trim once non-filter bases exceed the threshold). The trim point will be just before the first (leftmost) filter base in the final window (the one where the frequency exceeded the threshold). |