changeset 5:1dada50cca8a

Support for cutadapt 0.9.5, added quality trimming and additional output options
author Lance Parsons <lparsons@princeton.edu>
date Fri, 22 Jul 2011 11:03:00 -0400
parents 0a872e59164c
children 98d05121d41e
files README cutadapt.xml cutadapt_galaxy_wrapper.py discard_stderr_wrapper.sh test-data/cutadapt_discard.out test-data/cutadapt_rest.fa test-data/cutadapt_rest.out test-data/cutadapt_rest2.out test-data/cutadapt_small.fastq test-data/cutadapt_small.out
diffstat 10 files changed, 141 insertions(+), 198 deletions(-) [+]
line wrap: on
line diff
--- a/README	Wed May 25 19:33:40 2011 -0400
+++ b/README	Fri Jul 22 11:03:00 2011 -0400
@@ -4,16 +4,21 @@
 ------------
 
 1 - Install the cutadapt package and make sure it is in path for Galaxy
-2 - Copy cutadapt.xml and cutadapt_galaxy_wrapper.py to $GALAXY_HOME/tools/cutadapt
+2 - Copy cutadapt.xml to $GALAXY_HOME/tools/cutadapt
 3 - Add the tool to the $GALAXY_HOME/tool_conf.xml tool-registry file
 
+Optional steps to set up and run Galaxy functional tests
+
+4 - Copy test-data/* to $GALAXY_HOME/test-data/
+5 - Set GALAXY_TEST_TOOL_CONF environment variable to a tool_conf.xml file that
+    contains the tools you want to test. (e.g. 'tool_conf.xml')
+6 - $GALAXY_HOME/run_functional_tests.sh -id cutadapt 
+    See the Galaxy Wiki for more information: http://wiki.g2.bx.psu.edu/
+
 
 Limitations
 -----------
 
-Colorspace data is not implemented
-Discard trimmed reads is not implemented (broken in cutadapt 0.9.3)
-Storing of "rest fo read" after the adapter (-r), too-short reads (--too-short-output), and untrimmed-reads (--untreimmed-output) are not implemented
-Quality cutoff (-q) not implemented
+Colorspace data support is not implemented
 Prefix and Suffix to read names not implemented
 Length-tag addition to read name not implemented
--- a/cutadapt.xml	Wed May 25 19:33:40 2011 -0400
+++ b/cutadapt.xml	Fri Jul 22 11:03:00 2011 -0400
@@ -1,35 +1,47 @@
-<tool id="cutadapt" name="Remove adapter sequences" version="0.9.4">
-	<description>from high-throughput sequence data</description>
+<tool id="cutadapt" name="Cutadapt" version="0.9.5.a">
+	<description>Remove adapter sequences from Fastq/Fasta</description>
 	<requirements>
 		<requirement type="python-module">cutadapt</requirement>
 	</requirements>
 
-	<command interpreter="sh">discard_stderr_wrapper.sh cutadapt
+	<command>cutadapt
 		#if $input.extension.startswith( "fastq"):
 		--format=fastq
 		#else
 		--format=$input.extension
 		#end if 
 		#for $a in $adapters
-		-a '${a.adapter_source.adapter}'
+		--adapter='${a.adapter_source.adapter}'
 		#end for
 		#for $aa in $anywhere_adapters
-		-b '${aa.anywhere_adapter_source.anywhere_adapter}'
+		--anywhere='${aa.anywhere_adapter_source.anywhere_adapter}'
 		#end for
-		-e $error_rate
-		-n $count
-		-O $overlap
+		--error-rate=$error_rate
+		--times=$count
+		--overlap=$overlap
 		#if str($min) != '0':
-		-m $min
+		--minimum-length=$min
 		#end if
 		#if str($max) != '0':
-		-M $max
+		--maximum-length=$max
+		#end if
+		#if str($quality_cutoff) != '0':
+		--quality-cutoff=$quality_cutoff
 		#end if
-		#if $discard:
-		--discard
+		$discard
+		--output='$output' 
+		#if str( $output_params.output_type ) == "additional":
+			#if $output_params.rest_file:
+			--rest-file='$rest_output'
+			#end if
+			#if $output_params.too_short_file:
+			--too-short-output='$too_short_output'
+			#end if
+			#if $output_params.untrimmed_file:
+			--untrimmed-output='$untrimmed_output'
+			#end if
 		#end if
 		'$input'
-		--output='$output' 
 		> $report
 	</command>
 	<inputs>
@@ -81,19 +93,70 @@
 		<param name="error_rate" type="float" min="0" max="1" value="0.1" label="Maximum error rate" help="Maximum allowed error rate (no. of errors divided by the length of the matching region)." />
 		<param name="count" type="integer" min="1" value="1" label="Match times" help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times." />
 		<param name="overlap" type="integer" min="1" value="3" label="Minimum overlap length" help="Minimum overlap length. If the overlap between the adapter and the sequence is shorter than LENGTH, the read is not modified." />
-		<param name="discard" type="boolean" checked="False" default="False" label="Discard Trimmed Reads" help="Discard reads that contain the adapter instead of trimming them. Use the 'Minimum overlap length' option in order to avoid throwing away too many randomly matching reads!" />
+		<param name="discard" type="boolean" value="false" truevalue="--discard" falsevalue="" label="Discard Trimmed Reads" help="Discard reads that contain the adapter instead of trimming them. Use the 'Minimum overlap length' option in order to avoid throwing away too many randomly matching reads!" />
 		<param name="min" type="integer" min="0" optional="true" value="0" label="Minimum length" help="Discard trimmed reads that are shorter than LENGTH.  Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no minimum length." />
 		<param name="max" type="integer" min="0" optional="true" value="0" label="Maximum length" help="Discard trimmed reads that are longer than LENGTH.  Reads that are too long even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no maximum length." />
+		<param name="quality_cutoff" type="integer" min="0" optional="true" value="0" label="Quality cutoff" help="Trim low-quality ends from reads before adapter removal. The algorithm is the same as the one used by BWA (Subtract CUTOFF from all qualities; compute partial sums from all indices to the end of the sequence; cut sequence at the index at which the sum is minimal). Value of 0 means no quality trimming." />
+	        <conditional name="output_params">
+			<param name="output_type" type="select" label="Additional output options" help="By default all reads will be put in the same file.  However, reads with adapters matching in the middle, unmatched reads, and too-short reads can be saved in separate files.">
+				<option value="default">Default</option>
+				<option value="additional">Additional output files</option>
+			</param>
+			<when value="default" />
+			<when value="additional">
+				<param name="rest_file" type="boolean" value="false" label="Rest of Read" help="When the adapter matches in the middle of a read, write the rest (after the adapter) into a file."/>
+				<param name="too_short_file" type="boolean" value="false" label="Too Short Reads" help="Write reads that are too short (according to minimum length specified) to a file. (default: discard reads)"/>
+				<param name="untrimmed_file" type="boolean" value="false" label="Untrimmed Reads" help="Write reads that do not contain the adapter to a separate file, instead of writing them to the regular output file.  (default: output to same file as trimmed)"/>
+			</when>
+		</conditional>
 	</inputs>
 	<outputs>
 		<data format="txt" name="report" label="${tool.name} on ${on_string} (Report)" />
 		<data format="input" name="output" metadata_source="input"/>
+		<data format="input" name="rest_output" metadata_source="input" label="${tool.name} on ${on_string} (Rest of Reads)" >
+			<filter>(output_params['output_type'] == "additional")</filter>
+			<filter>(output_params['rest_file'] is True)</filter>
+		</data>
+		<data format="input" name="too_short_output" metadata_source="input" label="${tool.name} on ${on_string} (Too Short Reads)" >
+			<filter>(output_params['output_type'] == "additional")</filter>
+			<filter>(output_params['too_short_file'] is True)</filter>
+		</data>
+		<data format="input" name="untrimmed_output" metadata_source="input" label="${tool.name} on ${on_string} (Untrimmed Reads)" >
+			<filter>(output_params['output_type'] == "additional")</filter>
+			<filter>(output_params['untrimmed_file'] is True)</filter>
+		</data>
 	</outputs>
 
 	<tests>
 		<test>
-			<param name="input" value="fa_gc_content_input.fa"/>
-			<output name="out_file1" file="fa_gc_content_output.txt"/>
+			<param name="input" value="cutadapt_small.fastq" ftype="fastqsanger"/>
+			<param name="adapter_source_list" value="user"/>
+			<param name="adapter" value=""/>
+			<param name="anywhere_adapter_source_list" value="user"/>
+			<param name="anywhere_adapter" value="TTAGACATATCTCCGTCG"/>
+			<param name="output_type" value="default"/>
+			<output name="output" file="cutadapt_small.out"/>
+		</test>
+		<test>
+			<param name="input" value="cutadapt_small.fastq" ftype="fastqsanger"/>
+			<param name="adapter_source_list" value="user"/>
+			<param name="adapter" value="TTAGACATATCTCCGTCG"/>
+			<param name="anywhere_adapter_source_list" value="user"/>
+			<param name="anywhere_adapter" value=""/>
+			<param name="discard" value="true"/>
+			<param name="output_type" value="default"/>
+			<output name="output" file="cutadapt_discard.out"/>
+		</test>
+		<test>
+			<param name="input" value="cutadapt_rest.fa" ftype="fasta"/>
+			<param name="adapter_source_list" value="user"/>
+			<param name="adapter" value="ADAPTER"/>
+			<param name="anywhere_adapter_source_list" value="user"/>
+			<param name="anywhere_adapter" value=""/>
+			<param name="output_type" value="additional"/>
+			<param name="rest_file" value="true"/>
+			<output name="output" file="cutadapt_rest.out"/>
+			<output name="rest_output" file="cutadapt_rest2.out"/>
 		</test>
 	</tests>
 
--- a/cutadapt_galaxy_wrapper.py	Wed May 25 19:33:40 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,137 +0,0 @@
-#!/usr/bin/env python
-"""
-SYNOPSIS
-
-    cutadapt_galaxy_wrapper.py 
-        -i input_file
-        -o output_file
-        [-f format (fastq/fastq/etc.)]
-        [-a 3' adapter sequence]
-        [-b 3' or 5' anywhere adapter sequence]
-        [-e error_rate]
-        [-n count]
-        [-O overlap_length]
-        [--discard discard trimmed reads]
-        [-m minimum read length]
-        [-M maximum read length]
-        [-q quality cutoff]
-        [-h,--help] [-v,--verbose] [--version]
-
-DESCRIPTION
-
-   Wrapper for cutadapt running as a galaxy tool
-
-AUTHOR
-
-    Lance Parsons <lparsons@princeton.edu>
-
-LICENSE
-
-    This script is in the public domain, free from copyrights or restrictions.
-
-VERSION
-
-    $Id$
-"""
-
-import sys, os, traceback, optparse, shutil, subprocess, tempfile
-import re
-#from pexpect import run, spawn
-
-def stop_err( msg ):
-    sys.stderr.write( '%s\n' % msg )
-    sys.exit()
-
-def main ():
-
-    global options, args
-    # Setup Parameters 
-    params = []
-    if options.adapters != None:
-        params.append("-a %s" % " -a ".join(options.adapters))
-    if options.anywhere_adapters != None:
-        params.append("-b %s" % " -b ".join(options.anywhere_adapters))
-    if options.output_file != None:
-        params.append("-o %s" % options.output_file)
-    if options.error_rate != None:
-        params.append("-e %s" % options.error_rate)
-    if options.count != None:
-        params.append("-n %s" % options.count)
-    if options.overlap_length != None:
-        params.append("-O %s" % options.overlap_length)
-    if options.discard_trimmed:
-        params.append("--discard")
-    if options.minimum_length != None:
-        params.append("-m %s" % options.minimum_length)
-    if options.maximum_length != None:
-        params.append("-M %s" % options.maximum_length)
-    if options.cutoff != None:
-        params.append("-q %s" % options.cutoff)
-
-
-    # cutadapt relies on the extension to determine file format: .fasta or .fastq
-    input_name = '.'.join((options.input,options.format))
-    # make temp directory
-    tmp_dir = tempfile.mkdtemp()
-
-    try:
-        # make a link to the input file in the tmp_dir
-        input_file = os.path.join(tmp_dir,os.path.basename(input_name)) 
-        os.symlink( options.input, input_file) 
-        
-        # generate commandline
-        cmd = 'cutadapt %s %s' % (' '.join(params),input_file)
-        proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir,
-                                stdout=subprocess.PIPE,
-                               stderr=subprocess.PIPE)
-        (stdoutdata, stderrdata) = proc.communicate()
-        returncode = proc.returncode
-        if returncode != 0:
-            raise Exception, 'Execution of cutadapt failed.\n%s' % stderrdata
-        print stderrdata
-
-    finally:
-        # clean up temp dir
-        if os.path.exists( input_name ):
-            os.remove( input_name )
-        if os.path.exists( tmp_dir ):
-            shutil.rmtree( tmp_dir )
-
-if __name__ == '__main__':
-    try:
-        parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), usage=globals()['__doc__'], version='$Id$')
-        parser.add_option( '-i', '--input', dest='input', help='The sequence input file' )
-        parser.add_option( '-f', '--format', dest='format', default='fastq',
-                          help='The sequence input file format (default: fastq)' )
-        parser.add_option ('-a', '--adapter', action='append', dest='adapters', help='3\' adapter sequence(s)')
-        parser.add_option ('-b', '--anywhere', action='append', dest='anywhere_adapters', help='5\' or 3\' "anywhere" adapter sequence(s)')
-        parser.add_option ('-e', '--error-rate', dest='error_rate', help='Maximum allowed error rate')
-        parser.add_option ('-n', '--times', dest='count', help='Try to remove adapters COUNT times')
-        parser.add_option ('-O', '--overlap', dest='overlap_length', help='Minimum overlap length')
-        parser.add_option ('--discard', '--discard-trimmed', dest='discard_trimmed', action='store_true', default=False, help='Discard reads that contain the adapter')
-        parser.add_option ('-m', '--minimum-length', dest='minimum_length', help='Discard reads that are shorter than LENGTH')
-        parser.add_option ('-M', '--maximum-length', dest='maximum_length', help='Discard reads that are longer than LENGTH')
-        parser.add_option ('-q', '--quality-cutoff', dest='cutoff', help='Trim
-                           low quality ends from reads before adapter removal')
-        parser.add_option ('-o', '--output', dest='output_file', help='The modified sequences are written to the file')
-        (options, args) = parser.parse_args()
-        if options.input == None:
-             stop_err("Misssing option --input")
-        if options.output_file == None:
-             stop_err("Misssing option --output")
-        if not os.path.exists(options.input):
-            stop_err("Unable to read intput file: %s" % options.input)
-   #if len(args) < 1:
-        #    parser.error ('missing argument')
-        main()
-        sys.exit(0)
-    except KeyboardInterrupt, e: # Ctrl-C
-        raise e
-    except SystemExit, e: # sys.exit()
-        raise e
-    except Exception, e:
-        print 'ERROR, UNEXPECTED EXCEPTION'
-        print str(e)
-        traceback.print_exc()
-        os._exit(1)
-
--- a/discard_stderr_wrapper.sh	Wed May 25 19:33:40 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-#!/bin/sh
-
-# STDERR wrapper - discards STDERR if command execution was OK.
-
-#
-# This script executes a given command line,
-# while saving the STDERR in a temporary file.
-#
-# When the command is completed, it checks to see if the exit code was zero.
-# if so - the command is assumed to have succeeded - the STDERR file is discarded.
-# if not - the command is assumed to have failed, and the STDERR file is dumped to the real STDERR
-#
-#
-# Use this wrapper for tools which insist on writting stuff to STDERR
-# even if they succeeded - which throws galaxy off balance.
-#
-#
-# Copyright 2009 (C) by Assaf Gordon
-# This file is distributed under the BSD license.
-#
-# Modified by Lance Parsons (2011)
-# Echo STDERR to STDOUT if return code was 0
-
-TMPFILE=$(mktemp -t tmp.XXXXXXXXXX) || exit 1
-#CWD=`pwd`
-#DIRECTORY=$(cd `dirname $0` && pwd)
-#cd $DIRECTORY
-"$@" 2> $TMPFILE
-
-EXITCODE=$?
-# Exitcode != 0 ?
-if [ "$EXITCODE" -ne "0" ]; then
-	cat $TMPFILE >&2
-else
-#	echo "Testing STDOUT"
-	cat $TMPFILE >&1
-fi
-rm $TMPFILE
-cd $CWD
-exit $EXITCODE
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cutadapt_discard.out	Fri Jul 22 11:03:00 2011 -0400
@@ -0,0 +1,4 @@
+@prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cutadapt_rest.fa	Fri Jul 22 11:03:00 2011 -0400
@@ -0,0 +1,10 @@
+>read1
+TESTINGADAPTERREST1
+>read2
+TESTINGADAPTERRESTING
+>read3
+TESTINGADAPTER
+>read4
+TESTINGADAPTERRESTLESS
+>read5
+TESTINGADAPTERRESTORE
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cutadapt_rest.out	Fri Jul 22 11:03:00 2011 -0400
@@ -0,0 +1,10 @@
+>read1
+TESTING
+>read2
+TESTING
+>read3
+TESTING
+>read4
+TESTING
+>read5
+TESTING
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cutadapt_rest2.out	Fri Jul 22 11:03:00 2011 -0400
@@ -0,0 +1,4 @@
+REST1
+RESTING
+RESTLESS
+RESTORE
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cutadapt_small.fastq	Fri Jul 22 11:03:00 2011 -0400
@@ -0,0 +1,12 @@
+@prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGATTAGACAAAT
++
+)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5
+@prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT
++
+;<:&:A;A!9<<<,7:<=3=;:<&<?<?8<;=<&
+@prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cutadapt_small.out	Fri Jul 22 11:03:00 2011 -0400
@@ -0,0 +1,12 @@
+@prefix:1_13_573/1
+CGTCCGAANTAGCTACCACCCTGA
++
+)3%)&&&&!.1&(6:<'67..*,:
+@prefix:1_13_1259/1
+AGCCGCTANGACGGGTTGGCCC
++
+;<:&:A;A!9<<<,7:<=3=;:
+@prefix:1_13_1440/1
+CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC
++
+<=A:A=57!7<';<6?5;;6:+:=)71>70<,=: