# HG changeset patch # User Lance Parsons # Date 1305316441 14400 # Node ID 8b064ea1672296c6c224cb4af5e36cb685d88993 Initial version with multiple adapter support diff -r 000000000000 -r 8b064ea16722 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Fri May 13 15:54:01 2011 -0400 @@ -0,0 +1,19 @@ +Galaxy tool definition for cutadapt (http://code.google.com/p/cutadapt/) + +Installation +------------ + +1 - Install the cutadapt package and make sure it is in path for Galaxy +2 - Copy cutadapt.xml and cutadapt_galaxy_wrapper.py to $GALAXY_HOME/tools/cutadapt +3 - Add the tool to the $GALAXY_HOME/tool_conf.xml tool-registry file + + +Limitations +----------- + +Colorspace data is not implemented +Discard trimmed reads is not implemented (broken in cutadapt 0.9.3) +Storing of "rest fo read" after the adapter (-r), too-short reads (--too-short-output), and untrimmed-reads (--untreimmed-output) are not implemented +Quality cutoff (-q) not implemented +Prefix and Suffix to read names not implemented +Length-tag addition to read name not implemented diff -r 000000000000 -r 8b064ea16722 cutadapt.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutadapt.xml Fri May 13 15:54:01 2011 -0400 @@ -0,0 +1,168 @@ + + from high-throughput sequence data + + cutadapt + + + cutadapt_galaxy_wrapper.py + #if $input.extension.startswith( "fastq"): + --format=fastq + #else + --format=$input.extension + #end if + #for $a in $adapters + -a '${a.adapter_source.adapter}' + #end for + #for $aa in $anywhere_adapters + -b '${aa.anywhere_adapter_source.anywhere_adapter}' + #end for + -e $error_rate + -n $count + -O $overlap + #if str($min) != '0': + -m $min + #end if + #if str($max) != '0': + -M $max + #end if + --input='$input' + --output='$output' + > $report + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool removes adapter sequences from DNA high-throughput +sequencing data. This is usually necessary when the read length of the +machine is longer than the molecule that is sequenced, such as in +microRNA data. + +The tool is based on the opensource cutadapt_ tool. + +----- + +**Algorithm** + +cutadapt uses a simple semi-global alignment algorithm, without any special optimizations. +For speed, the algorithm is implemented as a Python extension module in calignmodule.c. + +The program is sufficiently fast for my purposes, but speedups should be simple to achieve. + + +**Partial adapter matches** + +Cutadapt correctly deals with partial adapter matches. As an example, suppose +your adapter sequence is "ADAPTER" (specified via 3' Adapters parameter). +If you have these input sequences: + +:: + + MYSEQUENCEADAPTER + MYSEQUENCEADAP + MYSEQUENCEADAPTERSOMETHINGELSE + +All of them will be trimmed to "MYSEQUENCE". If the sequence starts with an +adapter, like this: + +:: + + ADAPTERSOMETHING + +It will be empty after trimming. + +When the allowed error rate is sufficiently high, errors in +the adapter sequence are allowed. For example, ADABTER (1 mismatch), ADAPTR (1 deletion), +and ADAPPTER (1 insertion) will all be recognized if the error rate is set to 0.15. + + +**Allowing adapters anywhere** + +Cutadapt assumes that any adapter specified via the *3` Adapters* parameter +was ligated to the 3' end of the sequence. This is the correct assumption for +at least the SOLiD and Illumina small RNA protocols and probably others. + +If, on the other hand, your adapter can also be ligated to the 5' end (on +purpose or by accident), you should tell cutadapt so by using the *5' or 3' (Anywhere) +Adapters parameter. It will then use a different alignment algorithm and +correctly trim adapters that appear in the beginning of a read. An adapter +specified this way will also be found if it appears only partially in the +beginning of a read. For example, these sequences + +:: + + ADAPTERMYSEQUENCE + PTERMYSEQUENCE + +will be trimmed to "MYSEQUENCE". Note that the regular algorithm would trim +the first read to an empty sequence. + +This parameter currently does not work with color space data. + + +.. _cutadapt: http://code.google.com/p/cutadapt/ + + + diff -r 000000000000 -r 8b064ea16722 cutadapt_galaxy_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutadapt_galaxy_wrapper.py Fri May 13 15:54:01 2011 -0400 @@ -0,0 +1,131 @@ +#!/usr/bin/env python +""" +SYNOPSIS + + cutadapt_galaxy_wrapper.py + -i input_file + -o output_file + [-f format (fastq/fastq/etc.)] + [-a 3' adapter sequence] + [-b 3' or 5' anywhere adapter sequence] + [-e error_rate] + [-n count] + [-O overlap_length] + [--discard discard trimmed reads] + [-m minimum read length] + [-M maximum read length] + [-h,--help] [-v,--verbose] [--version] + +DESCRIPTION + + Wrapper for cutadapt running as a galaxy tool + +AUTHOR + + Lance Parsons + +LICENSE + + This script is in the public domain, free from copyrights or restrictions. + +VERSION + + $Id$ +""" + +import sys, os, traceback, optparse, shutil, subprocess, tempfile +import re +#from pexpect import run, spawn + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def main (): + + global options, args + # Setup Parameters + params = [] + if options.adapters != None: + params.append("-a %s" % " -a ".join(options.adapters)) + if options.anywhere_adapters != None: + params.append("-b %s" % " -b ".join(options.anywhere_adapters)) + if options.output_file != None: + params.append("-o %s" % options.output_file) + if options.error_rate != None: + params.append("-e %s" % options.error_rate) + if options.count != None: + params.append("-n %s" % options.count) + if options.overlap_length != None: + params.append("-O %s" % options.overlap_length) + if options.discard_trimmed: + params.append("--discard") + if options.minimum_length != None: + params.append("-m %s" % options.minimum_length) + if options.maximum_length != None: + params.append("-M %s" % options.maximum_length) + + # cutadapt relies on the extension to determine file format: .fasta or .fastq + input_name = '.'.join((options.input,options.format)) + # make temp directory + tmp_dir = tempfile.mkdtemp() + + try: + # make a link to the input file in the tmp_dir + input_file = os.path.join(tmp_dir,os.path.basename(input_name)) + os.symlink( options.input, input_file) + + # generate commandline + cmd = 'cutadapt %s %s' % (' '.join(params),input_file) + proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = proc.communicate() + returncode = proc.returncode + if returncode != 0: + raise Exception, 'Execution of cutadapt failed.\n%s' % stderrdata + print stderrdata + + finally: + # clean up temp dir + if os.path.exists( input_name ): + os.remove( input_name ) + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + +if __name__ == '__main__': + try: + parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), usage=globals()['__doc__'], version='$Id$') + parser.add_option( '-i', '--input', dest='input', help='The sequence input file' ) + parser.add_option( '-f', '--format', dest='format', default='fastq', + help='The sequence input file format (default: fastq)' ) + parser.add_option ('-a', '--adapter', action='append', dest='adapters', help='3\' adapter sequence(s)') + parser.add_option ('-b', '--anywhere', action='append', dest='anywhere_adapters', help='5\' or 3\' "anywhere" adapter sequence(s)') + parser.add_option ('-e', '--error-rate', dest='error_rate', help='Maximum allowed error rate') + parser.add_option ('-n', '--times', dest='count', help='Try to remove adapters COUNT times') + parser.add_option ('-O', '--overlap', dest='overlap_length', help='Minimum overlap length') + parser.add_option ('--discard', '--discard-trimmed', dest='discard_trimmed', action='store_true', default=False, help='Discard reads that contain the adapter') + parser.add_option ('-m', '--minimum-length', dest='minimum_length', help='Discard reads that are shorter than LENGTH') + parser.add_option ('-M', '--maximum-length', dest='maximum_length', help='Discard reads that are longer than LENGTH') + parser.add_option ('-o', '--output', dest='output_file', help='The modified sequences are written to the file') + (options, args) = parser.parse_args() + if options.input == None: + stop_err("Misssing option --input") + if options.output_file == None: + stop_err("Misssing option --output") + if not os.path.exists(options.input): + stop_err("Unable to read intput file: %s" % options.input) + #if len(args) < 1: + # parser.error ('missing argument') + main() + sys.exit(0) + except KeyboardInterrupt, e: # Ctrl-C + raise e + except SystemExit, e: # sys.exit() + raise e + except Exception, e: + print 'ERROR, UNEXPECTED EXCEPTION' + print str(e) + traceback.print_exc() + os._exit(1) +