Mercurial > repos > lparsons > cutadapt
changeset 0:8b064ea16722
Initial version with multiple adapter support
author | Lance Parsons <lparsons@princeton.edu> |
---|---|
date | Fri, 13 May 2011 15:54:01 -0400 |
parents | |
children | 50cdd8870607 |
files | README cutadapt.xml cutadapt_galaxy_wrapper.py |
diffstat | 3 files changed, 318 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Fri May 13 15:54:01 2011 -0400 @@ -0,0 +1,19 @@ +Galaxy tool definition for cutadapt (http://code.google.com/p/cutadapt/) + +Installation +------------ + +1 - Install the cutadapt package and make sure it is in path for Galaxy +2 - Copy cutadapt.xml and cutadapt_galaxy_wrapper.py to $GALAXY_HOME/tools/cutadapt +3 - Add the tool to the $GALAXY_HOME/tool_conf.xml tool-registry file + + +Limitations +----------- + +Colorspace data is not implemented +Discard trimmed reads is not implemented (broken in cutadapt 0.9.3) +Storing of "rest fo read" after the adapter (-r), too-short reads (--too-short-output), and untrimmed-reads (--untreimmed-output) are not implemented +Quality cutoff (-q) not implemented +Prefix and Suffix to read names not implemented +Length-tag addition to read name not implemented
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutadapt.xml Fri May 13 15:54:01 2011 -0400 @@ -0,0 +1,168 @@ +<tool id="cutadapt" name="Remove adapter sequences" version="0.9.3"> + <description>from high-throughput sequence data</description> + <requirements> + <requirement type="python-module">cutadapt</requirement> + </requirements> + + <command interpreter="python">cutadapt_galaxy_wrapper.py + #if $input.extension.startswith( "fastq"): + --format=fastq + #else + --format=$input.extension + #end if + #for $a in $adapters + -a '${a.adapter_source.adapter}' + #end for + #for $aa in $anywhere_adapters + -b '${aa.anywhere_adapter_source.anywhere_adapter}' + #end for + -e $error_rate + -n $count + -O $overlap + #if str($min) != '0': + -m $min + #end if + #if str($max) != '0': + -M $max + #end if + --input='$input' + --output='$output' + > $report + </command> + <inputs> + <param format="fastqsanger, fasta" name="input" type="data" optional="false" label="Fastq file to trim" length="100"/> + + <repeat name="adapters" title="3' Adapters"> + <conditional name="adapter_source"> + <param name="adapter_source_list" type="select" label="Source" > + <option value="prebuilt" selected="true">Standard (select from the list below)</option> + <option value="user">Enter custom sequence</option> + </param> + + <when value="user"> + <param name="adapter" size="30" label="Enter custom 3' adapter sequence" type="text" value="AATTGGCC" help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed. If multiple adapters are specified, only the best matching adapter is trimmed."/> + </when> + + <when value="prebuilt"> + <param name="adapter" type="select" label="Choose 3' adapter" help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed. If multiple adapters are specified, only the best matching adapter is trimmed."> + <options from_file="fastx_clipper_sequences.txt"> + <column name="name" index="1"/> + <column name="value" index="0"/> + </options> + </param> + </when> + </conditional> + </repeat> + + <repeat name="anywhere_adapters" title="5' or 3' (Anywhere) Adapters" help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed."> + <conditional name="anywhere_adapter_source"> + <param name="anywhere_adapter_source_list" type="select" label="Source"> + <option value="prebuilt" selected="true">Standard (select from the list below)</option> + <option value="user">Enter custom sequence</option> + </param> + + <when value="user"> + <param name="anywhere_adapter" size="30" label="Enter custom 5' or 3' adapter sequence" type="text" value="AATTGGCC" help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed."/> + </when> + <when value="prebuilt"> + <param name="anywhere_adapter" type="select" label="Choose 5' or 3' adapter" help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed."> + <options from_file="fastx_clipper_sequences.txt"> + <column name="name" index="1"/> + <column name="value" index="0"/> + </options> + </param> + </when> + </conditional> + </repeat> + + <param name="error_rate" type="float" min="0" max="1" value="0.1" label="Maximum error rate" help="Maximum allowed error rate (no. of errors divided by the length of the matching region)." /> + <param name="count" type="integer" min="1" value="1" label="Match times" help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times." /> + <param name="overlap" type="integer" min="1" value="3" label="Minimum overlap length" help="Minimum overlap length. If the overlap between the adapter and the sequence is shorter than LENGTH, the read is not modified." /> + <!--<param name="discard" type="boolean" checked="false" label="Discard Trimmed Reads" help="Discard reads that contain the adapter instead of trimming them. Use the 'Minimum overlap length' option in order to avoid throwing away too many randomly matching reads!" />--> + <param name="min" type="integer" min="0" optional="true" value="0" label="Minimum length" help="Discard trimmed reads that are shorter than LENGTH. Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no minimum length." /> + <param name="max" type="integer" min="0" optional="true" value="0" label="Maximum length" help="Discard trimmed reads that are longer than LENGTH. Reads that are too long even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no maximum length." /> + </inputs> + <outputs> + <data format="txt" name="report" label="${tool.name} on ${on_string} (Report)" /> + <data format="input" name="output" metadata_source="input"/> + </outputs> + + <tests> + <test> + <param name="input" value="fa_gc_content_input.fa"/> + <output name="out_file1" file="fa_gc_content_output.txt"/> + </test> + </tests> + + <help> +This tool removes adapter sequences from DNA high-throughput +sequencing data. This is usually necessary when the read length of the +machine is longer than the molecule that is sequenced, such as in +microRNA data. + +The tool is based on the opensource cutadapt_ tool. + +----- + +**Algorithm** + +cutadapt uses a simple semi-global alignment algorithm, without any special optimizations. +For speed, the algorithm is implemented as a Python extension module in calignmodule.c. + +The program is sufficiently fast for my purposes, but speedups should be simple to achieve. + + +**Partial adapter matches** + +Cutadapt correctly deals with partial adapter matches. As an example, suppose +your adapter sequence is "ADAPTER" (specified via 3' Adapters parameter). +If you have these input sequences: + +:: + + MYSEQUENCEADAPTER + MYSEQUENCEADAP + MYSEQUENCEADAPTERSOMETHINGELSE + +All of them will be trimmed to "MYSEQUENCE". If the sequence starts with an +adapter, like this: + +:: + + ADAPTERSOMETHING + +It will be empty after trimming. + +When the allowed error rate is sufficiently high, errors in +the adapter sequence are allowed. For example, ADABTER (1 mismatch), ADAPTR (1 deletion), +and ADAPPTER (1 insertion) will all be recognized if the error rate is set to 0.15. + + +**Allowing adapters anywhere** + +Cutadapt assumes that any adapter specified via the *3` Adapters* parameter +was ligated to the 3' end of the sequence. This is the correct assumption for +at least the SOLiD and Illumina small RNA protocols and probably others. + +If, on the other hand, your adapter can also be ligated to the 5' end (on +purpose or by accident), you should tell cutadapt so by using the *5' or 3' (Anywhere) +Adapters parameter. It will then use a different alignment algorithm and +correctly trim adapters that appear in the beginning of a read. An adapter +specified this way will also be found if it appears only partially in the +beginning of a read. For example, these sequences + +:: + + ADAPTERMYSEQUENCE + PTERMYSEQUENCE + +will be trimmed to "MYSEQUENCE". Note that the regular algorithm would trim +the first read to an empty sequence. + +This parameter currently does not work with color space data. + + +.. _cutadapt: http://code.google.com/p/cutadapt/ + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutadapt_galaxy_wrapper.py Fri May 13 15:54:01 2011 -0400 @@ -0,0 +1,131 @@ +#!/usr/bin/env python +""" +SYNOPSIS + + cutadapt_galaxy_wrapper.py + -i input_file + -o output_file + [-f format (fastq/fastq/etc.)] + [-a 3' adapter sequence] + [-b 3' or 5' anywhere adapter sequence] + [-e error_rate] + [-n count] + [-O overlap_length] + [--discard discard trimmed reads] + [-m minimum read length] + [-M maximum read length] + [-h,--help] [-v,--verbose] [--version] + +DESCRIPTION + + Wrapper for cutadapt running as a galaxy tool + +AUTHOR + + Lance Parsons <lparsons@princeton.edu> + +LICENSE + + This script is in the public domain, free from copyrights or restrictions. + +VERSION + + $Id$ +""" + +import sys, os, traceback, optparse, shutil, subprocess, tempfile +import re +#from pexpect import run, spawn + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def main (): + + global options, args + # Setup Parameters + params = [] + if options.adapters != None: + params.append("-a %s" % " -a ".join(options.adapters)) + if options.anywhere_adapters != None: + params.append("-b %s" % " -b ".join(options.anywhere_adapters)) + if options.output_file != None: + params.append("-o %s" % options.output_file) + if options.error_rate != None: + params.append("-e %s" % options.error_rate) + if options.count != None: + params.append("-n %s" % options.count) + if options.overlap_length != None: + params.append("-O %s" % options.overlap_length) + if options.discard_trimmed: + params.append("--discard") + if options.minimum_length != None: + params.append("-m %s" % options.minimum_length) + if options.maximum_length != None: + params.append("-M %s" % options.maximum_length) + + # cutadapt relies on the extension to determine file format: .fasta or .fastq + input_name = '.'.join((options.input,options.format)) + # make temp directory + tmp_dir = tempfile.mkdtemp() + + try: + # make a link to the input file in the tmp_dir + input_file = os.path.join(tmp_dir,os.path.basename(input_name)) + os.symlink( options.input, input_file) + + # generate commandline + cmd = 'cutadapt %s %s' % (' '.join(params),input_file) + proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = proc.communicate() + returncode = proc.returncode + if returncode != 0: + raise Exception, 'Execution of cutadapt failed.\n%s' % stderrdata + print stderrdata + + finally: + # clean up temp dir + if os.path.exists( input_name ): + os.remove( input_name ) + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + +if __name__ == '__main__': + try: + parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), usage=globals()['__doc__'], version='$Id$') + parser.add_option( '-i', '--input', dest='input', help='The sequence input file' ) + parser.add_option( '-f', '--format', dest='format', default='fastq', + help='The sequence input file format (default: fastq)' ) + parser.add_option ('-a', '--adapter', action='append', dest='adapters', help='3\' adapter sequence(s)') + parser.add_option ('-b', '--anywhere', action='append', dest='anywhere_adapters', help='5\' or 3\' "anywhere" adapter sequence(s)') + parser.add_option ('-e', '--error-rate', dest='error_rate', help='Maximum allowed error rate') + parser.add_option ('-n', '--times', dest='count', help='Try to remove adapters COUNT times') + parser.add_option ('-O', '--overlap', dest='overlap_length', help='Minimum overlap length') + parser.add_option ('--discard', '--discard-trimmed', dest='discard_trimmed', action='store_true', default=False, help='Discard reads that contain the adapter') + parser.add_option ('-m', '--minimum-length', dest='minimum_length', help='Discard reads that are shorter than LENGTH') + parser.add_option ('-M', '--maximum-length', dest='maximum_length', help='Discard reads that are longer than LENGTH') + parser.add_option ('-o', '--output', dest='output_file', help='The modified sequences are written to the file') + (options, args) = parser.parse_args() + if options.input == None: + stop_err("Misssing option --input") + if options.output_file == None: + stop_err("Misssing option --output") + if not os.path.exists(options.input): + stop_err("Unable to read intput file: %s" % options.input) + #if len(args) < 1: + # parser.error ('missing argument') + main() + sys.exit(0) + except KeyboardInterrupt, e: # Ctrl-C + raise e + except SystemExit, e: # sys.exit() + raise e + except Exception, e: + print 'ERROR, UNEXPECTED EXCEPTION' + print str(e) + traceback.print_exc() + os._exit(1) +