Mercurial > repos > lparsons > cutadapt
changeset 8:2d6671b10919 draft
Updated to support cutadapt version 1.1 (also include automatic dependency installation)
author | lparsons |
---|---|
date | Mon, 26 Nov 2012 17:37:26 -0500 |
parents | 1dda185ea2d0 |
children | 93d58ffe39f1 |
files | .hgtags README cutadapt.xml tool_dependencies.xml |
diffstat | 4 files changed, 228 insertions(+), 61 deletions(-) [+] |
line wrap: on
line diff
--- a/.hgtags Thu Dec 22 11:46:33 2011 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -8b064ea1672296c6c224cb4af5e36cb685d88993 0.1 -1dada50cca8ae5ee163254f3981b0c1ec4becb64 v0.9.5.a
--- a/README Thu Dec 22 11:46:33 2011 -0500 +++ b/README Mon Nov 26 17:37:26 2012 -0500 @@ -1,7 +1,18 @@ Galaxy tool definition for cutadapt (http://code.google.com/p/cutadapt/) -Installation ------------- + +Installation - Tool Shed +--------------------------- + +The recommended way to install cutadapt as a tool in Galaxy is to the use the +Galaxy Tool Shed (http://wiki.galaxyproject.org/Tool%20Shed). + +This will allow cutadapt to be installed automatically and keep track of older +versions of cutadapt and the tool wrapper. + + +Installation - Manual +--------------------- 1 - Install the cutadapt package and make sure it is in path for Galaxy 2 - Copy cutadapt.xml to $GALAXY_HOME/tools/cutadapt @@ -16,9 +27,16 @@ See the Galaxy Wiki for more information: http://wiki.g2.bx.psu.edu/ +Configuration of Adapters +------------------------- + +A list of predefined adapters may be specified in the fastx_clipper_sequences.txt +file which resides in the tool-data directory underneath the Galaxy root. A sample +file is provided. + + Limitations ----------- Colorspace data support is not implemented -Prefix and Suffix to read names not implemented -Length-tag addition to read name not implemented +Name adapters support is not implemented
--- a/cutadapt.xml Thu Dec 22 11:46:33 2011 -0500 +++ b/cutadapt.xml Mon Nov 26 17:37:26 2012 -0500 @@ -1,12 +1,19 @@ -<tool id="cutadapt" name="Cutadapt" version="0.9.5.a"> +<tool id="cutadapt" name="Cutadapt" version="1.1.a"> <description>Remove adapter sequences from Fastq/Fasta</description> <requirements> - <requirement type="python-module">cutadapt</requirement> + <requirement type="package" version="1.1">cutadapt</requirement> </requirements> + <version_command>cutadapt --version</version_command> <command>cutadapt #if $input.extension.startswith( "fastq"): - --format=fastq + --format=fastq + #if $input.extension == "fastqillumina": + --quality-base=64 + #end if + #if $input.extension == "fastqsolexa": + --quality-base=64 + #end if #else --format=$input.extension #end if @@ -16,38 +23,64 @@ #for $aa in $anywhere_adapters --anywhere='${aa.anywhere_adapter_source.anywhere_adapter}' #end for + #for $fa in $front_adapters + --front='${fa.front_adapter_source.front_adapter}' + #end for --error-rate=$error_rate --times=$count --overlap=$overlap - #if str($min) != '0': - --minimum-length=$min - #end if - #if str($max) != '0': - --maximum-length=$max - #end if - #if str($quality_cutoff) != '0': - --quality-cutoff=$quality_cutoff - #end if - $discard + $match_read_wildcards + $no_match_adapters_wildcards + + #if str( $output_filtering_options.output_filtering) == "filter": + $output_filtering_options.discard + #if str($output_filtering_options.min) != '0': + --minimum-length=$output_filtering_options.min + #end if + #if str($output_filtering_options.max) != '0': + --maximum-length=$output_filtering_options.max + #end if + #end if + --output='$output' #if str( $output_params.output_type ) == "additional": #if $output_params.rest_file: - --rest-file=$rest_output + --rest-file=$rest_output + #end if + #if $output_params.wildcard_file: + --wildcard-file=$wild_output #end if #if $output_params.too_short_file: - --too-short-output=$too_short_output + --too-short-output=$too_short_output #end if #if $output_params.untrimmed_file: - --untrimmed-output=$untrimmed_output + --untrimmed-output=$untrimmed_output #end if #end if + + #if str( $read_modification_params.read_modification) == "modify": + #if str($read_modification_params.quality_cutoff) != '0': + --quality-cutoff=$read_modification_params.quality_cutoff + #end if + #if $read_modification_params.prefix != '': + --prefix="$read_modification_params.prefix" + #end if + #if $read_modification_params.suffix != '': + --suffix="$read_modification_params.suffix" + #end if + #if $read_modification_params.length_tag != '': + --length-tag="$read_modification_params.length_tag" + #end if + $read_modification_params.zero_cap + #end if + '$input' > $report </command> <inputs> - <param format="fastqsanger, fasta" name="input" type="data" optional="false" label="Fastq file to trim" length="100"/> + <param format="fastqsanger, fastqillumina, fastqsolexa, fasta" name="input" type="data" optional="false" label="Fastq file to trim" length="100"/> - <repeat name="adapters" title="3' Adapters"> + <repeat name="adapters" title="3' Adapters" help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed."> <conditional name="adapter_source"> <param name="adapter_source_list" type="select" label="Source" > <option value="prebuilt" selected="true">Standard (select from the list below)</option> @@ -55,11 +88,11 @@ </param> <when value="user"> - <param name="adapter" size="30" label="Enter custom 3' adapter sequence" type="text" value="AATTGGCC" help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed. If multiple adapters are specified, only the best matching adapter is trimmed."/> + <param name="adapter" size="30" label="Enter custom 3' adapter sequence" type="text" value="AATTGGCC" /> </when> <when value="prebuilt"> - <param name="adapter" type="select" label="Choose 3' adapter" help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed. If multiple adapters are specified, only the best matching adapter is trimmed."> + <param name="adapter" type="select" label="Choose 3' adapter"> <options from_file="fastx_clipper_sequences.txt"> <column name="name" index="1"/> <column name="value" index="0"/> @@ -77,10 +110,31 @@ </param> <when value="user"> - <param name="anywhere_adapter" size="30" label="Enter custom 5' or 3' adapter sequence" type="text" value="AATTGGCC" help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed."/> + <param name="anywhere_adapter" size="30" label="Enter custom 5' or 3' adapter sequence" type="text" value="AATTGGCC" /> </when> <when value="prebuilt"> - <param name="anywhere_adapter" type="select" label="Choose 5' or 3' adapter" help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed."> + <param name="anywhere_adapter" type="select" label="Choose 5' or 3' adapter"> + <options from_file="fastx_clipper_sequences.txt"> + <column name="name" index="1"/> + <column name="value" index="0"/> + </options> + </param> + </when> + </conditional> + </repeat> + + <repeat name="front_adapters" title="5' (Front) Adapters" help="Sequence of an adapter that was ligated to the 5' end. If the adapter sequence starts with the character '^', the adapter is 'anchored'. An anchored adapter must appear in its entirety at the 5' end of the read (it is a prefix of the read). A non-anchored adapter may appear partially at the 5' end, or it may occur within the read. If it is found within a read, the sequence preceding the adapter is also trimmed. In all cases the adapter itself is trimmed."> + <conditional name="front_adapter_source"> + <param name="front_adapter_source_list" type="select" label="Source"> + <option value="prebuilt" selected="true">Standard (select from the list below)</option> + <option value="user">Enter custom sequence</option> + </param> + + <when value="user"> + <param name="front_adapter" size="30" label="Enter custom 5' adapter sequence" type="text" value="AATTGGCC" /> + </when> + <when value="prebuilt"> + <param name="front_adapter" type="select" label="Choose 5' adapter"> <options from_file="fastx_clipper_sequences.txt"> <column name="name" index="1"/> <column name="value" index="0"/> @@ -92,12 +146,25 @@ <param name="error_rate" type="float" min="0" max="1" value="0.1" label="Maximum error rate" help="Maximum allowed error rate (no. of errors divided by the length of the matching region)." /> <param name="count" type="integer" min="1" value="1" label="Match times" help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times." /> - <param name="overlap" type="integer" min="1" value="3" label="Minimum overlap length" help="Minimum overlap length. If the overlap between the adapter and the sequence is shorter than LENGTH, the read is not modified." /> - <param name="discard" type="boolean" value="false" truevalue="--discard" falsevalue="" label="Discard Trimmed Reads" help="Discard reads that contain the adapter instead of trimming them. Use the 'Minimum overlap length' option in order to avoid throwing away too many randomly matching reads!" /> - <param name="min" type="integer" min="0" optional="true" value="0" label="Minimum length" help="Discard trimmed reads that are shorter than LENGTH. Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no minimum length." /> - <param name="max" type="integer" min="0" optional="true" value="0" label="Maximum length" help="Discard trimmed reads that are longer than LENGTH. Reads that are too long even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no maximum length." /> - <param name="quality_cutoff" type="integer" min="0" optional="true" value="0" label="Quality cutoff" help="Trim low-quality ends from reads before adapter removal. The algorithm is the same as the one used by BWA (Subtract CUTOFF from all qualities; compute partial sums from all indices to the end of the sequence; cut sequence at the index at which the sum is minimal). Value of 0 means no quality trimming." /> - <conditional name="output_params"> + <param name="overlap" type="integer" min="1" value="3" label="Minimum overlap length" help="Minimum overlap length. If the overlap between the adapter and the sequence is shorter than LENGTH, the read is not modified. This reduces the number of bases trimmed purely due to short random adapter matches." /> + + <param name="match_read_wildcards" type="boolean" value="false" truevalue="--match-read-wildcards" falsevalue="" label="Match Read Wildcards" help="Allow 'N's in the read as matches to the adapter." /> + <param name="no_match_adapters_wildcards" type="boolean" value="false" truevalue="--no-match-adapter-wildcards" falsevalue="" label="Do Not Match Adapter Wildcards" help="Do not treat 'N' in the adapter sequence as wildcards. This is needed when you want to search for literal 'N' characters." /> + + <conditional name="output_filtering_options"> + <param name="output_filtering" type="select" label="Output filtering options" help="Options for filtering processed reads by those that contain the adapter or by minimum or maximum length"> + <option value="default">Default (no filtering)</option> + <option value="filter">Set Filters</option> + </param> + <when value="default" /> + <when value="filter"> + <param name="discard" type="boolean" value="false" truevalue="--discard" falsevalue="" label="Discard Trimmed Reads" help="Discard reads that contain the adapter instead of trimming them. Use the 'Minimum overlap length' option in order to avoid throwing away too many randomly matching reads!" /> + <param name="min" type="integer" min="0" optional="true" value="0" label="Minimum length" help="Discard trimmed reads that are shorter than LENGTH. Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no minimum length." /> + <param name="max" type="integer" min="0" optional="true" value="0" label="Maximum length" help="Discard trimmed reads that are longer than LENGTH. Reads that are too long even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no maximum length." /> + </when> + </conditional> + + <conditional name="output_params"> <param name="output_type" type="select" label="Additional output options" help="By default all reads will be put in the same file. However, reads with adapters matching in the middle, unmatched reads, and too-short reads can be saved in separate files."> <option value="default">Default</option> <option value="additional">Additional output files</option> @@ -105,11 +172,28 @@ <when value="default" /> <when value="additional"> <param name="rest_file" type="boolean" value="false" label="Rest of Read" help="When the adapter matches in the middle of a read, write the rest (after the adapter) into a file."/> + <param name="wildcard_file" type="boolean" value="false" label="Wildcard File" help="When the adapter has wildcard bases ('N's) write adapter bases matching wildcard positions to file."/> <param name="too_short_file" type="boolean" value="false" label="Too Short Reads" help="Write reads that are too short (according to minimum length specified) to a file. (default: discard reads)"/> <param name="untrimmed_file" type="boolean" value="false" label="Untrimmed Reads" help="Write reads that do not contain the adapter to a separate file, instead of writing them to the regular output file. (default: output to same file as trimmed)"/> </when> </conditional> + + <conditional name="read_modification_params"> + <param name="read_modification" type="select" label="Additional modifications to reads" help="Various options to trim reads based on quality, modify read names and quality scores"> + <option value="none">No Read Modifications</option> + <option value="modify">Set Modification Options</option> + </param> + <when value="none" /> + <when value="modify"> + <param name="quality_cutoff" type="integer" min="0" optional="true" value="0" label="Quality cutoff" help="Trim low-quality ends from reads before adapter removal. The algorithm is the same as the one used by BWA (Subtract CUTOFF from all qualities; compute partial sums from all indices to the end of the sequence; cut sequence at the index at which the sum is minimal). Value of 0 means no quality trimming." /> + <param name="prefix" label="Prefix" type="text" help="Add this prefix to read names" /> + <param name="suffix" label="Suffix" type="text" help="Add this suffix to read names" /> + <param name="length_tag" label="Length Tag" type="text" help="Search for TAG followed by a decimal number in the name of the read (description/comment field of the FASTA or FASTQ file). Replace the decimal number with the correct length of the trimmed read. For example, use --length-tag 'length=' to search for fields like 'length=123'." /> + <param name="zero_cap" type="boolean" value="false" label="Change negative quality values to zero (0)" truevalue="--zero-cap" falsevalue="" help="Workaround to avoid segmentation faults in BWA" /> + </when> + </conditional> </inputs> + <outputs> <data format="txt" name="report" label="${tool.name} on ${on_string} (Report)" /> <data format="input" name="output" metadata_source="input"/> @@ -117,6 +201,10 @@ <filter>(output_params['output_type'] == "additional")</filter> <filter>(output_params['rest_file'] is True)</filter> </data> + <data format="txt" name="wild_output" metadata_source="input" label="${tool.name} on ${on_string} (Wildcard File)" > + <filter>(output_params['output_type'] == "additional")</filter> + <filter>(output_params['wild_file'] is True)</filter> + </data> <data format="input" name="too_short_output" metadata_source="input" label="${tool.name} on ${on_string} (Too Short Reads)" > <filter>(output_params['output_type'] == "additional")</filter> <filter>(output_params['too_short_file'] is True)</filter> @@ -134,16 +222,25 @@ <param name="adapter" value=""/> <param name="anywhere_adapter_source_list" value="user"/> <param name="anywhere_adapter" value="TTAGACATATCTCCGTCG"/> + <param name="front_adapter_source_list" value="user"/> + <param name="front_adapter" value=""/> + <param name="output_filtering" value="default"/> + <param name="read_modification" value="none"/> <param name="output_type" value="default"/> <output name="output" file="cutadapt_small.out"/> </test> +<!-- Unable to get tests to function with conditional parameters <test> <param name="input" value="cutadapt_small.fastq" ftype="fastqsanger"/> <param name="adapter_source_list" value="user"/> <param name="adapter" value="TTAGACATATCTCCGTCG"/> <param name="anywhere_adapter_source_list" value="user"/> <param name="anywhere_adapter" value=""/> + <param name="front_adapter_source_list" value="user"/> + <param name="front_adapter" value=""/> + <param name="output_filtering" value="filter"/> <param name="discard" value="true"/> + <param name="read_modification" value="none"/> <param name="output_type" value="default"/> <output name="output" file="cutadapt_discard.out"/> </test> @@ -153,14 +250,21 @@ <param name="adapter" value="ADAPTER"/> <param name="anywhere_adapter_source_list" value="user"/> <param name="anywhere_adapter" value=""/> + <param name="front_adapter_source_list" value="user"/> + <param name="front_adapter" value=""/> + <param name="output_filtering" value="default"/> + <param name="read_modification" value="none"/> <param name="output_type" value="additional"/> <param name="rest_file" value="true"/> <output name="output" file="cutadapt_rest.out"/> <output name="rest_output" file="cutadapt_rest2.out"/> </test> +--> </tests> <help> +Summary +------- This tool removes adapter sequences from DNA high-throughput sequencing data. This is usually necessary when the read length of the machine is longer than the molecule that is sequenced, such as in @@ -170,60 +274,90 @@ ----- -**Algorithm** +Algorithm +--------- cutadapt uses a simple semi-global alignment algorithm, without any special optimizations. -For speed, the algorithm is implemented as a Python extension module in calignmodule.c. +For speed, the algorithm is implemented as a Python extension module in ``calignmodule.c``. -**Partial adapter matches** +Partial adapter matches +----------------------- Cutadapt correctly deals with partial adapter matches. As an example, suppose -your adapter sequence is "ADAPTER" (specified via 3' Adapters parameter). -If you have these input sequences: - -:: +your adapter sequence is ``ADAPTER`` (specified via 3' Adapters parameter). +If you have these input sequences:: MYSEQUENCEADAPTER MYSEQUENCEADAP MYSEQUENCEADAPTERSOMETHINGELSE -All of them will be trimmed to "MYSEQUENCE". If the sequence starts with an -adapter, like this: - -:: +All of them will be trimmed to ``MYSEQUENCE``. If the sequence starts with an +adapter, like this:: ADAPTERSOMETHING It will be empty after trimming. When the allowed error rate is sufficiently high, errors in -the adapter sequence are allowed. For example, ADABTER (1 mismatch), ADAPTR (1 deletion), -and ADAPPTER (1 insertion) will all be recognized if the error rate is set to 0.15. +the adapter sequence are allowed. For example, ``ADABTER`` (1 mismatch), ``ADAPTR`` (1 deletion), +and ``ADAPPTER`` (1 insertion) will all be recognized if the error rate is set to 0.15. -**Allowing adapters anywhere** +Anchoring 5' adapters +--------------------- + +If you specify a 5' (Front) adapter, the adapter may overlap the beginning of the read or +occur anywhere whithin it. If it appears withing the read, the sequence that precedes it +will also be trimmed in addition to the adapter. For example when the adapter sequence is +``ADAPTER``:: + + HELLOADAPTERTHERE + APTERTHERE -Cutadapt assumes that any adapter specified via the *3` Adapters* parameter -was ligated to the 3' end of the sequence. This is the correct assumption for +will both be trimmed to ``THERE``. To avoid this, you can prefix the adapter with the character +``^``. This will restrict the search, forcing the adapter to be a prefix of the read. With +the adapter sequence set to ``^ADAPTER``, only reads like this will be trimmed:: + + ADAPTERHELLO + + +Allowing adapters anywhere +-------------------------- + +Cutadapt assumes that any adapter specified via the 3' Adapter parameter +was ligated to the 3\' end of the sequence. This is the correct assumption for at least the SOLiD and Illumina small RNA protocols and probably others. +The assumption is enforced by the alignment algorithm, which only finds the adapter +when its starting position is within the read. In other words, the 5' base of +the adapter must appear within the read. The adapter and all bases following +it are remved. If, on the other hand, your adapter can also be ligated to the 5' end (on -purpose or by accident), you should tell cutadapt so by using the *5' or 3' (Anywhere) -Adapters* parameter. It will then use a different alignment algorithm and -correctly trim adapters that appear in the beginning of a read. An adapter -specified this way will also be found if it appears only partially in the -beginning of a read. For example, these sequences +purpose or by accident), you should tell cutadapt so by using the Anywhere Adapter +parameter. It will then use a slightly different alignment algorithm +(so-called semiglobal alignment), which allows any type of overlap between the +adapter and the sequence. In particular, the adapter may appear only partially +in the beginning of the read, like this:: -:: + PTERMYSEQUENCE - ADAPTERMYSEQUENCE - PTERMYSEQUENCE +The decision which part of the read to remove is made as follows: If there is at +least one base before the found adapter, then the adapter is considered to be +a 3' adapter and the adapter itself and everything following it is removed. +Otherwise, the adapter is considered to be a 5' adapter and it is removed from +the read. -will be trimmed to "MYSEQUENCE". Note that the regular algorithm would trim -the first read to an empty sequence. +Here are some examples, which may make this clearer (left: read, right: trimmed +read):: -This parameter currently does not work with color space data. + MYSEQUENCEADAPTER -> MYSEQUENCE (3' adapter) + MADAPTER -> M (3' adapter) + ADAPTERMYSEQUENCE -> MYSEQUENCE (5' adapter) + PTERMYSEQUENCE -> MYSEQUENCE (5' adapter) + +The regular algorithm (3' Adapter) would trim the first two examples in the same way, +but trim the third to an empty sequence and trim the fourth not at all. .. _cutadapt: http://code.google.com/p/cutadapt/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Nov 26 17:37:26 2012 -0500 @@ -0,0 +1,17 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="cutadapt" version="1.1"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://pypi.python.org/packages/source/c/cutadapt/cutadapt-1.1.tar.gz</action> + <action type="shell_command">python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin</action> + <action type="set_environment"> + <environment_variable name="PYTHONPATH" action="append_to">$INSTALL_DIR/lib/python</environment_variable> + <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable> + </action> + </actions> + </install> + <readme> + </readme> + </package> +</tool_dependency>