comparison cutadapt.xml @ 0:8b064ea16722

Initial version with multiple adapter support
author Lance Parsons <lparsons@princeton.edu>
date Fri, 13 May 2011 15:54:01 -0400
parents
children f6b94b76d16b
comparison
equal deleted inserted replaced
-1:000000000000 0:8b064ea16722
1 <tool id="cutadapt" name="Remove adapter sequences" version="0.9.3">
2 <description>from high-throughput sequence data</description>
3 <requirements>
4 <requirement type="python-module">cutadapt</requirement>
5 </requirements>
6
7 <command interpreter="python">cutadapt_galaxy_wrapper.py
8 #if $input.extension.startswith( "fastq"):
9 --format=fastq
10 #else
11 --format=$input.extension
12 #end if
13 #for $a in $adapters
14 -a '${a.adapter_source.adapter}'
15 #end for
16 #for $aa in $anywhere_adapters
17 -b '${aa.anywhere_adapter_source.anywhere_adapter}'
18 #end for
19 -e $error_rate
20 -n $count
21 -O $overlap
22 #if str($min) not in ('', 'None', '0'):
23 -m $min
24 #end if
25 #if str($max) not in ('', 'None', '0'):
26 -M $max
27 #end if
28 --input='$input'
29 --output='$output'
30 > '$report'
31 </command>
32 <inputs>
33 <param format="fastqsanger, fasta" name="input" type="data" optional="false" label="Fastq or Fasta file to trim" length="100"/> <!-- accepts both formats listed in the format attribute; the command template passes fastq or the dataset extension as the format -->
34
35 <repeat name="adapters" title="3' Adapters"> <!-- repeatable group of 3' adapters; the command template emits one -a option per entry -->
36 <conditional name="adapter_source"> <!-- both branches define a param named "adapter", so the command template reads a.adapter_source.adapter whichever source is chosen -->
37 <param name="adapter_source_list" type="select" label="Source" >
38 <option value="prebuilt" selected="true">Standard (select from the list below)</option>
39 <option value="user">Enter custom sequence</option>
40 </param>
41
42 <when value="user">
43 <param name="adapter" size="30" label="Enter custom 3' adapter sequence" type="text" value="AATTGGCC" help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed. If multiple adapters are specified, only the best matching adapter is trimmed."/>
44 </when>
45
46 <when value="prebuilt">
47 <param name="adapter" type="select" label="Choose 3' adapter" help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed. If multiple adapters are specified, only the best matching adapter is trimmed.">
48 <options from_file="fastx_clipper_sequences.txt"> <!-- choices come from this file: column 1 is the displayed name, column 0 the value -->
49 <column name="name" index="1"/>
50 <column name="value" index="0"/>
51 </options>
52 </param>
53 </when>
54 </conditional>
55 </repeat>
56
57 <repeat name="anywhere_adapters" title="5' or 3' (Anywhere) Adapters" help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed."> <!-- repeatable group; the command template emits one -b option per entry -->
58 <conditional name="anywhere_adapter_source"> <!-- both branches define a param named "anywhere_adapter", so the command template reads aa.anywhere_adapter_source.anywhere_adapter whichever source is chosen -->
59 <param name="anywhere_adapter_source_list" type="select" label="Source">
60 <option value="prebuilt" selected="true">Standard (select from the list below)</option>
61 <option value="user">Enter custom sequence</option>
62 </param>
63
64 <when value="user">
65 <param name="anywhere_adapter" size="30" label="Enter custom 5' or 3' adapter sequence" type="text" value="AATTGGCC" help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed."/>
66 </when>
67 <when value="prebuilt">
68 <param name="anywhere_adapter" type="select" label="Choose 5' or 3' adapter" help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed.">
69 <options from_file="fastx_clipper_sequences.txt"> <!-- same adapter list file as the 3' adapter selector: column 1 is the displayed name, column 0 the value -->
70 <column name="name" index="1"/>
71 <column name="value" index="0"/>
72 </options>
73 </param>
74 </when>
75 </conditional>
76 </repeat>
77
78 <param name="error_rate" type="float" min="0" max="1" value="0.1" label="Maximum error rate" help="Maximum allowed error rate (no. of errors divided by the length of the matching region)." /> <!-- always passed to the wrapper as -e -->
79 <param name="count" type="integer" min="1" value="1" label="Match times" help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times." /> <!-- always passed to the wrapper as -n -->
80 <param name="overlap" type="integer" min="1" value="3" label="Minimum overlap length" help="Minimum overlap length. If the overlap between the adapter and the sequence is shorter than LENGTH, the read is not modified." /> <!-- always passed to the wrapper as -O -->
81 <!--<param name="discard" type="boolean" checked="false" label="Discard Trimmed Reads" help="Discard reads that contain the adapter instead of trimming them. Use the 'Minimum overlap length' option in order to avoid throwing away too many randomly matching reads!" />-->
82 <param name="min" type="integer" min="0" optional="true" value="0" label="Minimum length" help="Discard trimmed reads that are shorter than LENGTH. Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no minimum length." /> <!-- the command template omits -m when this is 0 -->
83 <param name="max" type="integer" min="0" optional="true" value="0" label="Maximum length" help="Discard trimmed reads that are longer than LENGTH. Reads that are too long even before adapter removal are also discarded. In colorspace, an initial primer is not counted. Value of 0 means no maximum length." /> <!-- the command template omits -M when this is 0 -->
84 </inputs>
85 <outputs>
86 <data format="txt" name="report" label="${tool.name} on ${on_string} (Report)" /> <!-- filled by redirecting the wrapper's standard output in the command template -->
87 <data format_source="input" name="output" metadata_source="input"/> <!-- written via the output option of the wrapper; datatype and metadata both follow the "input" dataset -->
88 </outputs>
89
90 <tests>
91 <test> <!-- NOTE(review): the output checked below is named out_file1, but this tool declares outputs named report and output; the fa_gc_content fixture names also look borrowed from another tool. Confirm and replace with a cutadapt-specific test. -->
92 <param name="input" value="fa_gc_content_input.fa"/>
93 <output name="out_file1" file="fa_gc_content_output.txt"/>
94 </test>
95 </tests>
96
97 <help>
98 This tool removes adapter sequences from DNA high-throughput
99 sequencing data. This is usually necessary when the read length of the
100 machine is longer than the molecule that is sequenced, such as in
101 microRNA data.
102
103 The tool is based on the open-source cutadapt_ tool.
104
105 -----
106
107 **Algorithm**
108
109 cutadapt uses a simple semi-global alignment algorithm, without any special optimizations.
110 For speed, the algorithm is implemented as a Python extension module in calignmodule.c.
111
112 The program is sufficiently fast for my purposes, but speedups should be simple to achieve.
113
114
115 **Partial adapter matches**
116
117 Cutadapt correctly deals with partial adapter matches. As an example, suppose
118 your adapter sequence is "ADAPTER" (specified via the *3' Adapters* parameter).
119 If you have these input sequences:
120
121 ::
122
123 MYSEQUENCEADAPTER
124 MYSEQUENCEADAP
125 MYSEQUENCEADAPTERSOMETHINGELSE
126
127 All of them will be trimmed to "MYSEQUENCE". If the sequence starts with an
128 adapter, like this:
129
130 ::
131
132 ADAPTERSOMETHING
133
134 It will be empty after trimming.
135
136 When the allowed error rate is sufficiently high, errors in
137 the adapter sequence are allowed. For example, ADABTER (1 mismatch), ADAPTR (1 deletion),
138 and ADAPPTER (1 insertion) will all be recognized if the error rate is set to 0.15.
139
140
141 **Allowing adapters anywhere**
142
143 Cutadapt assumes that any adapter specified via the *3' Adapters* parameter
144 was ligated to the 3' end of the sequence. This is the correct assumption for
145 at least the SOLiD and Illumina small RNA protocols and probably others.
146
147 If, on the other hand, your adapter can also be ligated to the 5' end (on
148 purpose or by accident), you should tell cutadapt so by using the *5' or 3' (Anywhere)
149 Adapters* parameter. It will then use a different alignment algorithm and
150 correctly trim adapters that appear in the beginning of a read. An adapter
151 specified this way will also be found if it appears only partially in the
152 beginning of a read. For example, these sequences
153
154 ::
155
156 ADAPTERMYSEQUENCE
157 PTERMYSEQUENCE
158
159 will be trimmed to "MYSEQUENCE". Note that the regular algorithm would trim
160 the first read to an empty sequence.
161
162 This parameter currently does not work with color space data.
163
164
165 .. _cutadapt: http://code.google.com/p/cutadapt/
166 </help>
167
168 </tool>