annotate scythe/scythe.xml @ 2:8e5357ca8ebd draft

Uploaded
author nikhil-joshi
date Tue, 06 Aug 2013 23:13:27 -0400
parents 8161274941bf
children 0a70eb1e6432
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
1 <tool id="scythe" name="Scythe">
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
2 <description>Trimming adapters/contaminants using a Naive Bayesian classifier</description>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
3
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
4 <command>
2
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
5 scythe --quiet -a $adapter_file
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
6
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
7 #if $input_fastq.ext == "fastq":
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
8 -q sanger
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
9 #else if $input_fastq.ext == "fastqsanger":
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
10 -q sanger
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
11 #else if $input_fastq.ext == "fastqillumina":
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
12 -q illumina
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
13 #else if $input_fastq.ext == "fastqsolexa":
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
14 -q solexa
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
15 #end if
0
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
16
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
17 #if str($add_tag) == "add_tag_true":
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
18 -t
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
19 #end if
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
20
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
21 #if str($prior) != "":
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
22 -p $prior
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
23 #end if
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
24
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
25 #if str($min_match) != "":
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
26 -n $min_match
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
27 #end if
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
28
2
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
29 #if str($min_keep) != "":
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
30 -M $min_keep
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
31 #end if
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
32
0
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
33 #if str($matches_file) == "matches_file_true":
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
34 -m $output_matches
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
35 #end if
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
36
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
37 -o $output_trimmed $input_fastq 2> /dev/null
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
38 </command>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
39
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
40 <inputs>
2
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
41 <param format="fastq, fastqsanger, fastqillumina, fastqsolexa" name="input_fastq" type="data" optional="false" label="FastQ Reads" help="Note: Scythe will infer the quality type of the file from its datatype. I.e., if the datatype is fastqsanger, then the quality type is sanger. The default is fastqsanger."/>
0
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
42
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
43 <param format="fasta" name="adapter_file" type="data" optional="false" label="Adapter/Contaminant file (in fasta format)"/>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
44
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
45 <param name="add_tag" type="boolean" checked="false" truevalue="add_tag_true" falsevalue="add_tag_false" label="Add a tag to the header indicating that Scythe cut a sequence?"/>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
46
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
47 <param name="matches_file" type="boolean" checked="false" truevalue="matches_file_true" falsevalue="matches_file_false" label="Also output another file with details about adapter/contaminant matches?"/>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
48
2
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
49 <param name="prior" value="0.3" type="float" optional="true" label="Prior" help="The prior contamination rate">
0
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
50 <validator type="in_range" min="0" message="Minimum value is 0"/>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
51 </param>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
52
2
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
53 <param name="min_match" value="5" type="integer" optional="true" label="Smallest length adapter/contaminant to consider">
0
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
54 <validator type="in_range" min="0" message="Minimum value is 0"/>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
55 </param>
2
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
56
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
57 <param name="min_keep" value="35" type="integer" optional="true" label="Filter sequences less than this length (after trimming)">
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
58 <validator type="in_range" min="0" message="Minimum value is 0"/>
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
59 </param>
0
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
60 </inputs>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
61
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
62 <outputs>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
63 <data format_source="input_fastq" name="output_trimmed" label="Adapter/Contaminant Trimmed FastQ using ${tool.name} on ${on_string}"/>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
64
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
65 <data format="txt" name="output_matches" label="Matches of Adapters/Contaminants using ${tool.name} on ${on_string}">
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
66 <filter>(matches_file == True)</filter>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
67 </data>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
68 </outputs>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
69
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
70 <help>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
71 Scythe uses a Naive Bayesian approach to classify contaminant substrings in sequence reads. It considers quality information, which can make it robust in picking out 3'-end adapters, which often include poor quality bases.
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
72
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
73 Most next generation sequencing reads have deteriorating quality towards the 3'-end. It's common for a quality-based trimmer to be employed before mapping, assemblies, and analysis to remove these poor quality bases. However, quality-based trimming could remove bases that are helpful in identifying (and removing) 3'-end adapter contaminants. Thus, it is recommended you run Scythe before quality-based trimming, as part of a read quality control pipeline.
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
74
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
75 The Bayesian approach Scythe uses compares two likelihood models: the probability of seeing the matches in a sequence given contamination, and not given contamination. Given that the read is contaminated, the probability of seeing a certain number of matches and mismatches is a function of the quality of the sequence. Given the read is not contaminated (and is thus assumed to be random sequence), the probability of seeing a certain number of matches and mismatches is chance. The posterior is calculated across both these likelihood models, and the class (contaminated or not contaminated) with the maximum posterior probability is the class selected.
2
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
76
8e5357ca8ebd Uploaded
nikhil-joshi
parents: 0
diff changeset
77 Scythe will infer the quality type from the datatype of the file.
0
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
78 </help>
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
79
8161274941bf Uploaded
nikhil-joshi
parents:
diff changeset
80 </tool>