annotate tools/sr_assembly/velveth.xml @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 <tool id="velveth" name="velveth" version="1.0.0">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 <description>Prepare a dataset for the Velvet velvetg Assembler</description>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 <command interpreter="python">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 velveth_wrapper.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 '$out_file1' '$out_file1.extra_files_path'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 $hash_length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 $strand_specific
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 #for $i in $inputs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 ${i.file_format}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 ${i.read_type}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 ${i.input}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 #end for
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 </command>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 <inputs>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 <param label="Hash Length" name="hash_length" type="select" help="k-mer length in base pairs of the words being hashed.">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 <option value="11">11</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 <option value="13">13</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 <option value="15">15</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 <option value="17">17</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 <option value="19">19</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 <option value="21" selected="yes">21</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 <option value="23">23</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 <option value="25">25</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 <option value="27">27</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 <option value="29">29</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 </param>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 <param name="strand_specific" type="boolean" checked="false" truevalue="-strand_specific" falsevalue="" label="Use strand specific transcriptome sequencing" help="If you are using a strand specific transcriptome sequencing protocol, you may wish to use this option for better results."/>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 <repeat name="inputs" title="Input Files">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 <param label="file format" name="file_format" type="select">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 <option value="-fasta" selected="yes">fasta</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 <option value="-fastq">fastq</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 <option value="-eland">eland</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 <option value="-gerald">gerald</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 </param>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 <param label="read type" name="read_type" type="select">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 <option value="-short" selected="yes">short reads</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 <option value="-shortPaired">shortPaired reads</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 <option value="-short2">short2 reads</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 <option value="-shortPaired2">shortPaired2 reads</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 <option value="-long">long reads</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 <option value="-longPaired">longPaired reads</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 </param>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 <param name="input" type="data" format="fasta,fastq,eland,gerald" label="Dataset"/>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 </repeat>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 </inputs>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 <outputs>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 <data format="velvet" name="out_file1" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 </outputs>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 <requirements>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 <requirement type="package">velvet</requirement>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 </requirements>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 <tests>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 <test>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 <param name="hash_length" value="21" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 <param name="read_type" value="-shortPaired" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 <!-- <repeat name="inputs"> -->
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 <param name="file_format" value="fasta" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 <param name="read_type" value="shortPaired reads" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 <param name="input" value="velvet_test_reads.fa" ftype="fasta" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 <!-- </repeat> -->
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 <param name="strand_specific" value="" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 <output name="out_file1" file="velveth_test1/output.html" lines_diff="4">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 <extra_files type="file" name='Sequences' value="velveth_test1/Sequences" compare="diff" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 <extra_files type="file" name='Roadmaps' value="velveth_test1/Roadmaps" compare="diff" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 </output>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 </test>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 </tests>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 <help>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 **Velvet Overview**
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 Velvet_ is a de novo genomic assembler specially designed for short read sequencing technologies, such as Solexa or 454, developed by Daniel Zerbino and Ewan Birney at the European Bioinformatics Institute (EMBL-EBI), near Cambridge, in the United Kingdom.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 Velvet currently takes in short read sequences, removes errors then produces high quality unique contigs. It then uses paired-end read and long read information, when available, to retrieve the repeated areas between contigs.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 Read the Velvet `documentation`__ for details on using the Velvet Assembler.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 .. _Velvet: http://www.ebi.ac.uk/~zerbino/velvet/
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 .. __: http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 ------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 **Velveth**
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 Velveth takes in a number of sequence files, produces a hash table, then outputs two files, Sequences and Roadmaps, in an output directory (creating it if necessary); both files are required by velvetg.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 ------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 **Hash Length**
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 The hash length, also known as k-mer length, corresponds to the length, in base pairs, of the words being hashed.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 The hash length is the length of the k-mers being entered in the hash table. Firstly, you must observe three technical constraints::
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 # it must be an odd number, to avoid palindromes. If you put in an even number, Velvet will just decrement it and proceed.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 # it must be below or equal to MAXKMERHASH length (cf. 2.3.3, by default 31bp), because it is stored on 64 bits
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 # it must be strictly less than the read length, otherwise you simply will not observe any overlaps between reads, for obvious reasons.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 Now you still have quite a lot of possibilities. As is often the case, it's a trade-off between specificity and sensitivity. Longer k-mers bring you more specificity (i.e. fewer spurious overlaps) but lower coverage (cf. below), so there's a sweet spot to be found with time and experience.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 We like to think in terms of "k-mer coverage", i.e. how many times a k-mer has been seen among the reads. The relation between k-mer coverage Ck and standard (nucleotide-wise) coverage C is Ck = C * (L - k + 1) / L, where k is your hash length and L is your read length.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 Experience shows that this k-mer coverage should be above 10 to start getting decent results. If Ck is above 20, you might be "wasting" coverage. Experience also shows that empirical tests with different values for k are not that costly to run!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 **Input Files**
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 Velvet works mainly with fasta and fastq formats. For paired-end reads, the assumption is that each read is next to its mate
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 read. In other words, if the reads are indexed from 0, then reads 0 and 1 are paired, 2 and 3, 4 and 5, etc.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 Supported file formats are::
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 fasta
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 fastq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 fasta.gz
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 fastq.gz
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 eland
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 gerald
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 Read categories are::
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 short (default)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 shortPaired
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 short2 (same as short, but for a separate insert-size library)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 shortPaired2 (see above)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 long (for Sanger, 454 or even reference sequences)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 longPaired
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 </help>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 </tool>