Mercurial > repos > devteam > hisat
comparison hisat.xml @ 0:e3fe061597ac draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/hisat commit 5a7365750648c26206f05ac7956936c243c2b980
author | devteam |
---|---|
date | Thu, 11 Jun 2015 16:35:16 -0400 |
parents | |
children | 8bb3efa0dae8 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e3fe061597ac |
---|---|
1 <tool id="hisat" name="HISAT" version="1.0.1"> | |
2 <description>spliced aligner</description> | |
3 <macros> | |
4 <import>hisat_macros.xml</import> | |
5 </macros> | |
6 <requirements> | |
7 <requirement type="package" version="0.1.6">hisat</requirement> | |
8 <requirement type="package" version="1.2">samtools</requirement> | |
9 </requirements> | |
10 <stdio> | |
11 <exit_code range="1:" /> | |
12 </stdio> | |
13 <version_command>hisat --version</version_command> | |
14 <command><![CDATA[ | |
15 #if str($spliced_options.spliced_options_selector) == "advanced" and str($spliced_options.known_splice_gff) != 'None': | |
16 ln -s "${spliced_options.known_splice_gff}" splice_sites.gff && | |
17 extract_splice_sites.py splice_sites.gff > splice_sites.txt && | |
18 #end if | |
19 #if $reference_genome.reference_genome_source == "history": | |
20 ln -s "$reference_genome.history_item" genome.fa && | |
21 hisat-build genome.fa genome && | |
22 #set index_path = 'genome' | |
23 #else: | |
24 #set index_path = $reference_genome.index.fields.path | |
25 #end if | |
26 hisat -p \${GALAXY_SLOTS:-1} -x "${index_path}" | |
27 #if str($input_format.paired.paired_selector) == 'paired': | |
28 -1 "${reads_f}" -2 "${reads_r}" | |
29 #else if str($input_format.paired.paired_selector) == 'paired_collection': | |
30 -1 "${input_format.paired.reads.forward}" -2 "${input_format.paired.reads.reverse}" | |
31 #else if str($input_format.paired.paired_selector) == 'paired_list': | |
32 #set forward_reads = [] | |
33 #set reverse_reads = [] | |
34 #for read in $input_format.paired.reads: | |
35 $forward_reads.append($read.forward) | |
36 $reverse_reads.append($read.reverse) | |
37 #end for | |
38 -1 "${','.join([str($read) for read in $forward_reads])}" | |
39 -2 "${','.join([str($read) for read in $reverse_reads])}" | |
40 #else: | |
41 -U "${reads}" | |
42 #end if | |
43 #if $input_format.input_format_selector == 'fasta': | |
44 -f | |
45 #end if | |
46 #if str($input_options.input_options_selector) == "advanced": | |
47 -s ${input_options.skip} -u ${input_options.stop_after} -5 ${input_options.trim_five} -3 ${input_options.trim_three} | |
48 #end if | |
49 #if str($scoring_options.scoring_options_selector) == "advanced": | |
50 --ma ${scoring_options.match_bonus} --mp ${scoring_options.max_mismatch},${scoring_options.min_mismatch} | |
51 --np ${scoring_options.ambiguous_penalty} --rdg ${scoring_options.read_open_penalty},${scoring_options.read_extend_penalty} | |
52 --rfg ${scoring_options.ref_open_penalty},${scoring_options.ref_extend_penalty} | |
53 #end if | |
54 #if str($alignment_options.alignment_options_selector) == "advanced": | |
55 --n-ceil ${alignment_options.function_type},${alignment_options.constant_term},${alignment_options.coefficient_term} | |
56 ${alignment_options.skip_forward} ${alignment_options.skip_reverse} | |
57 #end if | |
58 #if str($spliced_options.spliced_options_selector) == "advanced": | |
59 --pen-cansplice ${spliced_options.canonical_penalty} --pen-noncansplice ${spliced_options.noncanonical_penalty} | |
60 --pen-intronlen ${spliced_options.function_type},${spliced_options.constant_term},${spliced_options.coefficient_term} | |
61 #if str($spliced_options.known_splice_gff) != 'None': | |
62 --known-splicesite-infile splice_sites.txt | |
63 #end if | |
64 #end if | |
65 | samtools view -bS - | samtools sort - -o hsbam > "${output_alignments}" | |
66 ]]></command> | |
67 <inputs> | |
68 <conditional name="input_format"> | |
69 <param name="input_format_selector" label="Input data format" type="select"> | |
70 <option value="fastq" selected="selected">FASTQ</option> | |
71 <option value="fasta">FASTA</option> | |
72 </param> | |
73 <when value="fasta"> | |
74 <conditional name="paired"> | |
75 <expand macro="single_paired_selector" /> | |
76 <when value="paired_collection"> | |
77 <param format="fasta" name="reads" type="data_collection" collection_type="paired" label="Paired reads" /> | |
78 </when> | |
79 <when value="paired_list"> | |
80 <param format="fasta" name="reads" type="data_collection" collection_type="list:paired" label="Paired reads" /> | |
81 </when> | |
82 <when value="paired"> | |
83 <param label="Forward reads" type="data" name="reads_f" multiple="true" format="fasta" /> | |
84 <param label="Reverse reads" type="data" name="reads_r" multiple="true" format="fasta" /> | |
85 </when> | |
86 <when value="single"> | |
87 <param label="Reads" type="data" name="reads" multiple="true" format="fasta" /> | |
88 </when> | |
89 </conditional> | |
90 </when> | |
91 <when value="fastq"> | |
92 <conditional name="paired"> | |
93 <expand macro="single_paired_selector" /> | |
94 <when value="paired_collection"> | |
95 <param format="fastq,fastqsanger,fastqsolexa" name="reads" type="data_collection" collection_type="paired" label="Paired reads" /> | |
96 </when> | |
97 <when value="paired_list"> | |
98 <param format="fastq,fastqsanger,fastqsolexa" name="reads" type="data_collection" collection_type="list:paired" label="Paired reads" /> | |
99 </when> | |
100 <when value="paired"> | |
101 <param label="Forward reads" type="data" name="reads_f" multiple="true" format="fastq,fastqsanger,fastqsolexa" /> | |
102 <param label="Reverse reads" type="data" name="reads_r" multiple="true" format="fastq,fastqsanger,fastqsolexa" /> | |
103 </when> | |
104 <when value="single"> | |
105 <param label="Reads" type="data" name="reads" multiple="true" format="fastq,fastqsanger,fastqsolexa" /> | |
106 </when> | |
107 </conditional> | |
108 </when> | |
109 </conditional> | |
110 <conditional name="reference_genome"> | |
111 <param name="reference_genome_source" type="select" label="Source for the reference genome to align against" help="Built-in references were created using default options"> | |
112 <option value="indexed" selected="True">Use a built-in genome</option> | |
113 <option value="history">Use a genome from history</option> | |
114 </param> | |
115 <when value="indexed"> | |
116 <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> | |
117 <options from_data_table="hisat_indexes"> | |
118 <filter type="sort_by" column="2"/> | |
119 <validator type="no_options" message="No genomes are available for the selected input dataset"/> | |
120 </options> | |
121 </param> | |
122 </when> | |
123 <when value="history"> | |
124 <param name="history_item" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" /> | |
125 </when> | |
126 </conditional> | |
127 <conditional name="alignment_options"> | |
128 <param label="Alignment options" name="alignment_options_selector" type="select"> | |
129 <option value="defaults">Use default values</option> | |
130 <option value="advanced">Specify alignment parameters</option> | |
131 </param> | |
132 <when value="defaults" /> | |
133 <when value="advanced"> | |
134 <expand macro="function"> | |
135 <label>Sets a function governing the maximum number of ambiguous characters</label> | |
136 </expand> | |
137 <param name="ignore_quals" label="Ignore quality values" type="boolean" truevalue="--ignore-quals" falsevalue="" help="(--ignore-quals) When calculating a mismatch penalty, always consider the quality value at the mismatched position to be the highest possible, regardless of the actual value. I.e. input is treated as though all quality values are high. This is also the default behavior when the input doesn't specify quality values." /> | |
138 <param name="skip_forward" label="Skip forward strand of reference" type="boolean" truevalue="--nofw" falsevalue="" help="(--nofw) If --nofw is specified, hisat will not attempt to align unpaired reads to the forward (Watson) reference strand. In paired-end mode, --nofw and --norc pertain to the fragments; i.e. specifying --nofw causes hisat to explore only those paired-end configurations corresponding to fragments from the reverse-complement (Crick) strand." /> | |
139 <param name="skip_reverse" label="Skip reverse strand of reference" type="boolean" truevalue="--norc" falsevalue="" help="(--norc) If --norc is specified, hisat will not attempt to align unpaired reads against the reverse-complement (Crick) reference strand. In paired-end mode, --nofw and --norc pertain to the fragments; i.e. specifying --nofw causes hisat to explore only those paired-end configurations corresponding to fragments from the reverse-complement (Crick) strand." /> | |
140 </when> | |
141 </conditional> | |
142 <conditional name="input_options"> | |
143 <param label="Input options" name="input_options_selector" type="select"> | |
144 <option value="defaults">Use default values</option> | |
145 <option value="advanced">Specify input parameters</option> | |
146 </param> | |
147 <when value="defaults" /> | |
148 <when value="advanced"> | |
149 <param name="skip" label="Skip the first N reads or pairs in the input" help="-s" type="integer" value="0" default="0" /> | |
150 <param name="stop_after" label="Stop after aligning N reads" help="(-u) Align the first N reads or read pairs from the input (after the first N reads or pairs have been skipped), then stop." type="integer" value="0" default="0" /> | |
151 <param name="trim_five" label="Trim 5' end" help="(-5) Trim N bases from 5' (left) end of each read before alignment" type="integer" value="0" default="0" /> | |
152 <param name="trim_three" label="Trim 3' end" help="(-3) Trim N bases from 3' (right) end of each read before alignment" type="integer" value="0" default="0" /> | |
153 </when> | |
154 </conditional> | |
155 <conditional name="scoring_options"> | |
156 <param label="Scoring options" name="scoring_options_selector" type="select"> | |
157 <option value="defaults">Use default values</option> | |
158 <option value="advanced">Specify scoring parameters</option> | |
159 </param> | |
160 <when value="defaults" /> | |
161 <when value="advanced"> | |
162 <param name="match_bonus" label="Set match bonus" help="(--ma) In local mode N is added to the alignment score for each position where a read character aligns to a reference character and the characters match. Not used in end-to-end mode" type="integer" value="2" default="2" /> | |
163 <param name="max_mismatch" label="Maximum mismatch penalty" help="(--mp) Sets the maximum mismatch penalty. A number less than or equal to MX and greater than or equal to MN is subtracted from the alignment score for each position where a read character aligns to a reference character, the characters do not match, and neither is an N. If --ignore-quals is specified, the number subtracted equals MX. Otherwise, the number subtracted is MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) ) where Q is the Phred quality value." type="integer" value="6" default="6" /> | |
164 <param name="min_mismatch" label="Minimum mismatch penalty" help="(--mp) Sets the minimum mismatch penalty. A number less than or equal to MX and greater than or equal to MN is subtracted from the alignment score for each position where a read character aligns to a reference character, the characters do not match, and neither is an N. If --ignore-quals is specified, the number subtracted equals MX. Otherwise, the number subtracted is MN + floor( (MX-MN)(MIN(Q, 40.0)/40.0) ) where Q is the Phred quality value." type="integer" value="2" default="2" /> | |
165 <param name="ambiguous_penalty" label="Ambiguous read penalty" help="(--np) Sets penalty for positions where the read, reference, or both, contain an ambiguous character such as N" type="integer" value="1" default="1" /> | |
166 <param name="read_open_penalty" label="Read gap open penalty." type="integer" help="(--rdg) A read gap of length N gets a penalty of [open_penalty] + N * [extend_penalty]" value="5" default="5" /> | |
167 <param name="read_extend_penalty" label="Read gap extend penalty." type="integer" help="(--rdg) A read gap of length N gets a penalty of [open_penalty] + N * [extend_penalty]" value="3" default="3" /> | |
168 <param name="ref_open_penalty" label="Reference gap open penalty." help="(--rfg) A reference gap of length N gets a penalty of [open_penalty] + N * [extend_penalty]" type="integer" value="5" default="5" /> | |
169 <param name="ref_extend_penalty" label="Reference gap extend penalty." help="(--rfg) A reference gap of length N gets a penalty of [open_penalty] + N * [extend_penalty]" type="integer" value="3" default="3" /> | |
170 </when> | |
171 </conditional> | |
172 <conditional name="spliced_options"> | |
173 <param label="Spliced alignment parameters" name="spliced_options_selector" type="select"> | |
174 <option value="defaults">Use default values</option> | |
175 <option value="advanced">Specify spliced alignment parameters</option> | |
176 </param> | |
177 <when value="defaults" /> | |
178 <when value="advanced"> | |
179 <param name="canonical_penalty" label="Penalty for canonical splice sites" type="integer" value="0" /> | |
180 <param name="noncanonical_penalty" label="Penalty for non-canonical splice sites" type="integer" value="3" /> | |
181 <expand macro="function"> | |
182 <label>Sets the penalty for long introns so that alignments with shorter introns are preferred to those with longer ones.</label> | |
183 </expand> | |
184 <param name="min_intron" label="Minimum intron length" type="integer" value="20" /> | |
185 <param name="max_intron" label="Maximum intron length" type="integer" value="500000" /> | |
186 <param name="known_splice_gff" label="GTF/GFF file with known splice sites" type="data" format="gff" optional="True" /> | |
187 </when> | |
188 </conditional> | |
189 </inputs> | |
190 <tests> | |
191 <test> | |
192 <param name="input_format_selector" value="fastq" /> | |
193 <param name="paired_selector" value="paired" /> | |
194 <param name="reference_genome_source" value="history" /> | |
195 <param name="history_item" value="phiX.fa" ftype="fasta" /> | |
196 <param name="reads_f" ftype="fastqsanger" value="hisat_input_1_forward.fastq" /> | |
197 <param name="reads_r" ftype="fastqsanger" value="hisat_input_1_reverse.fastq" /> | |
198 <output name="output_alignments" ftype="bam" file="hisat_output_1.bam" /> | |
199 </test> | |
200 <test> | |
201 <param name="input_format_selector" value="fastq" /> | |
202 <param name="paired_selector" value="paired" /> | |
203 <param name="reference_genome_source" value="history" /> | |
204 <param name="history_item" value="phiX.fa" ftype="fasta" /> | |
205 <param name="input_options_selector" value="advanced" /> | |
206 <param name="trim_three" value="15" /> | |
207 <param name="trim_five" value="15" /> | |
208 <param name="reads_f" ftype="fastqsanger" value="hisat_input_2_forward.fastq" /> | |
209 <param name="reads_r" ftype="fastqsanger" value="hisat_input_2_reverse.fastq" /> | |
210 <output name="output_alignments" ftype="bam" file="hisat_output_2.bam" /> | |
211 </test> | |
212 </tests> | |
213 <outputs> | |
214 <data name="output_alignments" format="bam" /> | |
215 </outputs> | |
216 <help><![CDATA[ | |
217 Introduction | |
218 ============ | |
219 | |
220 What is HISAT? | |
221 -------------- | |
222 | |
223 `HISAT <http://ccb.jhu.edu/software/hisat>`__ is a fast and sensitive | |
224 spliced alignment program. As part of HISAT, we have developed a new | |
225 indexing scheme based on the Burrows-Wheeler transform | |
226 (`BWT <http://en.wikipedia.org/wiki/Burrows-Wheeler_transform>`__) and | |
227 the `FM index <http://en.wikipedia.org/wiki/FM-index>`__, called | |
228 hierarchical indexing, that employs two types of indexes: (1) one global | |
229 FM index representing the whole genome, and (2) many separate local FM | |
230 indexes for small regions collectively covering the genome. Our | |
231 hierarchical index for the human genome (about 3 billion bp) includes | |
232 ~48,000 local FM indexes, each representing a genomic region of | |
233 ~64,000bp. As the basis for non-gapped alignment, the FM index is | |
234 extremely fast with a low memory footprint, as demonstrated by | |
235 `Bowtie <http://bowtie-bio.sf.net>`__. In addition, HISAT provides | |
236 several alignment strategies specifically designed for mapping different | |
237 types of RNA-seq reads. Taken together, these let HISAT achieve extremely fast | |
238 and sensitive alignment of reads, in particular those spanning two exons | |
239 or more. As a result, HISAT is much faster (>50 times) than | |
240 `TopHat2 <http://ccb.jhu.edu/software/tophat>`__ with better alignment | |
241 quality. Although it uses a large number of indexes, the memory | |
242 requirement of HISAT is still modest, approximately 4.3 GB for human. | |
243 HISAT uses the `Bowtie2 <http://bowtie-bio.sf.net/bowtie2>`__ | |
244 implementation to handle most of the operations on the FM index. In | |
245 addition to spliced alignment, HISAT handles reads involving indels and | |
246 supports a paired-end alignment mode. Multiple processors can be used | |
247 simultaneously to achieve greater alignment speed. HISAT outputs | |
248 alignments in `SAM <http://samtools.sourceforge.net/SAM1.pdf>`__ format, | |
249 enabling interoperation with a large number of other tools (e.g. | |
250 `SAMtools <http://samtools.sourceforge.net>`__, | |
251 `GATK <http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit>`__) | |
252 that use SAM. HISAT is distributed under the `GPLv3 | |
253 license <http://www.gnu.org/licenses/gpl-3.0.html>`__, and it runs on | |
254 the command line under Linux, Mac OS X and Windows. | |
255 | |
256 Running HISAT | |
257 ============= | |
258 | |
259 Reporting | |
260 --------- | |
261 | |
262 The reporting mode governs how many alignments HISAT looks for, and how | |
263 to report them. | |
264 | |
265 In general, when we say that a read has an alignment, we mean that it | |
266 has a `valid | |
267 alignment <#valid-alignments-meet-or-exceed-the-minimum-score-threshold>`__. | |
268 When we say that a read has multiple alignments, we mean that it has | |
269 multiple alignments that are valid and distinct from one another. | |
270 | |
271 Distinct alignments map a read to different places | |
272 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
273 | |
274 Two alignments for the same individual read are "distinct" if they map | |
275 the same read to different places. Specifically, we say that two | |
276 alignments are distinct if there are no alignment positions where a | |
277 particular read offset is aligned opposite a particular reference offset | |
278 in both alignments with the same orientation. E.g. if the first | |
279 alignment is in the forward orientation and aligns the read character at | |
280 read offset 10 to the reference character at chromosome 3, offset | |
281 3,445,245, and the second alignment is also in the forward orientation | |
282 and also aligns the read character at read offset 10 to the reference | |
283 character at chromosome 3, offset 3,445,245, they are not distinct | |
284 alignments. | |
285 | |
286 Two alignments for the same pair are distinct if either the mate 1s in | |
287 the two paired-end alignments are distinct or the mate 2s in the two | |
288 alignments are distinct or both. | |
289 | |
290 Default mode: search for one or more alignments, report each | |
291 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
292 | |
293 HISAT searches for up to N distinct, primary alignments for each read, | |
294 where N equals the integer specified with the ``-k`` parameter. Primary | |
295 alignments mean alignments whose alignment score is equal or higher than | |
296 any other alignments. It is possible that multiple distinct alignments | |
297 have the same score. That is, if ``-k 2`` is specified, HISAT will | |
298 search for at most 2 distinct alignments. The alignment score for a | |
299 paired-end alignment equals the sum of the alignment scores of the | |
300 individual mates. Each reported read or pair alignment beyond the first | |
301 has the SAM 'secondary' bit (which equals 256) set in its FLAGS field. | |
302 See the `SAM specification <http://samtools.sourceforge.net/SAM1.pdf>`__ | |
303 for details. | |
304 | |
305 HISAT does not "find" alignments in any specific order, so for reads | |
306 that have more than N distinct, valid alignments, HISAT does not | |
307 guarantee that the N alignments reported are the best possible in terms | |
308 of alignment score. Still, this mode can be effective and fast in | |
309 situations where the user cares more about whether a read aligns (or | |
310 aligns a certain number of times) than where exactly it originated. | |
311 | |
312 Alignment summary | |
313 ----------------- | |
314 | |
315 When HISAT finishes running, it prints messages summarizing what | |
316 happened. These messages are printed to the "standard error" ("stderr") | |
317 filehandle. For datasets consisting of unpaired reads, the summary might | |
318 look like this: | |
319 | |
320 :: | |
321 | |
322 20000 reads; of these: | |
323 20000 (100.00%) were unpaired; of these: | |
324 1247 (6.24%) aligned 0 times | |
325 18739 (93.69%) aligned exactly 1 time | |
326 14 (0.07%) aligned >1 times | |
327 93.77% overall alignment rate | |
328 | |
329 For datasets consisting of pairs, the summary might look like this: | |
330 | |
331 :: | |
332 | |
333 10000 reads; of these: | |
334 10000 (100.00%) were paired; of these: | |
335 650 (6.50%) aligned concordantly 0 times | |
336 8823 (88.23%) aligned concordantly exactly 1 time | |
337 527 (5.27%) aligned concordantly >1 times | |
338 ---- | |
339 650 pairs aligned concordantly 0 times; of these: | |
340 34 (5.23%) aligned discordantly 1 time | |
341 ---- | |
342 616 pairs aligned 0 times concordantly or discordantly; of these: | |
343 1232 mates make up the pairs; of these: | |
344 660 (53.57%) aligned 0 times | |
345 571 (46.35%) aligned exactly 1 time | |
346 1 (0.08%) aligned >1 times | |
347 96.70% overall alignment rate | |
348 | |
349 The indentation indicates how subtotals relate to totals. | |
350 ]]></help> | |
351 <citations> | |
352 <citation type="doi">10.1038/nmeth.3317</citation> | |
353 </citations> | |
354 </tool> |