comparison arriba.xml @ 0:5ebf2354cc9b draft

"planemo upload for repository https://github.com/jj-umn/tools-iuc/tree/arriba/tools/arriba commit 52c9f9825debe783339c13bd1da9a42b59747bd2"
author jjohnson
date Thu, 07 Oct 2021 11:47:02 +0000
parents
children 9f2665b32c45
comparison
equal deleted inserted replaced
-1:000000000000 0:5ebf2354cc9b
1 <tool id="arriba" name="Arriba" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5">
2 <description>detect gene fusions from STAR aligned RNA-Seq data</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros> <!-- NOTE(review): the @TOOL_VERSION@ and @VERSION_SUFFIX@ tokens and the macros expanded in this file are not defined here; presumably they come from macros.xml (confirm) -->
6 <expand macro="requirements" />
7 <expand macro="version_command" />
8 <!-- Builds the arriba command line. Optional datasets are tested unquoted in #if: a quoted '$name' is a non-empty string and therefore always true, which would pass the text None to arriba when no dataset is selected. --> <command detect_errors="exit_code"><![CDATA[
9 arriba
10 -x '$input'
11 #if $chimeric
12 -c '$chimeric'
13 #endif
14 -a '$genome_assembly'
15 -g '$gtf'
16 -b '$blacklist'
17 #if $protein_domains
18 -p '$protein_domains'
19 #endif
20 #if $known_fusions
21 -k '$known_fusions'
22 #endif
23 #if $tags
24 -t '$tags'
25 #endif
26 -o fusions.tsv
27 -O fusions.discarded.tsv
28 ]]></command>
29 <inputs> <!-- Datasets handed to arriba; each param's argument attribute names the command-line flag it feeds. -->
30 <param name="input" argument="-x" type="data" format="sam,bam,cram" label="STAR Aligned.out.sam"/>
31 <param name="chimeric" argument="-c" type="data" format="sam,bam,cram" optional="true" label="STAR Chimeric.out.sam">
32 <help><![CDATA[ only required, if STAR was run with the parameter '--chimOutType SeparateSAMold' ]]></help>
33 </param>
34 <param name="genome_assembly" argument="-a" type="data" format="fasta" label="genome assembly fasta"/>
35 <param name="gtf" argument="-g" type="data" format="gtf" label="GTF file with gene annotation"/>
36 <param name="blacklist" argument="-b" type="data" format="tabular" label="File containing blacklisted ranges."/>
37 <param name="protein_domains" argument="-p" type="data" format="gff3" optional="true" label="File containing protein domains"/> <!-- label previously duplicated the blacklist label -->
38 <param name="known_fusions" argument="-k" type="data" format="tabular" optional="true" label="File containing known fusions">
39 <help><![CDATA[ File with two TAB-separated columns: five-prime region, three-prime region ]]></help>
40 </param>
41 <param name="tags" argument="-t" type="data" format="tabular" optional="true" label="File containing tag names for a fusion."/>
42 </inputs>
43 <outputs> <!-- Both datasets are picked up from the fixed file names the command passes to arriba via -o and -O. -->
44 <data format="tabular" name="fusions" from_work_dir="fusions.tsv" label="${tool.name} on ${on_string}: fusions.tsv"/>
45 <data format="tabular" name="discarded" from_work_dir="fusions.discarded.tsv" label="${tool.name} on ${on_string}: fusions.discarded.tsv"/>
46 </outputs>
47 <help><![CDATA[
48
49 arriba -h
50 [2021-10-06T19:04:33] Launching Arriba 2.1.0
51
52 Arriba gene fusion detector
53 ---------------------------
54 Version: 2.1.0
55
56 Arriba is a fast tool to search for aberrant transcripts such as gene fusions.
57 It is based on chimeric alignments found by the STAR RNA-Seq aligner.
58
59 Usage: arriba [-c Chimeric.out.sam] -x Aligned.out.bam \
60 -g annotation.gtf -a assembly.fa [-b blacklists.tsv] [-k known_fusions.tsv] \
61 [-t tags.tsv] [-p protein_domains.gff3] [-d structural_variants_from_WGS.tsv] \
62 -o fusions.tsv [-O fusions.discarded.tsv] \
63 [OPTIONS]
64
65 -c FILE File in SAM/BAM/CRAM format with chimeric alignments as generated by STAR
66 (Chimeric.out.sam). This parameter is only required, if STAR was run with the
67 parameter '--chimOutType SeparateSAMold'. When STAR was run with the parameter
68 '--chimOutType WithinBAM', it suffices to pass the parameter -x to Arriba and -c
69 can be omitted.
70
71 -x FILE File in SAM/BAM/CRAM format with main alignments as generated by STAR
72 (Aligned.out.sam). Arriba extracts candidate reads from this file.
73
74 -g FILE GTF file with gene annotation. The file may be gzip-compressed.
75
76 -G GTF_FEATURES Comma-/space-separated list of names of GTF features.
77 Default: gene_name=gene_name|gene_id gene_id=gene_id
78 transcript_id=transcript_id feature_exon=exon feature_CDS=CDS
79
80 -a FILE FastA file with genome sequence (assembly). The file may be gzip-compressed. An
81 index with the file extension .fai must exist only if CRAM files are processed.
82
83 -b FILE File containing blacklisted events (recurrent artifacts and transcripts
84 observed in healthy tissue).
85
86 -k FILE File containing known/recurrent fusions. Some cancer entities are often
87 characterized by fusions between the same pair of genes. In order to boost
88 sensitivity, a list of known fusions can be supplied using this parameter. The list
89 must contain two columns with the names of the fused genes, separated by tabs.
90
91 -o FILE Output file with fusions that have passed all filters.
92
93 -O FILE Output file with fusions that were discarded due to filtering.
94
95 -t FILE Tab-separated file containing fusions to annotate with tags in the 'tags' column.
96 The first two columns specify the genes; the third column specifies the tag. The
97 file may be gzip-compressed.
98
99 -p FILE File in GFF3 format containing coordinates of the protein domains of genes. The
100 protein domains retained in a fusion are listed in the column
101 'retained_protein_domains'. The file may be gzip-compressed.
102
103 -d FILE Tab-separated file with coordinates of structural variants found using
104 whole-genome sequencing data. These coordinates serve to increase sensitivity
105 towards weakly expressed fusions and to eliminate fusions with low evidence.
106
107 -D MAX_GENOMIC_BREAKPOINT_DISTANCE When a file with genomic breakpoints obtained via
108 whole-genome sequencing is supplied via the -d
109 parameter, this parameter determines how far a
110 genomic breakpoint may be away from a
111 transcriptomic breakpoint to consider it as a
112 related event. For events inside genes, the
113 distance is added to the end of the gene; for
114 intergenic events, the distance threshold is
115 applied as is. Default: 100000
116
117 -s STRANDEDNESS Whether a strand-specific protocol was used for library preparation,
118 and if so, the type of strandedness (auto/yes/no/reverse). When
119 unstranded data is processed, the strand can sometimes be inferred from
120 splice-patterns. But in unclear situations, stranded data helps
121 resolve ambiguities. Default: auto
122
123 -i CONTIGS Comma-/space-separated list of interesting contigs. Fusions between genes
124 on other contigs are ignored. Contigs can be specified with or without the
125 prefix "chr". Asterisks (*) are treated as wild-cards.
126 Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_*
127
128 -v CONTIGS Comma-/space-separated list of viral contigs. Asterisks (*) are treated as
129 wild-cards.
130 Default: AC_* NC_*
131
132 -f FILTERS Comma-/space-separated list of filters to disable. By default all filters are
133 enabled. Valid values: homologs, low_entropy, isoforms,
134 top_expressed_viral_contigs, viral_contigs, non_coding_neighbors,
135 mismatches, duplicates, no_genomic_support, genomic_support, intronic,
136 end_to_end, relative_support, low_coverage_viral_contigs,
137 merge_adjacent, mismappers, multimappers, same_gene, long_gap,
138 internal_tandem_duplication, small_insert_size, read_through,
139 inconsistently_clipped, uninteresting_contigs, intragenic_exonic,
140 spliced, hairpin, blacklist, min_support, select_best, in_vitro,
141 short_anchor, known_fusions, no_coverage, homopolymer, many_spliced
142
143 -E MAX_E-VALUE Arriba estimates the number of fusions with a given number of supporting
144 reads which one would expect to see by random chance. If the expected number
145 of fusions (e-value) is higher than this threshold, the fusion is
146 discarded by the 'relative_support' filter. Note: Increasing this
147 threshold can dramatically increase the number of false positives and may
148 increase the runtime of resource-intensive steps. Fractional values are
149 possible. Default: 0.300000
150
151 -S MIN_SUPPORTING_READS The 'min_support' filter discards all fusions with fewer than
152 this many supporting reads (split reads and discordant mates
153 combined). Default: 2
154
155 -m MAX_MISMAPPERS When more than this fraction of supporting reads turns out to be
156 mismappers, the 'mismappers' filter discards the fusion. Default:
157 0.800000
158
159 -L MAX_HOMOLOG_IDENTITY Genes with more than the given fraction of sequence identity are
160 considered homologs and removed by the 'homologs' filter.
161 Default: 0.300000
162
163 -H HOMOPOLYMER_LENGTH The 'homopolymer' filter removes breakpoints adjacent to
164 homopolymers of the given length or more. Default: 6
165
166 -R READ_THROUGH_DISTANCE The 'read_through' filter removes read-through fusions
167 where the breakpoints are less than the given distance away
168 from each other. Default: 10000
169
170 -A MIN_ANCHOR_LENGTH Alignment artifacts are often characterized by split reads coming
171 from only one gene and no discordant mates. Moreover, the split
172 reads only align to a short stretch in one of the genes. The
173 'short_anchor' filter removes these fusions. This parameter sets
174 the threshold in bp for what the filter considers short. Default: 23
175
176 -M MANY_SPLICED_EVENTS The 'many_spliced' filter recovers fusions between genes that
177 have at least this many spliced breakpoints. Default: 4
178
179 -K MAX_KMER_CONTENT The 'low_entropy' filter removes reads with repetitive 3-mers. If
180 the 3-mers make up more than the given fraction of the sequence, then
181 the read is discarded. Default: 0.600000
182
183 -V MAX_MISMATCH_PVALUE The 'mismatches' filter uses a binomial model to calculate a
184 p-value for observing a given number of mismatches in a read. If
185 the number of mismatches is too high, the read is discarded.
186 Default: 0.010000
187
188 -F FRAGMENT_LENGTH When paired-end data is given, the fragment length is estimated
189 automatically and this parameter has no effect. But when single-end
190 data is given, the mean fragment length should be specified to
191 effectively filter fusions that arise from hairpin structures.
192 Default: 200
193
194 -U MAX_READS Subsample fusions with more than the given number of supporting reads. This
195 improves performance without compromising sensitivity, as long as the
196 threshold is high. Counting of supporting reads beyond the threshold is
197 inaccurate, obviously. Default: 300
198
199 -Q QUANTILE Highly expressed genes are prone to produce artifacts during library
200 preparation. Genes with an expression above the given quantile are eligible
201 for filtering by the 'in_vitro' filter. Default: 0.998000
202
203 -e EXONIC_FRACTION The breakpoints of false-positive predictions of intragenic events
204 are often both in exons. True predictions are more likely to have at
205 least one breakpoint in an intron, because introns are larger. If the
206 fraction of exonic sequence between two breakpoints is smaller than
207 the given fraction, the 'intragenic_exonic' filter discards the
208 event. Default: 0.330000
209
210 -T TOP_N Only report viral integration sites of the top N most highly expressed viral
211 contigs. Default: 5
212
213 -C COVERED_FRACTION Ignore virally associated events if the virus is not fully
214 expressed, i.e., less than the given fraction of the viral contig is
215 transcribed. Default: 0.150000
216
217 -l MAX_ITD_LENGTH Maximum length of internal tandem duplications. Note: Increasing
218 this value beyond the default can impair performance and lead to many
219 false positives. Default: 100
220
221 -u Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a
222 preceding program using the BAM_FDUP flag. This makes sense when unique molecular
223 identifiers (UMI) are used.
224
225 -X To reduce the runtime and file size, by default, the columns 'fusion_transcript',
226 'peptide_sequence', and 'read_identifiers' are left empty in the file containing
227 discarded fusion candidates (see parameter -O). When this flag is set, this extra
228 information is reported in the discarded fusions file.
229
230 -I If assembly of the fusion transcript sequence from the supporting reads is incomplete
231 (denoted as '...'), fill the gaps using the assembly sequence wherever possible.
232
233 -h Print help and exit.
234
235 Code repository: https://github.com/suhrig/arriba
236 Get help/report bugs: https://github.com/suhrig/arriba/issues
237 User manual: https://arriba.readthedocs.io/
238 Please cite: https://doi.org/10.1101/gr.257246.119
239
240 ]]></help>
241 <expand macro="citations" />
242 </tool>