changeset 0:5ebf2354cc9b draft

"planemo upload for repository https://github.com/jj-umn/tools-iuc/tree/arriba/tools/arriba commit 52c9f9825debe783339c13bd1da9a42b59747bd2"
author jjohnson
date Thu, 07 Oct 2021 11:47:02 +0000
parents
children 9f2665b32c45
files arriba.help arriba.xml macros.xml
diffstat 3 files changed, 453 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/arriba.help	Thu Oct 07 11:47:02 2021 +0000
@@ -0,0 +1,191 @@
+% arriba -h
+[2021-10-06T19:04:33] Launching Arriba 2.1.0
+
+Arriba gene fusion detector
+---------------------------
+Version: 2.1.0
+
+Arriba is a fast tool to search for aberrant transcripts such as gene fusions.
+It is based on chimeric alignments found by the STAR RNA-Seq aligner.
+
+Usage: arriba [-c Chimeric.out.sam] -x Aligned.out.bam \
+              -g annotation.gtf -a assembly.fa [-b blacklists.tsv] [-k known_fusions.tsv] \
+              [-t tags.tsv] [-p protein_domains.gff3] [-d structural_variants_from_WGS.tsv] \
+              -o fusions.tsv [-O fusions.discarded.tsv] \
+              [OPTIONS]
+
+ -c FILE  File in SAM/BAM/CRAM format with chimeric alignments as generated by STAR
+          (Chimeric.out.sam). This parameter is only required, if STAR was run with the
+          parameter '--chimOutType SeparateSAMold'. When STAR was run with the parameter
+          '--chimOutType WithinBAM', it suffices to pass the parameter -x to Arriba and -c
+          can be omitted.
+
+ -x FILE  File in SAM/BAM/CRAM format with main alignments as generated by STAR
+          (Aligned.out.sam). Arriba extracts candidate reads from this file.
+
+ -g FILE  GTF file with gene annotation. The file may be gzip-compressed.
+
+ -G GTF_FEATURES  Comma-/space-separated list of names of GTF features.
+                  Default: gene_name=gene_name|gene_id gene_id=gene_id
+                  transcript_id=transcript_id feature_exon=exon feature_CDS=CDS
+
+ -a FILE  FastA file with genome sequence (assembly). The file may be gzip-compressed. An
+          index with the file extension .fai must exist only if CRAM files are processed.
+
+ -b FILE  File containing blacklisted events (recurrent artifacts and transcripts
+          observed in healthy tissue).
+
+ -k FILE  File containing known/recurrent fusions. Some cancer entities are often
+          characterized by fusions between the same pair of genes. In order to boost
+          sensitivity, a list of known fusions can be supplied using this parameter. The list
+          must contain two columns with the names of the fused genes, separated by tabs.
+
+ -o FILE  Output file with fusions that have passed all filters.
+
+ -O FILE  Output file with fusions that were discarded due to filtering.
+
+ -t FILE  Tab-separated file containing fusions to annotate with tags in the 'tags' column.
+          The first two columns specify the genes; the third column specifies the tag. The
+          file may be gzip-compressed.
+
+ -p FILE  File in GFF3 format containing coordinates of the protein domains of genes. The
+          protein domains retained in a fusion are listed in the column
+          'retained_protein_domains'. The file may be gzip-compressed.
+
+ -d FILE  Tab-separated file with coordinates of structural variants found using
+          whole-genome sequencing data. These coordinates serve to increase sensitivity
+          towards weakly expressed fusions and to eliminate fusions with low evidence.
+
+ -D MAX_GENOMIC_BREAKPOINT_DISTANCE  When a file with genomic breakpoints obtained via
+                                     whole-genome sequencing is supplied via the -d
+                                     parameter, this parameter determines how far a
+                                     genomic breakpoint may be away from a
+                                     transcriptomic breakpoint to consider it as a
+                                     related event. For events inside genes, the
+                                     distance is added to the end of the gene; for
+                                     intergenic events, the distance threshold is
+                                     applied as is. Default: 100000
+
+ -s STRANDEDNESS  Whether a strand-specific protocol was used for library preparation,
+                  and if so, the type of strandedness (auto/yes/no/reverse). When
+                  unstranded data is processed, the strand can sometimes be inferred from
+                  splice-patterns. But in unclear situations, stranded data helps
+                  resolve ambiguities. Default: auto
+
+ -i CONTIGS  Comma-/space-separated list of interesting contigs. Fusions between genes
+             on other contigs are ignored. Contigs can be specified with or without the
+             prefix "chr". Asterisks (*) are treated as wild-cards.
+             Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_*
+
+ -v CONTIGS  Comma-/space-separated list of viral contigs. Asterisks (*) are treated as
+             wild-cards.
+             Default: AC_* NC_*
+
+ -f FILTERS  Comma-/space-separated list of filters to disable. By default all filters are
+             enabled. Valid values: homologs, low_entropy, isoforms,
+             top_expressed_viral_contigs, viral_contigs, non_coding_neighbors,
+             mismatches, duplicates, no_genomic_support, genomic_support, intronic,
+             end_to_end, relative_support, low_coverage_viral_contigs,
+             merge_adjacent, mismappers, multimappers, same_gene, long_gap,
+             internal_tandem_duplication, small_insert_size, read_through,
+             inconsistently_clipped, uninteresting_contigs, intragenic_exonic,
+             spliced, hairpin, blacklist, min_support, select_best, in_vitro,
+             short_anchor, known_fusions, no_coverage, homopolymer, many_spliced
+
+ -E MAX_E-VALUE  Arriba estimates the number of fusions with a given number of supporting
+                 reads which one would expect to see by random chance. If the expected number
+                 of fusions (e-value) is higher than this threshold, the fusion is
+                 discarded by the 'relative_support' filter. Note: Increasing this
+                 threshold can dramatically increase the number of false positives and may
+                 increase the runtime of resource-intensive steps. Fractional values are
+                 possible. Default: 0.300000
+
+ -S MIN_SUPPORTING_READS  The 'min_support' filter discards all fusions with fewer than
+                          this many supporting reads (split reads and discordant mates
+                          combined). Default: 2
+
+ -m MAX_MISMAPPERS  When more than this fraction of supporting reads turns out to be
+                    mismappers, the 'mismappers' filter discards the fusion. Default:
+                    0.800000
+
+ -L MAX_HOMOLOG_IDENTITY  Genes with more than the given fraction of sequence identity are
+                          considered homologs and removed by the 'homologs' filter.
+                          Default: 0.300000
+
+ -H HOMOPOLYMER_LENGTH  The 'homopolymer' filter removes breakpoints adjacent to
+                        homopolymers of the given length or more. Default: 6
+
+ -R READ_THROUGH_DISTANCE  The 'read_through' filter removes read-through fusions
+                           where the breakpoints are less than the given distance away
+                           from each other. Default: 10000
+
+ -A MIN_ANCHOR_LENGTH  Alignment artifacts are often characterized by split reads coming
+                       from only one gene and no discordant mates. Moreover, the split
+                       reads only align to a short stretch in one of the genes. The
+                       'short_anchor' filter removes these fusions. This parameter sets
+                       the threshold in bp for what the filter considers short. Default: 23
+
+ -M MANY_SPLICED_EVENTS  The 'many_spliced' filter recovers fusions between genes that
+                         have at least this many spliced breakpoints. Default: 4
+
+ -K MAX_KMER_CONTENT  The 'low_entropy' filter removes reads with repetitive 3-mers. If
+                      the 3-mers make up more than the given fraction of the sequence, then
+                      the read is discarded. Default: 0.600000
+
+ -V MAX_MISMATCH_PVALUE  The 'mismatches' filter uses a binomial model to calculate a
+                         p-value for observing a given number of mismatches in a read. If
+                         the number of mismatches is too high, the read is discarded.
+                         Default: 0.010000
+
+ -F FRAGMENT_LENGTH  When paired-end data is given, the fragment length is estimated
+                     automatically and this parameter has no effect. But when single-end
+                     data is given, the mean fragment length should be specified to
+                     effectively filter fusions that arise from hairpin structures.
+                     Default: 200
+
+ -U MAX_READS  Subsample fusions with more than the given number of supporting reads. This
+               improves performance without compromising sensitivity, as long as the
+               threshold is high. Counting of supporting reads beyond the threshold is
+               inaccurate, obviously. Default: 300
+
+ -Q QUANTILE  Highly expressed genes are prone to produce artifacts during library
+              preparation. Genes with an expression above the given quantile are eligible
+              for filtering by the 'in_vitro' filter. Default: 0.998000
+
+ -e EXONIC_FRACTION  The breakpoints of false-positive predictions of intragenic events
+                     are often both in exons. True predictions are more likely to have at
+                     least one breakpoint in an intron, because introns are larger. If the
+                     fraction of exonic sequence between two breakpoints is smaller than
+                     the given fraction, the 'intragenic_exonic' filter discards the
+                     event. Default: 0.330000
+
+ -T TOP_N  Only report viral integration sites of the top N most highly expressed viral
+           contigs. Default: 5
+
+ -C COVERED_FRACTION  Ignore virally associated events if the virus is not fully
+                      expressed, i.e., less than the given fraction of the viral contig is
+                      transcribed. Default: 0.150000
+
+ -l MAX_ITD_LENGTH  Maximum length of internal tandem duplications. Note: Increasing
+                    this value beyond the default can impair performance and lead to many
+                    false positives. Default: 100
+
+ -u  Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a
+     preceding program using the BAM_FDUP flag. This makes sense when unique molecular
+     identifiers (UMI) are used.
+
+ -X  To reduce the runtime and file size, by default, the columns 'fusion_transcript',
+     'peptide_sequence', and 'read_identifiers' are left empty in the file containing
+     discarded fusion candidates (see parameter -O). When this flag is set, this extra
+     information is reported in the discarded fusions file.
+
+ -I  If assembly of the fusion transcript sequence from the supporting reads is incomplete
+     (denoted as '...'), fill the gaps using the assembly sequence wherever possible.
+
+ -h  Print help and exit.
+
+         Code repository: https://github.com/suhrig/arriba
+    Get help/report bugs: https://github.com/suhrig/arriba/issues
+             User manual: https://arriba.readthedocs.io/
+             Please cite: https://doi.org/10.1101/gr.257246.119
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/arriba.xml	Thu Oct 07 11:47:02 2021 +0000
@@ -0,0 +1,242 @@
+<tool id="arriba" name="Arriba" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5">
+    <description>detect gene fusions from STAR aligned RNA-Seq data</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="version_command" />
+    <command detect_errors="exit_code"><![CDATA[
+    ## Optional datasets are tested unquoted: a quoted placeholder is a non-empty string and always true.
+    arriba -x '$input'
+    #if $chimeric
+        -c '$chimeric'
+    #end if
+    -a '$genome_assembly'
+    -g '$gtf'
+    -b '$blacklist'
+    #if $protein_domains
+        -p '$protein_domains'
+    #end if
+    #if $known_fusions
+        -k '$known_fusions'
+    #end if
+    #if $tags
+        -t '$tags'
+    #end if
+    ## Fixed output names in the working directory; the outputs section picks them up via from_work_dir.
+    -o fusions.tsv -O fusions.discarded.tsv
+    ]]></command>
+    <inputs>
+        <!-- Alignments produced by STAR -->
+        <param name="input" argument="-x" type="data" format="sam,bam,cram" label="STAR Aligned.out.sam"/>
+        <param name="chimeric" argument="-c" type="data" format="sam,bam,cram" optional="true" label="STAR Chimeric.out.sam"
+               help="Only required if STAR was run with the parameter '--chimOutType SeparateSAMold'"/>
+        <param name="genome_assembly" argument="-a" type="data" format="fasta" label="genome assembly fasta"/>
+        <param name="gtf" argument="-g" type="data" format="gtf" label="GTF file with gene annotation"/>
+        <param name="blacklist" argument="-b" type="data" format="tabular" label="File containing blacklisted ranges."/>
+        <!-- Optional annotation inputs -->
+        <param name="protein_domains" argument="-p" type="data" format="gff3" optional="true" label="GFF3 file containing protein domains"/>
+        <param name="known_fusions" argument="-k" type="data" format="tabular" optional="true" label="File containing known fusions"
+               help="File with two TAB-separated columns: five-prime region, three-prime region"/>
+        <param name="tags" argument="-t" type="data" format="tabular" optional="true" label="File containing tag names for a fusion."/>
+    </inputs>
+    <outputs><!-- Both files are produced in the job working directory by the -o / -O options of the command -->
+        <data name="fusions" from_work_dir="fusions.tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.tsv"/>
+        <data name="discarded" from_work_dir="fusions.discarded.tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.discarded.tsv"/>
+    </outputs>
+    <help><![CDATA[
+
+arriba -h
+[2021-10-06T19:04:33] Launching Arriba 2.1.0
+
+Arriba gene fusion detector
+---------------------------
+Version: 2.1.0
+
+Arriba is a fast tool to search for aberrant transcripts such as gene fusions.
+It is based on chimeric alignments found by the STAR RNA-Seq aligner.
+
+Usage: arriba [-c Chimeric.out.sam] -x Aligned.out.bam \
+              -g annotation.gtf -a assembly.fa [-b blacklists.tsv] [-k known_fusions.tsv] \
+              [-t tags.tsv] [-p protein_domains.gff3] [-d structural_variants_from_WGS.tsv] \
+              -o fusions.tsv [-O fusions.discarded.tsv] \
+              [OPTIONS]
+
+ -c FILE  File in SAM/BAM/CRAM format with chimeric alignments as generated by STAR
+          (Chimeric.out.sam). This parameter is only required, if STAR was run with the
+          parameter '--chimOutType SeparateSAMold'. When STAR was run with the parameter
+          '--chimOutType WithinBAM', it suffices to pass the parameter -x to Arriba and -c
+          can be omitted.
+
+ -x FILE  File in SAM/BAM/CRAM format with main alignments as generated by STAR
+          (Aligned.out.sam). Arriba extracts candidate reads from this file.
+
+ -g FILE  GTF file with gene annotation. The file may be gzip-compressed.
+
+ -G GTF_FEATURES  Comma-/space-separated list of names of GTF features.
+                  Default: gene_name=gene_name|gene_id gene_id=gene_id
+                  transcript_id=transcript_id feature_exon=exon feature_CDS=CDS
+
+ -a FILE  FastA file with genome sequence (assembly). The file may be gzip-compressed. An
+          index with the file extension .fai must exist only if CRAM files are processed.
+
+ -b FILE  File containing blacklisted events (recurrent artifacts and transcripts
+          observed in healthy tissue).
+
+ -k FILE  File containing known/recurrent fusions. Some cancer entities are often
+          characterized by fusions between the same pair of genes. In order to boost
+          sensitivity, a list of known fusions can be supplied using this parameter. The list
+          must contain two columns with the names of the fused genes, separated by tabs.
+
+ -o FILE  Output file with fusions that have passed all filters.
+
+ -O FILE  Output file with fusions that were discarded due to filtering.
+
+ -t FILE  Tab-separated file containing fusions to annotate with tags in the 'tags' column.
+          The first two columns specify the genes; the third column specifies the tag. The
+          file may be gzip-compressed.
+
+ -p FILE  File in GFF3 format containing coordinates of the protein domains of genes. The
+          protein domains retained in a fusion are listed in the column
+          'retained_protein_domains'. The file may be gzip-compressed.
+
+ -d FILE  Tab-separated file with coordinates of structural variants found using
+          whole-genome sequencing data. These coordinates serve to increase sensitivity
+          towards weakly expressed fusions and to eliminate fusions with low evidence.
+
+ -D MAX_GENOMIC_BREAKPOINT_DISTANCE  When a file with genomic breakpoints obtained via
+                                     whole-genome sequencing is supplied via the -d
+                                     parameter, this parameter determines how far a
+                                     genomic breakpoint may be away from a
+                                     transcriptomic breakpoint to consider it as a
+                                     related event. For events inside genes, the
+                                     distance is added to the end of the gene; for
+                                     intergenic events, the distance threshold is
+                                     applied as is. Default: 100000
+
+ -s STRANDEDNESS  Whether a strand-specific protocol was used for library preparation,
+                  and if so, the type of strandedness (auto/yes/no/reverse). When
+                  unstranded data is processed, the strand can sometimes be inferred from
+                  splice-patterns. But in unclear situations, stranded data helps
+                  resolve ambiguities. Default: auto
+
+ -i CONTIGS  Comma-/space-separated list of interesting contigs. Fusions between genes
+             on other contigs are ignored. Contigs can be specified with or without the
+             prefix "chr". Asterisks (*) are treated as wild-cards.
+             Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_*
+
+ -v CONTIGS  Comma-/space-separated list of viral contigs. Asterisks (*) are treated as
+             wild-cards.
+             Default: AC_* NC_*
+
+ -f FILTERS  Comma-/space-separated list of filters to disable. By default all filters are
+             enabled. Valid values: homologs, low_entropy, isoforms,
+             top_expressed_viral_contigs, viral_contigs, non_coding_neighbors,
+             mismatches, duplicates, no_genomic_support, genomic_support, intronic,
+             end_to_end, relative_support, low_coverage_viral_contigs,
+             merge_adjacent, mismappers, multimappers, same_gene, long_gap,
+             internal_tandem_duplication, small_insert_size, read_through,
+             inconsistently_clipped, uninteresting_contigs, intragenic_exonic,
+             spliced, hairpin, blacklist, min_support, select_best, in_vitro,
+             short_anchor, known_fusions, no_coverage, homopolymer, many_spliced
+
+ -E MAX_E-VALUE  Arriba estimates the number of fusions with a given number of supporting
+                 reads which one would expect to see by random chance. If the expected number
+                 of fusions (e-value) is higher than this threshold, the fusion is
+                 discarded by the 'relative_support' filter. Note: Increasing this
+                 threshold can dramatically increase the number of false positives and may
+                 increase the runtime of resource-intensive steps. Fractional values are
+                 possible. Default: 0.300000
+
+ -S MIN_SUPPORTING_READS  The 'min_support' filter discards all fusions with fewer than
+                          this many supporting reads (split reads and discordant mates
+                          combined). Default: 2
+
+ -m MAX_MISMAPPERS  When more than this fraction of supporting reads turns out to be
+                    mismappers, the 'mismappers' filter discards the fusion. Default:
+                    0.800000
+
+ -L MAX_HOMOLOG_IDENTITY  Genes with more than the given fraction of sequence identity are
+                          considered homologs and removed by the 'homologs' filter.
+                          Default: 0.300000
+
+ -H HOMOPOLYMER_LENGTH  The 'homopolymer' filter removes breakpoints adjacent to
+                        homopolymers of the given length or more. Default: 6
+
+ -R READ_THROUGH_DISTANCE  The 'read_through' filter removes read-through fusions
+                           where the breakpoints are less than the given distance away
+                           from each other. Default: 10000
+
+ -A MIN_ANCHOR_LENGTH  Alignment artifacts are often characterized by split reads coming
+                       from only one gene and no discordant mates. Moreover, the split
+                       reads only align to a short stretch in one of the genes. The
+                       'short_anchor' filter removes these fusions. This parameter sets
+                       the threshold in bp for what the filter considers short. Default: 23
+
+ -M MANY_SPLICED_EVENTS  The 'many_spliced' filter recovers fusions between genes that
+                         have at least this many spliced breakpoints. Default: 4
+
+ -K MAX_KMER_CONTENT  The 'low_entropy' filter removes reads with repetitive 3-mers. If
+                      the 3-mers make up more than the given fraction of the sequence, then
+                      the read is discarded. Default: 0.600000
+
+ -V MAX_MISMATCH_PVALUE  The 'mismatches' filter uses a binomial model to calculate a
+                         p-value for observing a given number of mismatches in a read. If
+                         the number of mismatches is too high, the read is discarded.
+                         Default: 0.010000
+
+ -F FRAGMENT_LENGTH  When paired-end data is given, the fragment length is estimated
+                     automatically and this parameter has no effect. But when single-end
+                     data is given, the mean fragment length should be specified to
+                     effectively filter fusions that arise from hairpin structures.
+                     Default: 200
+
+ -U MAX_READS  Subsample fusions with more than the given number of supporting reads. This
+               improves performance without compromising sensitivity, as long as the
+               threshold is high. Counting of supporting reads beyond the threshold is
+               inaccurate, obviously. Default: 300
+
+ -Q QUANTILE  Highly expressed genes are prone to produce artifacts during library
+              preparation. Genes with an expression above the given quantile are eligible
+              for filtering by the 'in_vitro' filter. Default: 0.998000
+
+ -e EXONIC_FRACTION  The breakpoints of false-positive predictions of intragenic events
+                     are often both in exons. True predictions are more likely to have at
+                     least one breakpoint in an intron, because introns are larger. If the
+                     fraction of exonic sequence between two breakpoints is smaller than
+                     the given fraction, the 'intragenic_exonic' filter discards the
+                     event. Default: 0.330000
+
+ -T TOP_N  Only report viral integration sites of the top N most highly expressed viral
+           contigs. Default: 5
+
+ -C COVERED_FRACTION  Ignore virally associated events if the virus is not fully
+                      expressed, i.e., less than the given fraction of the viral contig is
+                      transcribed. Default: 0.150000
+
+ -l MAX_ITD_LENGTH  Maximum length of internal tandem duplications. Note: Increasing
+                    this value beyond the default can impair performance and lead to many
+                    false positives. Default: 100
+
+ -u  Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a
+     preceding program using the BAM_FDUP flag. This makes sense when unique molecular
+     identifiers (UMI) are used.
+
+ -X  To reduce the runtime and file size, by default, the columns 'fusion_transcript',
+     'peptide_sequence', and 'read_identifiers' are left empty in the file containing
+     discarded fusion candidates (see parameter -O). When this flag is set, this extra
+     information is reported in the discarded fusions file.
+
+ -I  If assembly of the fusion transcript sequence from the supporting reads is incomplete
+     (denoted as '...'), fill the gaps using the assembly sequence wherever possible.
+
+ -h  Print help and exit.
+
+         Code repository: https://github.com/suhrig/arriba
+    Get help/report bugs: https://github.com/suhrig/arriba/issues
+             User manual: https://arriba.readthedocs.io/
+             Please cite: https://doi.org/10.1101/gr.257246.119
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Oct 07 11:47:02 2021 +0000
@@ -0,0 +1,20 @@
+<macros>
+    <token name="@TOOL_VERSION@">2.1.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <!-- XML fragments pulled into arriba.xml via <expand macro="..."/> -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">arriba</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1101/gr.257246.119</citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="version_command">
+        <version_command>arriba -h | grep Version | sed 's/^.* //'</version_command>
+    </xml>
+</macros>