Mercurial > repos > jjohnson > arriba

--- a/arriba.xml	Sun Oct 10 13:00:45 2021 +0000
+++ b/arriba.xml	Mon Oct 11 01:47:22 2021 +0000
@@ -70,10 +70,83 @@
     #if $tags
         -t '$tags'
     #end if
+    #if str($wgs.use_wgs) == "yes"
+        -d '$wgs.wgs'
+        #if $wgs.max_genomic_breakpoint_distance
+            -D $wgs.max_genomic_breakpoint_distance
+        #end if
+    #end if
     -o fusions.tsv
 #if $output_fusions_discarded
     -O fusions.discarded.tsv
 #end if
+## Arriba options: a flag is emitted only when set; free-text lists are quoted to reach arriba as one argument
+    #if $options.gtf_features
+        -G '$options.gtf_features'
+    #end if
+    #if $options.strandedness
+        -s $options.strandedness
+    #end if
+    #if $options.genome_contigs
+        -i '$options.genome_contigs'
+    #end if
+    #if $options.viral_contigs
+        -v '$options.viral_contigs'
+    #end if
+    #if $options.max_evalue
+        -E $options.max_evalue
+    #end if
+    #if $options.min_supporting_reads
+        -S $options.min_supporting_reads
+    #end if
+    #if $options.max_mismappers
+        -m $options.max_mismappers
+    #end if
+    #if $options.max_homolog_identity
+        -L $options.max_homolog_identity
+    #end if
+    #if $options.homopolymer_length
+        -H $options.homopolymer_length
+    #end if
+    #if $options.read_through_distance
+        -R $options.read_through_distance
+    #end if
+    #if $options.min_anchor_length
+        -A $options.min_anchor_length
+    #end if
+    #if $options.many_spliced_events
+        -M $options.many_spliced_events
+    #end if
+    #if $options.max_kmer_content
+        -K $options.max_kmer_content
+    #end if
+    #if $options.max_mismatch_pvalue
+        -V $options.max_mismatch_pvalue
+    #end if
+    #if $options.fragment_length
+        -F $options.fragment_length
+    #end if
+    #if $options.max_reads
+        -U $options.max_reads
+    #end if
+    #if $options.quantile
+        -Q $options.quantile
+    #end if
+    #if $options.exonic_fraction
+        -e $options.exonic_fraction
+    #end if
+    #if $options.top_n
+        -T $options.top_n
+    #end if
+    #if $options.covered_fraction
+        -C $options.covered_fraction
+    #end if
+    #if $options.max_itd_length
+        -l $options.max_itd_length
+    #end if
+    $options.duplicate_marking
+    $options.fill_discarded_columns
+    $options.fill_the_gaps
 #if str($input_params.input_source) == "use_fastq"
     && samtools sort -@ \${GALAXY_SLOTS:-1} -m 4G -T tmp -O bam Aligned.out.bam > Aligned.sortedByCoord.out.bam
     && samtools index Aligned.sortedByCoord.out.bam
@@ -85,16 +158,55 @@
 && draw_fusions.R
     --fusions=fusions.tsv
     --alignments=Aligned.sortedByCoord.out.bam
+    --annotation='$gtf'
     --output=fusions.pdf
-    --annotation='$gtf'
     #if $visualization.cytobands
     --cytobands='$visualization.cytobands'
     #end if
     #if $protein_domains
     --proteinDomains='$protein_domains'
     #end if
+    ## Optional draw_fusions.R settings: each flag below is emitted only when its parameter has a value
+    #if $visualization.options.transcriptSelection
+        --transcriptSelection=$visualization.options.transcriptSelection
+    #end if
+    #if $visualization.options.minConfidenceForCircosPlot
+        --minConfidenceForCircosPlot=$visualization.options.minConfidenceForCircosPlot
+    #end if
+    #if $visualization.options.showIntergenicVicinity
+        --showIntergenicVicinity=$visualization.options.showIntergenicVicinity
+    #end if
+    #if $visualization.options.squishIntrons
+        --squishIntrons=$visualization.options.squishIntrons
+    #end if
+    #if $visualization.options.mergeDomainsOverlappingBy
+        --mergeDomainsOverlappingBy=$visualization.options.mergeDomainsOverlappingBy
+    #end if
+    #if $visualization.options.printExonLabels
+        --printExonLabels=$visualization.options.printExonLabels
+    #end if
+    #if $visualization.options.render3dEffect
+        --render3dEffect=$visualization.options.render3dEffect
+    #end if
+    #if $visualization.options.optimizeDomainColors
+        --optimizeDomainColors=$visualization.options.optimizeDomainColors
+    #end if
+    #if $visualization.options.color1
+        --color1=$visualization.options.color1
+    #end if
+    #if $visualization.options.color2
+        --color2=$visualization.options.color2
+    #end if
+    #if $visualization.options.pdfWidth
+        --pdfWidth=$visualization.options.pdfWidth
+    #end if
+    #if $visualization.options.pdfHeight
+        --pdfHeight=$visualization.options.pdfHeight
+    #end if
+    #if $visualization.options.fontSize
+        --fontSize=$visualization.options.fontSize
+    #end if
 #end if
-
     ]]></command>
     <inputs>
         <conditional name="input_params">
@@ -139,7 +251,182 @@
         </param>
         <param name="tags" argument="-t" type="data" format="tabular" optional="true" label="File containing tag names for a fusion."
                help="This can be the known fusions if that input has a third column with a name"/>
-        <param name="output_fusions_discarded" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="Output fusions.discarded.tsv"/>
+        <conditional name="wgs">
+            <param name="use_wgs" type="select" label="Use whole-genome sequencing data">
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="yes">
+                <param name="wgs" argument="-d" type="data" format="tabular" label="whole-genome sequencing structural variant data"
+                       help="These coordinates serve to increase sensitivity towards weakly expressed fusions and to eliminate fusions with low evidence."/>
+                <param name="max_genomic_breakpoint_distance" argument="-D" type="integer" value="100000" label="Max genomic breakpoint distance"
+                       help="determines how far a genomic breakpoint may be away from a transcriptomic breakpoint to consider it as a related event."/>
+            </when>
+            <when value="no"/>
+        </conditional>
+        <section name="options" expanded="false" title="Arriba Options">
+            <param name="gtf_features" argument="-G" type="text" value="" optional="true" label="Names of features in the GTF annotation file">
+                <help>Comma or SPACE separated list, default: gene_name=gene_name gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS</help>
+                <validator type="regex" message="Expected a comma or space separated list of KEY=VALUE pairs, where KEY is one of gene_name, gene_id, transcript_id, feature_exon, feature_CDS">^(gene_name|gene_id|transcript_id|feature_exon|feature_CDS)=[^ ,]+([ ,](gene_name|gene_id|transcript_id|feature_exon|feature_CDS)=[^ ,]+)*$</validator>
+            </param>
+            <param name="strandedness" argument="-s" type="select" optional="true" label="Whether a strand-specific protocol was used for library preparation">
+                <help>When unstranded data is processed, the strand can sometimes be inferred from splice-patterns. But in unclear situations, stranded data helps resolve ambiguities.</help>
+                <option value="auto">auto</option>
+                <option value="yes">yes</option>
+                <option value="no">no</option>
+                <option value="reverse">reverse</option>
+            </param>
+            <param name="genome_contigs" argument="-i" type="text" value="" optional="true" label="Comma-/space-separated list of interesting contigs">
+                <help>Comma-/space-separated list of interesting contigs.
+                      Fusions between genes on other contigs are ignored. Contigs can be specified with or without the prefix "chr".
+                      Asterisks (*) are treated as wild-cards.
+                      Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_*
+                </help>
+            </param>
+            <param name="viral_contigs" argument="-v" type="text" value="" optional="true" label="Comma-/space-separated list of viral contigs">
+                <help>Comma-/space-separated list of viral contigs.
+                      Asterisks (*) are treated as wild-cards.
+                      Default: AC_* NC_*
+                </help>
+            </param>
+            <param name="max_evalue" argument="-E" type="float" value="" optional="true" label="Max e-value threshold">
+                <help>Arriba estimates the number of fusions with a given number of supporting
+                      reads which one would expect to see by random chance. If the expected number
+                      of fusions (e-value) is higher than this threshold, the fusion is
+                      discarded by the 'relative_support' filter. Note: Increasing this
+                      threshold can dramatically increase the number of false positives and may
+                      increase the runtime of resource-intensive steps. Fractional values are possible.
+                       Default: 0.300000
+                </help>
+            </param>
+
+            <param name="min_supporting_reads" argument="-S" type="integer" value="" min="1" optional="true" label="Min supporting reads">
+                <help>discard all fusions with fewer than this many supporting reads (split reads and discordant mates combined).
+                      Default: 2
+                </help>
+            </param>
+            <param name="max_mismappers" argument="-m" type="float" value="" min="0." max="1.0" optional="true" label="Max mismappers threshold">
+                <help>When more than this fraction of supporting reads turns out to be mismappers,
+                      the 'mismappers' filter discards the fusion.
+                      Default: 0.800000
+                </help>
+            </param>
+            <param name="max_homolog_identity" argument="-L" type="float" value="" min="0." max="1.0" optional="true" label="Max homologs identity threshold">
+                <help>Genes with more than the given fraction of sequence identity are
+                      considered homologs and removed by the 'homologs' filter.
+                      Default: 0.300000
+                </help>
+            </param>
+            <param name="homopolymer_length" argument="-H" type="integer" value="" min="1" optional="true" label="Homopolymer length">
+                <help>The 'homopolymer' filter removes breakpoints adjacent to homopolymers of the given length or more.
+                      Default: 6
+                </help>
+            </param>
+            <param name="read_through_distance" argument="-R" type="integer" value="" min="1" optional="true" label="Read-through distance">
+                <help>The 'read_through' filter removes read-through fusions
+                      where the breakpoints are less than the given distance away from each other.
+                      Default: 10000
+                </help>
+            </param>
+            <param name="min_anchor_length" argument="-A" type="integer" value="" min="1" optional="true" label="Min anchor length">
+                <help>Alignment artifacts are often characterized by split reads coming
+                      from only one gene and no discordant mates. Moreover, the split
+                      reads only align to a short stretch in one of the genes. The
+                      'short_anchor' filter removes these fusions. This parameter sets
+                      the threshold in bp for what the filter considers short.
+                      Default: 23
+                </help>
+            </param>
+            <param name="many_spliced_events" argument="-M" type="integer" value="" min="1" optional="true" label="Many spliced events">
+                <help>The 'many_spliced' filter recovers fusions between genes that
+                      have at least this many spliced breakpoints.
+                      Default: 4
+                </help>
+            </param>
+            <param name="max_kmer_content" argument="-K" type="float" value="" min="0." max="1.0" optional="true" label="Max kmer content">
+                <help>The 'low_entropy' filter removes reads with repetitive 3-mers. If
+                      the 3-mers make up more than the given fraction of the sequence, then
+                      the read is discarded.
+                      Default: 0.600000
+                </help>
+            </param>
+
+            <param name="max_mismatch_pvalue" argument="-V" type="float" value="" optional="true" label="Max mismatch p-value threshold">
+                <help>The 'mismatches' filter uses a binomial model to calculate a
+                      p-value for observing a given number of mismatches in a read.
+                      If the number of mismatches is too high, the read is discarded.
+                      Default: 0.010000
+                </help>
+            </param>
+
+            <param name="fragment_length" argument="-F" type="integer" value="" min="1" optional="true" label="Single-end fragment length">
+                <help>When paired-end data is given, the fragment length is estimated
+                      automatically and this parameter has no effect. But when single-end
+                      data is given, the mean fragment length should be specified to
+                      effectively filter fusions that arise from hairpin structures.
+                      Default: 200
+                </help>
+            </param>
+            <param name="max_reads" argument="-U" type="integer" value="" min="1" optional="true" label="Max reads">
+                <help>Subsample fusions with more than the given number of supporting reads. This
+                      improves performance without compromising sensitivity, as long as the
+                      threshold is high. Counting of supporting reads beyond the threshold is
+                      inaccurate, obviously.
+                      Default: 300
+                </help>
+            </param>
+            <param name="quantile" argument="-Q" type="float" value="" min="0." max="1.0" optional="true" label="Quantile">
+                <help>Highly expressed genes are prone to produce artifacts during library preparation.
+                      Genes with an expression above the given quantile are eligible for filtering by the 'in_vitro' filter.
+                      Default: 0.998000
+                </help>
+            </param>
+            <param name="exonic_fraction" argument="-e" type="float" value="" min="0." max="1.0" optional="true" label="Exonic fraction">
+                <help>The breakpoints of false-positive predictions of intragenic events
+                      are often both in exons. True predictions are more likely to have at
+                      least one breakpoint in an intron, because introns are larger.
+                      If the fraction of exonic sequence between two breakpoints is smaller than
+                      the given fraction, the 'intragenic_exonic' filter discards the event.
+                      Default: 0.330000
+                </help>
+            </param>
+
+            <param name="top_n" argument="-T" type="integer" value="" min="1" optional="true" label="top N viral contigs">
+                <help>Only report viral integration sites of the top N most highly expressed viral contigs.
+                      Default: 5
+                </help>
+            </param>
+            <param name="covered_fraction" argument="-C" type="float" value="" min="0." max="1.0" optional="true" label="Covered fraction">
+                <help>Ignore virally associated events if the virus is not fully expressed,
+                      i.e., less than the given fraction of the viral contig is transcribed.
+                      Default: 0.150000
+                </help>
+            </param>
+            <param name="max_itd_length" argument="-l" type="integer" value="" min="1" optional="true" label="Maximum length of internal tandem duplications">
+                <help>Note: Increasing this value beyond the default can impair performance and lead to many false positives.
+                      Default: 100
+                </help>
+            </param>
+            <param name="duplicate_marking" argument="-u" type="boolean" truevalue="-u" falsevalue="" checked="false" label="Use aligners duplicate marking">
+                <help>Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a
+                      preceding program using the BAM_FDUP flag. This makes sense when unique molecular
+                      identifiers (UMI) are used.
+                </help>
+            </param>
+            <param name="fill_discarded_columns" argument="-X" type="boolean" truevalue="-X" falsevalue="" checked="false" label="Fill all fusion.discarded.tsv columns">
+                <help>To reduce the runtime and file size, by default, the columns 'fusion_transcript',
+                      'peptide_sequence', and 'read_identifiers' are left empty in the file containing
+                      discarded fusion candidates (see parameter -O). When this flag is set, this extra
+                      information is reported in the discarded fusions file.
+                </help>
+            </param>
+            <param name="fill_the_gaps" argument="-I" type="boolean" truevalue="-I" falsevalue="" checked="false" label="Fill fusion transcript gaps from the assembly">
+                <help>If assembly of the fusion transcript sequence from the supporting reads is incomplete
+                      (denoted as '...'), fill the gaps using the assembly sequence wherever possible.
+                </help>
+            </param>
+        </section>
+        <param name="output_fusions_discarded" argument="-O" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="Output fusions.discarded.tsv"/>
         <conditional name="visualization">
             <param name="do_viz" type="select" label="Generate visualization">
                 <option value="yes">Yes</option>
@@ -147,9 +434,115 @@
             </param>
             <when value="yes">
                 <param name="cytobands" argument="--cytobands" type="data" format="tabular" optional="true" label="Cytobands"/>
+                <section name="options" expanded="false" title="Visualization Options">
+                    <param argument="--transcriptSelection" type="select" optional="true" label="Transcript selection">
+                        <help>By default the transcript isoform with the highest coverage is drawn.
+                             Alternatively, the transcript isoform that is provided in the columns
+                             transcript_id1 and transcript_id2 in the given fusions file can be drawn.
+                             Selecting the isoform with the highest coverage usually produces nicer plots,
+                             in the sense that the coverage track is smooth and shows a visible increase in coverage after the fusion breakpoint.
+                             However, the isoform with the highest coverage may not be the one that is involved in the fusion.
+                             Often, genomic rearrangements lead to non-canonical isoforms being transcribed.
+                             For this reason, it can make sense to rely on the transcript selection provided by the columns transcript_id1/2,
+                             which reflect the actual isoforms involved in a fusion.
+                             As a third option, the transcripts that are annotated as canonical can be drawn.
+                             Transcript isoforms tagged with appris_principal, appris_candidate, or CCDS are considered canonical.
+                        </help>
+                        <option value="coverage">coverage</option>
+                        <option value="provided">provided</option>
+                        <option value="canonical">canonical</option>
+                    </param>
+                    <param argument="--minConfidenceForCircosPlot" type="select" optional="true" label="Minimum confidence for circos plot">
+                        <help>The fusion of interest is drawn as a solid line in the circos plot.
+                              To give an impression of the overall degree of rearrangement,
+                              all other fusions are drawn as semi-transparent lines in the background.
+                              This option determines which other fusions should be included in the circos plot.
+                              Values specify the minimum confidence a fusion must have to be included.
+                              It usually makes no sense to include low-confidence fusions in circos plots,
+                              because they are abundant and unreliable, and would clutter up the circos plot.
+                              Default: medium
+                        </help>
+                        <option value="none">none - only the fusion of interest is drawn</option>
+                        <option value="low">low</option>
+                        <option value="medium">medium</option>
+                        <option value="high">high</option>
+                    </param>
+                    <param argument="--showIntergenicVicinity" type="integer" value="" min="0" optional="true" label="Intergenic Vicinity">
+                        <help>This option only applies to intergenic breakpoints.
+                              If it is set to a value greater than 0, then the script draws the genes
+                              which are no more than the given distance away from an intergenic breakpoint.
+                              Note that this option is incompatible with squishIntrons.
+                              Default: 0
+                        </help>
+                    </param>
+                    <param argument="--squishIntrons" type="select" optional="true" label="Squish introns">
+                        <help>Exons usually make up only a small fraction of a gene.
+                              They may be hard to see in the plot.
+                              Since introns are in most situations of no interest in the context of gene fusions,
+                              this switch can be used to shrink the size of introns to a fixed, negligible size.
+                              It makes sense to disable this feature, if breakpoints in introns are of importance.
+                              Default: TRUE
+                        </help>
+                        <option value="TRUE">True</option>
+                        <option value="FALSE">False</option>
+                    </param>
+
+                    <param argument="--mergeDomainsOverlappingBy" type="float" value="" min="0." max="1.0" optional="true" label="Merge Domains Overlapping By">
+                        <help>Occasionally, domains are annotated redundantly.
+                              For example, tyrosine kinase domains are frequently annotated as
+                              Protein tyrosine kinase and Protein kinase domain.
+                              In order to simplify the visualization, such domains can be merged into one,
+                              given that they overlap by the given fraction.
+                              The description of the larger domain is used.
+                              Default: 0.9
+                        </help>
+                    </param>
+                    <param argument="--printExonLabels" type="select" optional="true" label="Print Exon Labels">
+                        <help>By default the number of an exon is printed inside each exon,
+                              which is taken from the attribute exon_number of the GTF annotation.
+                              When a gene has many exons, the boxes may be too narrow to contain the labels,
+                              resulting in unreadable exon labels. In these situations,
+                              it may be better to turn off exon labels.
+                              Default: TRUE
+                        </help>
+                        <option value="TRUE">True</option>
+                        <option value="FALSE">False</option>
+                    </param>
+                    <param argument="--render3dEffect" type="select" optional="true" label="Render 3D effect">
+                        <help>Whether light and shadow should be rendered to give objects a 3D effect.
+                              Default: TRUE
+                        </help>
+                        <option value="TRUE">True</option>
+                        <option value="FALSE">False</option>
+                    </param>
+                    <param argument="--optimizeDomainColors" type="select" optional="true" label="Optimize Domain Colors">
+                        <help>By default, the script colorizes domains according to the colors
+                              specified in the file given in --annotation.
+                              This way, coloring of domains is consistent across all proteins.
+                              But since there are more distinct domains than colors,
+                              this can lead to different domains having the same color.
+                              If this option is set to TRUE, the colors are recomputed for each fusion separately.
+                              This ensures that the colors have the maximum distance for each individual fusion,
+                              but they are no longer consistent across different fusions.
+                              Default: FALSE
+                        </help>
+                        <option value="TRUE">True</option>
+                        <option value="FALSE">False</option>
+                    </param>
+                    <param argument="--color1" type="color" value="" optional="true"  label="Color of the 5' end of the fusion."/>
+                    <param argument="--color2" type="color" value="" optional="true"  label="Color of the 3' end of the fusion."/>
+                    <param argument="--pdfWidth" type="float" value="" min="1." optional="true" label="Width of PDF output file in inches"
+                           help="Default: 11.692"/>
+                    <param argument="--pdfHeight" type="float" value="" min="1." optional="true" label="Height of PDF output file in inches"
+                           help="Default: 8.267"/>
+                    <param argument="--fontSize" type="float" value="" min="0." optional="true" label="Scale the size of text"
+                           help="Default: 1.0"/>
+                </section>
+
             </when>
             <when value="no"/>
         </conditional>
+
     </inputs>
     <outputs>
         <data name="fusions" format="tabular" label="${tool.name} on ${on_string}: fusions.tsv" from_work_dir="fusions.tsv"/>
@@ -203,15 +596,16 @@

     </tests>
     <help><![CDATA[
-** Arriba **
+**Arriba**


 Arriba_ is a fast tool to search for aberrant transcripts such as gene fusions.
 It is based on chimeric alignments found by the STAR RNA-Seq aligner.


-** INPUTS_ **
+**INPUTS**

+See:  https://arriba.readthedocs.io/en/latest/input-files/

   - Alignments

@@ -352,7 +746,9 @@
     NOTE: Arriba was designed for alignments from RNA-Seq data. It should not be run on WGS data directly. Many assumptions made by Arriba about the data (statistical models, blacklist, etc.) only apply to RNA-Seq data and are not valid for DNA-Seq data. For such data, a structural variant calling algorithm should be used and the results should be passed to Arriba.


-** OUTPUTS_ **
+**OUTPUTS**
+
+See:  https://arriba.readthedocs.io/en/latest/output-files/

   - fusions.tsv

@@ -403,22 +799,27 @@
     The file fusions.discarded.tsv (as specified by the parameter -O) contains all events that Arriba classified as an artifact or that are also observed in healthy tissue. It has the same format as the file fusions.tsv.


-** VISUALIZATION_ **
+**VISUALIZATION**
+
+See: https://arriba.readthedocs.io/en/latest/visualization/
+
   - fusions.pdf

     A PDF file with one page for each predicted fusion. Each page depicts the fusion partners, their orientation, the retained exons in the fusion transcript, statistics about the number of supporting reads, and if the column fusion_transcript has a value an excerpt of the sequence around the breakpoint.


-Code repository: https://github.com/suhrig/arriba
-Get help/report bugs: https://github.com/suhrig/arriba/issues
-User manual: https://arriba.readthedocs.io/
-Please cite: https://doi.org/10.1101/gr.257246.119
+**OPTIONS**
+
+  - Arriba: https://arriba.readthedocs.io/en/latest/command-line-options/#arriba
+  - Visualization: https://arriba.readthedocs.io/en/latest/command-line-options/#draw_fusionsr
+  - RNA STAR: https://arriba.readthedocs.io/en/latest/workflow/


 .. _Arriba: https://arriba.readthedocs.io/en/latest/
 .. _INPUTS: https://arriba.readthedocs.io/en/latest/input-files/
 .. _OUTPUTS: https://arriba.readthedocs.io/en/latest/output-files/
 .. _VISUALIZATION: https://arriba.readthedocs.io/en/latest/visualization/
+.. _OPTIONS: https://arriba.readthedocs.io/en/latest/command-line-options/

     ]]></help>
     <expand macro="citations" />