Mercurial > repos > jjohnson > arriba_draw_fusions
changeset 0:e6f0f0da3b61 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/arriba commit c1d05da7c2c76feae94cbc640be7b010f31397d2-dirty"
author | jjohnson |
---|---|
date | Fri, 11 Feb 2022 19:08:17 +0000 |
parents | |
children | 97958f5c2c52 |
files | arriba_draw_fusions.xml macros.xml static/images/draw-fusions-example.png test-data/genome.fasta.gz test-data/genome.gtf.gz tool-data/arriba_indexes.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 8 files changed, 358 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arriba_draw_fusions.xml Fri Feb 11 19:08:17 2022 +0000
@@ -0,0 +1,120 @@
+<tool id="arriba_draw_fusions" name="Arriba Draw Fusions" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5">
+    <!-- Galaxy wrapper around draw_fusions.R from Arriba; tokens and parameter macros are imported from macros.xml. -->
+    <description></description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="version_command" />
+    <command detect_errors="exit_code"><![CDATA[
+    ## Resolve the genome assembly for both the history and the built-in genome source.
+    @GENOME_SOURCE@
+    #if $alignments.extension == 'sam'
+        ## SAM input: build the coordinate-sorted, indexed BAM that is handed to draw_fusions.R.
+        ln -sf '$genome_assembly' input.fa &&
+        samtools faidx input.fa &&
+        samtools view -b -@ \${GALAXY_SLOTS:-1} -t input.fa.fai '$alignments' |
+        samtools sort -O bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o Aligned.sortedByCoord.out.bam &&
+        samtools index Aligned.sortedByCoord.out.bam &&
+    #else
+        ## BAM input: link the dataset and its index under the file names used by the drawing step.
+        ln -sf '${alignments}' 'Aligned.sortedByCoord.out.bam' &&
+        ln -sf '$alignments.metadata.bam_index' 'Aligned.sortedByCoord.out.bam.bai' &&
+    #end if
+    @DRAW_FUSIONS@
+    ]]></command>
+    <inputs>
+        <param argument="--fusions" type="data" format="tabular" label="Arriba fusions.tsv"/>
+        <param argument="--alignments" type="data" format="sam,bam" label="STAR Aligned.out.bam"/>
+        <expand macro="genome_source" assembly_optional="true"/>
+        <param name="protein_domains" argument="-p" type="data" format="gff3" optional="true" label="File containing protein domains"/>
+        <section name="visualization" expanded="true" title="Visualization Options">
+            <expand macro="visualization_options" />
+        </section>
+    </inputs>
+    <outputs>
+        <!-- The drawing step always runs, so no output filter is needed (this tool defines no do_viz parameter). -->
+        <data name="fusions_pdf" format="pdf" label="${tool.name} on ${on_string}: fusions.pdf" from_work_dir="fusions.pdf"/>
+    </outputs>
+    <tests>
+        <!-- Test 1 - From SAM alignments, converted to a sorted and indexed BAM -->
+        <test>
+            <param name="fusions" ftype="tabular" value="fusions.tsv"/>
+            <param name="alignments" ftype="sam" value="Aligned.out.sam"/>
+            <conditional name="genome">
+                <param name="genome_source" value="history"/>
+                <param name="assembly" ftype="fasta" value="genome.fasta.gz"/>
+                <param name="annotation" ftype="gtf" value="genome.gtf.gz"/>
+            </conditional>
+            <param name="protein_domains" ftype="gff3" value="protein_domains.gff3"/>
+            <section name="visualization">
+                <param name="cytobands" ftype="tabular" value="cytobands.tsv"/>
+            </section>
+            <output name="fusions_pdf">
+                <assert_contents>
+                    <has_size value="64000" delta="5000" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**Arriba Draw Fusions**
+
+Arriba_Draw_Fusions_ (draw_fusions.R) renders publication-quality visualizations of the transcripts involved in predicted fusions. It generates a PDF file with one page for each predicted fusion. Each page depicts the fusion partners, their orientation, the retained exons in the fusion transcript, statistics about the number of supporting reads, and - if the column fusion_transcript has a value - an excerpt of the sequence around the breakpoint.
+
+
+**INPUTS**
+
+See: https://arriba.readthedocs.io/en/latest/command-line-options/#draw_fusionsr
+
+ - Fusions
+
+   File containing fusion predictions from Arriba_ (fusions.tsv) or STAR-Fusion (star-fusion.fusion_predictions.tsv or star-fusion.fusion_predictions.abridged.coding_effect.tsv).
+
+ - Annotation
+
+   Gene annotation in GTF format that was used by the STAR aligner.
+
+ - Alignments
+
+   SAM or BAM file containing normal alignments from STAR. SAM input is converted to a sorted, indexed BAM file before drawing.
+
+ - Assembly (Optional)
+
+   Only required when the alignments are in SAM format rather than sorted BAM format. The genome assembly will be used by samtools to produce a sorted BAM file.
+
+ - Protein domains (Optional)
+
+   GFF3 file containing the genomic coordinates of protein domains. Distributions of Arriba offer protein domain annotations for all supported assemblies in the database directory. When this file is given, a plot is generated, which shows the protein domains retained in the fusion transcript.
+
+ - Cytobands (Optional)
+
+   Coordinates of the Giemsa staining bands. This information is used to draw ideograms. If the argument is omitted, then no ideograms are rendered. The file must have the following columns: contig, start, end, name, giemsa. Recognized values for the Giemsa staining intensity are: gneg, gpos followed by a percentage, acen, stalk. Cytobands for human and mouse references can be retrieved from the Arriba distribution with the **Arriba Get Filters** tool.
+
+
+**OPTIONS**
+
+See: https://arriba.readthedocs.io/en/latest/command-line-options/#draw_fusionsr
+
+
+**OUTPUTS**
+
+See: https://arriba.readthedocs.io/en/latest/visualization/
+
+ - fusions.pdf
+
+   A PDF file with one page for each predicted fusion. Each page depicts the fusion partners, their orientation, the retained exons in the fusion transcript, statistics about the number of supporting reads, and if the column fusion_transcript has a value an excerpt of the sequence around the breakpoint.
+
+.. image:: draw-fusions-example.png
+   :width: 800
+   :height: 467
+
+
+
+
+.. _Arriba_Draw_Fusions: https://arriba.readthedocs.io/en/latest/visualization/
+.. _Arriba: https://arriba.readthedocs.io/en/latest/
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Fri Feb 11 19:08:17 2022 +0000
@@ -0,0 +1,220 @@
+<macros>
+    <!-- Shared version tokens, parameter macros and command fragments for the Arriba tools. -->
+    <token name="@TOOL_VERSION@">2.2.1</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">arriba</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1101/gr.257246.119</citation>
+            <yield />
+        </citations>
+    </xml>
+    <xml name="version_command">
+        <version_command>arriba -h | grep Version | sed 's/^.* //'</version_command>
+    </xml>
+    <!-- Genome selector: FASTA and GTF datasets from the history, or a row of the arriba_indexes data table. -->
+    <xml name="genome_source" token_assembly_optional="false" >
+        <conditional name="genome">
+            <param name="genome_source" type="select" label="Arriba Genome assembly and annotation source">
+                <option value="history">From your history</option>
+                <option value="cached">Use built-in Arriba</option>
+            </param>
+            <when value="history">
+                <param name="assembly" argument="-a" type="data" format="fasta" optional="@ASSEMBLY_OPTIONAL@" label="Genome assembly fasta"/>
+                <param name="annotation" argument="-g" type="data" format="gtf" label="Gene annotation in GTF format"/>
+            </when>
+            <when value="cached">
+                <param name="arriba_ref" type="select" label="Arriba Genome assembly and annotation">
+                    <options from_data_table="arriba_indexes">
+                    </options>
+                </param>
+            </when>
+        </conditional>
+    </xml>
+    <!-- Cheetah fragment that sets genome_assembly (history source: only when a FASTA was supplied) and genome_annotation. -->
+    <token name="@GENOME_SOURCE@">
+#if str($genome.genome_source) == "history"
+    #if $genome.assembly
+        #set $genome_assembly = $genome.assembly
+    #end if
+    #set $genome_annotation = $genome.annotation
+#else
+    #set $genome_assembly = $genome.arriba_ref.fields.fasta
+    #set $genome_annotation = $genome.arriba_ref.fields.gtf
+#end if
+</token>
+
+    <!-- Parameters of draw_fusions.R; every option is optional and, when left unset, is not passed on the command line. -->
+    <xml name="visualization_options">
+        <param name="cytobands" argument="--cytobands" type="data" format="tabular" optional="true" label="Cytobands"/>
+        <section name="options" expanded="false" title="Draw Fusion Options">
+            <param argument="--transcriptSelection" type="select" optional="true" label="Transcript selection">
+                <help>By default the transcript isoform with the highest coverage is drawn.
+                    Alternatively, the transcript isoform that is provided in the columns
+                    transcript_id1 and transcript_id2 in the given fusions file can be drawn.
+                    Selecting the isoform with the highest coverage usually produces nicer plots,
+                    in the sense that the coverage track is smooth and shows a visible increase in coverage after the fusion breakpoint.
+                    However, the isoform with the highest coverage may not be the one that is involved in the fusion.
+                    Often, genomic rearrangements lead to non-canonical isoforms being transcribed.
+                    For this reason, it can make sense to rely on the transcript selection provided by the columns transcript_id1/2,
+                    which reflect the actual isoforms involved in a fusion.
+                    As a third option, the transcripts that are annotated as canonical can be drawn.
+                    Transcript isoforms tagged with appris_principal, appris_candidate, or CCDS are considered canonical.
+                </help>
+                <option value="coverage">coverage</option>
+                <option value="provided">provided</option>
+                <option value="canonical">canonical</option>
+            </param>
+            <param argument="--minConfidenceForCircosPlot" type="select" optional="true" label="Minimum confidence for circos plot">
+                <help>The fusion of interest is drawn as a solid line in the circos plot.
+                    To give an impression of the overall degree of rearrangement,
+                    all other fusions are drawn as semi-transparent lines in the background.
+                    This option determines which other fusions should be included in the circos plot.
+                    Values specify the minimum confidence a fusion must have to be included.
+                    It usually makes no sense to include low-confidence fusions in circos plots,
+                    because they are abundant and unreliable, and would clutter up the circos plot.
+                    Default: medium
+                </help>
+                <option value="none">none - only the fusion of interest is drawn</option>
+                <option value="low">low</option>
+                <option value="medium">medium</option>
+                <option value="high">high</option>
+            </param>
+            <param argument="--showIntergenicVicinity" type="integer" value="" min="0" optional="true" label="Intergenic Vicinity">
+                <help>This option only applies to intergenic breakpoints.
+                    If it is set to a value greater than 0, then the script draws the genes
+                    which are no more than the given distance away from an intergenic breakpoint.
+                    Note that this option is incompatible with squishIntrons.
+                    Default: 0
+                </help>
+            </param>
+            <param argument="--squishIntrons" type="select" optional="true" label="Squish introns">
+                <help>Exons usually make up only a small fraction of a gene.
+                    They may be hard to see in the plot.
+                    Since introns are in most situations of no interest in the context of gene fusions,
+                    this switch can be used to shrink the size of introns to a fixed, negligible size.
+                    It makes sense to disable this feature, if breakpoints in introns are of importance.
+                    Default: TRUE
+                </help>
+                <option value="TRUE">True</option>
+                <option value="FALSE">False</option>
+            </param>
+
+            <param argument="--mergeDomainsOverlappingBy" type="float" value="" min="0." max="1.0" optional="true" label="Merge Domains Overlapping By">
+                <help>Occasionally, domains are annotated redundantly.
+                    For example, tyrosine kinase domains are frequently annotated as
+                    Protein tyrosine kinase and Protein kinase domain.
+                    In order to simplify the visualization, such domains can be merged into one,
+                    given that they overlap by the given fraction.
+                    The description of the larger domain is used.
+                    Default: 0.9
+                </help>
+            </param>
+            <param argument="--printExonLabels" type="select" optional="true" label="Print Exon Labels">
+                <help>By default the number of an exon is printed inside each exon,
+                    which is taken from the attribute exon_number of the GTF annotation.
+                    When a gene has many exons, the boxes may be too narrow to contain the labels,
+                    resulting in unreadable exon labels. In these situations,
+                    it may be better to turn off exon labels.
+                    Default: TRUE
+                </help>
+                <option value="TRUE">True</option>
+                <option value="FALSE">False</option>
+            </param>
+            <param argument="--render3dEffect" type="select" optional="true" label="Render 3D effect">
+                <help>Whether light and shadow should be rendered to give objects a 3D effect.
+                    Default: TRUE
+                </help>
+                <option value="TRUE">True</option>
+                <option value="FALSE">False</option>
+            </param>
+            <param argument="--optimizeDomainColors" type="select" optional="true" label="Optimize Domain Colors">
+                <help>By default, the script colorizes domains according to the colors
+                    specified in the file given in --annotation.
+                    This way, coloring of domains is consistent across all proteins.
+                    But since there are more distinct domains than colors,
+                    this can lead to different domains having the same color.
+                    If this option is set to TRUE, the colors are recomputed for each fusion separately.
+                    This ensures that the colors have the maximum distance for each individual fusion,
+                    but they are no longer consistent across different fusions.
+                    Default: FALSE
+                </help>
+                <option value="TRUE">True</option>
+                <option value="FALSE">False</option>
+            </param>
+            <param argument="--color1" type="color" value="" optional="true" label="Color of the 5' end of the fusion."/>
+            <param argument="--color2" type="color" value="" optional="true" label="Color of the 3' end of the fusion."/>
+            <param argument="--pdfWidth" type="float" value="" min="1." optional="true" label="Width of PDF output file in inches"
+                   help="Default: 11.692"/>
+            <param argument="--pdfHeight" type="float" value="" min="1." optional="true" label="Height of PDF output file in inches"
+                   help="Default: 8.267"/>
+            <param argument="--fontSize" type="float" value="" min="0." optional="true" label="Scale the size of text"
+                   help="Default: 1.0"/>
+        </section>
+    </xml>
+    <!-- draw_fusions.R command line; expects the alignments to be available as Aligned.sortedByCoord.out.bam in the working directory. -->
+    <token name="@DRAW_FUSIONS@">
+draw_fusions.R
+    --fusions='$fusions'
+    --alignments='Aligned.sortedByCoord.out.bam'
+    ## The GTF comes from the history dataset or from the selected data table row.
+    #if str($genome.genome_source) == "history"
+    --annotation='$genome.annotation'
+    #else
+    --annotation='$genome.arriba_ref.fields.gtf'
+    #end if
+    --output=fusions.pdf
+    #if $visualization.cytobands
+    --cytobands='$visualization.cytobands'
+    #end if
+    #if $protein_domains
+    --proteinDomains='$protein_domains'
+    #end if
+    ## Visualization Options
+    #if $visualization.options.transcriptSelection
+    --transcriptSelection=$visualization.options.transcriptSelection
+    #end if
+    #if $visualization.options.minConfidenceForCircosPlot
+    --minConfidenceForCircosPlot=$visualization.options.minConfidenceForCircosPlot
+    #end if
+    ## Numeric options are compared as strings so that an explicitly entered 0 is still passed on.
+    #if str($visualization.options.showIntergenicVicinity) != ''
+    --showIntergenicVicinity=$visualization.options.showIntergenicVicinity
+    #end if
+    #if $visualization.options.squishIntrons
+    --squishIntrons=$visualization.options.squishIntrons
+    #end if
+    #if str($visualization.options.mergeDomainsOverlappingBy) != ''
+    --mergeDomainsOverlappingBy=$visualization.options.mergeDomainsOverlappingBy
+    #end if
+    #if $visualization.options.printExonLabels
+    --printExonLabels=$visualization.options.printExonLabels
+    #end if
+    #if $visualization.options.render3dEffect
+    --render3dEffect=$visualization.options.render3dEffect
+    #end if
+    #if $visualization.options.optimizeDomainColors
+    --optimizeDomainColors=$visualization.options.optimizeDomainColors
+    #end if
+    #if $visualization.options.color1
+    --color1='$visualization.options.color1'
+    #end if
+    #if $visualization.options.color2
+    --color2='$visualization.options.color2'
+    #end if
+    #if str($visualization.options.pdfWidth) != ''
+    --pdfWidth=$visualization.options.pdfWidth
+    #end if
+    #if str($visualization.options.pdfHeight) != ''
+    --pdfHeight=$visualization.options.pdfHeight
+    #end if
+    #if str($visualization.options.fontSize) != ''
+    --fontSize=$visualization.options.fontSize
+    #end if
+</token>
+</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/arriba_indexes.loc.sample Fri Feb 11 19:08:17 2022 +0000 @@ -0,0 +1,17 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Arriba data files. +#The Arriba script download_references.sh retrieves a genome assembly fasta +#and a related GTF annotation file, then builds a STAR index. +#You will need to create these data files and then create an +#arriba_indexes.loc similar to this one (store it in this +#directory) that points to the directories in which those files are stored. +#The arriba_indexes.loc file has this format (longer white space +#characters are TAB characters): +# +#<unique_build_id> <display_name> <genome_fasta_path> <genome_gtf_path> <STAR_index_path> +# +#Note that STAR indices can become quite large. +# +#<unique_build_id> <display_name> <genome_fasta_path> <genome_gtf_path> <STAR_index_path> +#GRCh38+ENSEMBL93 GRCh38+ENSEMBL93 /depot/GRCh38+ENSEMBL93/genome.fa /depot/GRCh38+ENSEMBL93/genome.gtf /depot/GRCh38+ENSEMBL93/STAR_index/ +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Fri Feb 11 19:08:17 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Arriba references: each row names a build and gives its genome FASTA, GTF annotation and STAR index paths. -->
+    <table name="arriba_indexes" allow_duplicate_entries="False" comment_char="#">
+        <columns>value, name, fasta, gtf, star_index</columns>
+        <file path="tool-data/arriba_indexes.loc" />
+    </table>
+</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test Fri Feb 11 19:08:17 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Test-time variant of the arriba_indexes table; its .loc file is read from the test-data directory. -->
+    <table name="arriba_indexes" allow_duplicate_entries="False" comment_char="#">
+        <columns>value, name, fasta, gtf, star_index</columns>
+        <file path="${__HERE__}/test-data/arriba_indexes.loc" />
+    </table>
+</tables>