arriba: arriba.xml comparison

comparison arriba.xml @ 11:8ed8af5836d1 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/arriba commit e0aa03add09ecc4ad5a5d41c439b8af9551fc53c"

author	jjohnson
date	Tue, 26 Apr 2022 20:21:29 +0000
parents	c58d1774c762
children	73fd7703a743

comparison

equal deleted inserted replaced

-:c58d1774c762
+:8ed8af5836d1
 <tool id="arriba" name="Arriba" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5">
 <description>detect gene fusions from STAR aligned RNA-Seq data</description>
 <macros>
 <import>macros.xml</import>
+<xml name="fusion_actions">
+<actions>
+<action name="comment_lines" type="metadata" default="1" />
+<action name="column_names" type="metadata" default="gene1,gene2,strand1(gene/fusion),strand2(gene/fusion),breakpoint1,breakpoint2,site1,site2,type,split_reads1,split_reads2,discordant_mates,coverage1,coverage2,confidence,reading_frame,tags,retained_protein_domains,closest_genomic_breakpoint1,closest_genomic_breakpoint2,gene_id1,gene_id2,transcript_id1,transcript_id2,direction1,direction2,filters,fusion_transcript,peptide_sequence,read_identifiers" />
+</actions>
+</xml>
 </macros>
 <expand macro="requirements" />
 <expand macro="version_command" />
 <command detect_errors="exit_code"><![CDATA[
 @GENOME_SOURCE@
 #if str($input_params.index.index_source) == "history"
 #set $star_index_dir = $input_params.index.star_index.extra_files_path
 #else
 #set $star_index_dir = $input_params.index.arriba_ref.fields.star_index
 #end if
+#if $blacklist
+#if $blacklist.is_of_type('tabular.gz')
+#set $blacklist_file = 'blacklist.tsv.gz'
+ln -sf '$blacklist'  $blacklist_file &&
+#else
+#set $blacklist_file = $blacklist
+#end if
+#end if
+#if $known_fusions
+#if $known_fusions.is_of_type('tabular.gz')
+#set $known_fusions_file = 'known_fusions.tsv.gz'
+ln -sf '$known_fusions'  $known_fusions_file &&
+#else
+#set $known_fusions_file = $known_fusions
+#end if
+#end if
+#if $tags
+#if $tags.is_of_type('tabular.gz')
+#set $tags_file = 'tags.tsv.gz'
+ln -sf '$tags'  $tags_file &&
+#else
+#set $tags_file = $tags
+#end if
+#end if
 STAR
 --runThreadN \${GALAXY_SLOTS:-1}
 --genomeDir $star_index_dir
 --genomeLoad NoSharedMemory
 --readFilesIn $read1 $read2
 #end if
 #end if
 -a '$genome_assembly'
 -g '$genome_annotation'
 #if $blacklist
--b '$blacklist'
+-b '$blacklist_file'
 #else
 -f 'blacklist'
 #end if
 #if $protein_domains
 -p '$protein_domains'
 #end if
 #if $known_fusions
--k '$known_fusions'
+-k '$known_fusions_file'
 #end if
 #if $tags
--t '$tags'
+-t '$tags_file'
 #end if
 #if str($wgs.use_wgs) == "yes"
 -d '$wgs.wgs'
 #if $wgs.max_genomic_breakpoint_distance
 -D $wgs.max_genomic_breakpoint_distance
 && samtools index Aligned.sortedByCoord.out.bam
 #elif str($visualization.do_viz) == "yes"
 && samtools sort -@ \${GALAXY_SLOTS:-1} -m 4G -T tmp -O bam '$input_params.input' > Aligned.sortedByCoord.out.bam
 && samtools index Aligned.sortedByCoord.out.bam
 #end if
+#if $output_fusions_vcf
+&& convert_fusions_to_vcf.sh '$genome_assembly' fusions.tsv fusions.vcf
+#end if
+#if $output_fusion_bams
+&& mkdir fusion_bams
+&& extract_fusion-supporting_alignments.sh fusions.tsv Aligned.sortedByCoord.out.bam 'fusion_bams/fusion'
+#end if
 #if str($visualization.do_viz) == "yes"
 #set $fusions = 'fusions.tsv'
 && @DRAW_FUSIONS@
 #end if
 ]]></command>
 <inputs>
 <conditional name="input_params">
 <param name="input_source" type="select" label="Use output from earlier STAR run or let Arriba running STAR">
 <option value="use_star">Use output from earlier STAR</option>
 <option value="use_fastq">Let Arriba control running STAR</option>
 </param>
 <when value="use_star">
-<param name="input" argument="-x" type="data" format="sam,bam,cram" label="STAR Aligned.out.sam"/>
+<param name="input" argument="-x" type="data" format="sam,bam,cram" label="STAR Aligned.out.sam">
+<help><![CDATA[ recommended STAR options: --chimSegmentMin 10 --chimOutType WithinBAM ]]></help>
+</param>
 <param name="chimeric" argument="-c" type="data" format="sam,bam,cram" optional="true" label="STAR Chimeric.out.sam">
 <help><![CDATA[ only required, if STAR was run with the parameter '--chimOutType SeparateSAMold' ]]></help>
 </param>
 </when>
 <when value="use_fastq">
 (denoted as '...'), fill the gaps using the assembly sequence wherever possible.
 </help>
 </param>
 </section>
 <param name="output_fusions_discarded" argument="-O" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="Output fusions.discarded.tsv"/>
+<param name="output_fusions_vcf" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="Output fusions.vcf"/>
+<param name="output_fusion_bams" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output fusion BAMs"/>
 <conditional name="visualization">
 <param name="do_viz" type="select" label="Generate visualization">
 <option value="yes">Yes</option>
 <option value="no">no</option>
 </param>
 <when value="yes">
 <expand macro="visualization_options" />
 </when>
 <when value="no"/>
 </conditional>
 </inputs>
 <outputs>
-<data name="fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.tsv" from_work_dir="fusions.tsv"/>
+<data name="fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.tsv" from_work_dir="fusions.tsv">
+<expand macro="fusion_actions" />
+</data>
 <data name="discarded_fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.discarded.tsv" from_work_dir="fusions.discarded.tsv">
 <filter> output_fusions_discarded == True</filter>
+<expand macro="fusion_actions" />
 </data>
+<data name="fusions_vcf" format="vcf" label="${tool.name} on ${on_string}: fusions.vcf" from_work_dir="fusions.vcf">
+<filter> output_fusions_vcf == True</filter>
+</data>
+<collection name="fusion_bams" type="list" label="${tool.name} on ${on_string}: Fusion Alignments">
+<discover_datasets pattern="(?P&lt;name&gt;fusion_\d+\.bam)$" format="bam" directory="fusion_bams" visible="false"/>
+<filter>output_fusion_bams == True</filter>
+</collection>
 <data name="aligned_bam" format="bam" label="${tool.name} on ${on_string}: Aligned.bam" from_work_dir="Aligned.sortedByCoord.out.bam">
 <filter>input_params['input_source'] == "use_fastq"</filter>
 </data>
 <data name="fusions_pdf" format="pdf" label="${tool.name} on ${on_string}: fusions.pdf" from_work_dir="fusions.pdf">
 <filter>visualization['do_viz'] == "yes"</filter>
 <has_text_matching expression="BCR\tABL1"/>
 </assert_contents>
 </output>
 </test>
 <!-- Test 2 - From existing BAM with protein_domains and visualization -->
 <test>
 <conditional name="input_params">
 <param name="input_source" value="use_star"/>
 <param name="input" ftype="sam" value="Aligned.out.sam"/>
 </conditional>
 - Alignments
 Arriba takes the main output file of STAR (Aligned.out.bam) as input (parameter -x). If STAR was run with the parameter --chimOutType WithinBAM, then this file contains all the information needed by Arriba to find fusions. When STAR was run with the parameter --chimOutType SeparateSAMold, the main output file lacks chimeric alignments. Instead, STAR writes them to a separate output file named Chimeric.out.sam. In this case, the file needs to be passed to Arriba via the parameter -c in addition to the main output file Aligned.out.bam.
+Recommended parameter value when creating the STAR index:
+* --sjdbOverhang 250
+STAR recommended parameter values ::
+* --outSAMunmapped Within
+* --outFilterMultimapNmax 50
+* --peOverlapNbasesMin 10
+* --alignSplicedMateMapLminOverLmate 0.5
+* --alignSJstitchMismatchNmax 5 -1 5 5
+* --chimSegmentMin 10
+* --chimOutType WithinBAM HardClip
+* --chimJunctionOverhangMin 10
+* --chimScoreDropMax 30
+* --chimScoreJunctionNonGTAG 0
+* --chimScoreSeparation 1
+* --chimSegmentReadGapMax 3
+* --chimMultimapNmax 50
 Arriba extracts three types of reads from the alignment file(s):
 * Split-reads, i.e., reads composed of segments which map in a non-linear way. STAR stores such reads as supplementary alignments.
 * Discordant mates, i.e., paired-end reads which originate from the same fragment but which align in a non-linear way.
 * Alignments which cross the boundaries of annotated genes, because these alignments might arise from focal deletions. In RNA-Seq data, deletions of up to several hundred kb are hard to distinguish from splicing. They are represented identically as gapped alignments, because the sizes of many introns are in fact of this order of magnitude. STAR applies a rather arbitrary measure to decide whether a gapped alignment arises from splicing or from a genomic deletion: The parameter --alignIntronMax determines what gap size is still assumed to be a splicing event, and introns are used to represent these gaps. Only gaps larger than this limit are classified as potential evidence for genomic deletions and are stored as chimeric alignments. Most STAR-based fusion detection tools only consider chimeric alignments as evidence for gene fusions and are therefore blind to focal deletions. As a workaround, these tools recommend reducing the value of the parameter --alignIntronMax. But this impairs the quality of alignment, because it reduces the scope that STAR searches to find a spliced alignment. To avoid compromising the quality of alignment for the sake of fusion detection, the only solution would be to run STAR twice - once with settings optimized for regular alignment and once for fusion detection. This would double the runtime. In contrast, Arriba does not require reducing the maximum intron size. It employs a more sensible criterion to distinguish splicing from deletions: Arriba considers all reads that span the boundary of annotated genes as potential evidence for deletions.

Mercurial > repos > jjohnson > arriba

comparison arriba.xml @ 11:8ed8af5836d1 draft