view fuma.xml @ 9:b21145d59d9c draft default tip

planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/fuma commit 7bf6f3629e96bcaaed8770badd11366889efa99c
author erasmusmc
date Mon, 23 Jan 2017 09:41:08 -0500
parents 8366a8c82a7a
children
line wrap: on
line source

<?xml version="1.0" encoding="UTF-8"?>
<tool id="fuma" name="FuMa" version="3.0.5-g0">
    <description>match detected fusion genes based on gene names (in particular for RNA-Seq)</description>

    <!-- Runtime dependency: the fuma package, pinned to version 3.0.5 -->
    <requirements>
        <requirement type="package" version="3.0.5">fuma</requirement>
    </requirements>

    <!-- stderr is merged into stdout and only the first line of the version output is kept -->
    <version_command>fuma --version 2>&amp;1 | head -n 1</version_command><!-- -V also works, but is not GNU standard -->

    <command><![CDATA[
        ## FuMa receives three parallel argument lists:
        ##   -a  alias:path         gene annotation (BED) files, aliased as ga_0, ga_1, ...
        ##   -s  name:format:path   fusion gene datasets
        ##   -l  name:alias         links every dataset to its gene annotation alias
        ##
        ## Every list entry is shell-quoted exactly once, as a complete token.
        ## Quoting the dataset name on its own and afterwards wrapping the token in
        ## a second pair of single quotes cancels the quoting for names that contain
        ## spaces or other shell metacharacters, so the argument would be split.
        ##
        ## NOTE(review): a ':' inside a dataset name is passed through unchanged and
        ## presumably still collides with the ':' field separator used above; confirm
        ## how FuMa parses such names.
        #import pipes

        #set $gene_annotations = []
        #set $samples = []
        #set $links = []

        #for $i, $d in enumerate( $datasets )
            #set $sample_name = str($d['sample'].name)
            #set $ga_alias = "ga_" + str($i)

            #set $gene_annotations = $gene_annotations + [ pipes.quote($ga_alias + ":" + str($d['gene_annotation'].file_name)) ]
            #set $samples = $samples + [ pipes.quote($sample_name + ":" + str($d['format']) + ":" + str($d['sample'].file_name)) ]
            #set $links = $links + [ pipes.quote($sample_name + ":" + $ga_alias) ]
        #end for

        #set $gene_annotations_str = " ".join($gene_annotations)
        #set $samples_str = " ".join($samples)
        #set $links_str = " ".join($links)

        fuma
          -m
            $params.matching_method

          $params.strand_specific_matching
          $params.acceptor_donor_order_specific_matchig

          --long-gene-size $params.long_gene_size

          -a
            $gene_annotations_str
          -s
            $samples_str
          -l
            $links_str
        ## "list_boolean" is not a FuMa format: FuMa writes a plain list that is
        ## converted by fuma-list-to-boolean-list further below.
        #if $params.output_format.value == "list_boolean"
          -f list
        #else
          -f $params.output_format.value
        #end if
          -o '${fuma_overview}'


        #if $params.output_format.value == "list_boolean"
            && fuma-list-to-boolean-list -o tmp.txt '${fuma_overview}'
            && mv tmp.txt '${fuma_overview}'
        #end if
    ]]></command>
    
    <inputs>
        <!-- One repeat block per fusion gene dataset; at least two are required -->
        <repeat name="datasets" title="FusionGene Datasets" min="2">
            <param name="sample" type="data" format="txt,tabular" label="Dataset (RNA-Seq fusion gene detection experiment)" />
            <param name="format" type="select" label="Format of dataset">
                <option value="chimera">Chimera prettyPrint()</option>
                <option value="chimerascan">ChimeraScan</option>
                <option value="complete-genomics">Complete Genomics var/mastervar</option>
                <option value="defuse">DeFuse</option>
                <option value="ericscript">EricScript (.results.total.txt)</option>
                <option value="fusion-catcher_final">Fusion Catcher (final-list file)</option>
                <option value="fusionmap">FusionMap</option>
                <option value="trinity-gmap">GMAP (As step after Trinity)</option>
                <option value="oncofuse">OncoFuse</option>
                <option value="soapfuse-final-gene">SOAPFuse (final.*.for.genes.txt)</option>
                <option value="soapfuse-final-transcript">SOAPFuse (final.*.for.trans.txt)</option>
                <option value="rna-star_chimeric">STAR (chimeric file)</option>
                <option value="star-fusion_final">STAR-Fusion (candidates.final)</option>
                <option value="tophat-fusion_pre">Tophat Fusion Pre (fusions.out)</option>
                <option value="tophat-fusion_post_potential_fusion">Tophat Fusion Post (potential_fusion.txt)</option>
                <option value="tophat-fusion_post_result">Tophat Fusion Post (result.txt)</option>
                <option value="tophat-fusion_post_result_html">Tophat Fusion Post (result.html)</option>
            </param>
            <param name="gene_annotation" type="data" format="bed" label="Corresponding gene-name annotation file (BED format)" help="Make use of persistent gene annotations! Gene annotations should only be different if different reference genome builds were used" />
        </repeat>

        <conditional name="params">
            <param name="settingsType" type="select" label="Settings to use" help="You can use the default settings or set custom values for any FuMa parameter">
                <option value="preSet" selected="true">Use Defaults</option>
                <option value="full">Full parameter list</option>
            </param>
            <!--
                The command template reads every params.* value unconditionally, so the
                preSet branch has to define all of them. The hidden values mirror the
                defaults preselected in the "full" branch below.
            -->
            <when value="preSet">
                <param name="matching_method" type="hidden" value="subset" />
                <param name="strand_specific_matching" type="hidden" value="--strand-specific-matching" />
                <param name="acceptor_donor_order_specific_matchig" type="hidden" value="--no-acceptor-donor-order-specific-matching" />
                <param name="long_gene_size" type="hidden" value="200000" />
                <param name="output_format" type="hidden" value="list_boolean" />
            </when>
            <when value="full">
                <param name="matching_method" type="select" label="Matching method: technique used to match fusion genes based on annotated gene sets"
                       help="Overlap is the most sensitive but also more sensitive for long gene artefacts; subset is the recommended technique and EGM is conservative"
                       argument="-m">
                    <option value="overlap">Overlap</option>
                    <option value="subset" selected="True">Subset</option>
                    <option value="egm">Exact Geneset Matching (EGM)</option>
                </param>

                <param name="strand_specific_matching" type="boolean" checked="True"
                       truevalue="--strand-specific-matching"
                       falsevalue="--no-strand-specific-matching"
                       label="Consider fusion genes distinct when the breakpoints have different strands: (A&lt;-,B&lt;-) != (-&gt;A,B&lt;-); default"
                       help="Only a limited number of file formats support this feature" />
                <!-- The parameter name carries a historical typo ("matchig"); it is kept because the command template and the tests refer to it -->
                <param name="acceptor_donor_order_specific_matchig" type="boolean" checked="False"
                       truevalue="--acceptor-donor-order-specific-matching"
                       falsevalue="--no-acceptor-donor-order-specific-matching"
                       label="Consider fusion genes distinct when the donor and acceptor sites are swapped (A,B) != (B,A)"
                       help="This settings is not recommended when fusion genes detected in DNA-Seq are used" />

                <param name="long_gene_size" type="integer" min="0" value="200000" label="Long gene size"
                       help="Gene-name based matching is more sensitive to long genes. This is the gene size used to mark fusion genes spanning a 'long gene' as reported the output. Use 0 to disable this feature"
                       argument="--long-gene-size" />

                <!-- "list_boolean" is produced by post-processing the "list" output in the command template -->
                <param name="output_format" type="select" label="Output format">
                    <option value="list_boolean" selected="true">List (Boolean)</option>
                    <option value="list">List</option>
                    <option value="summary">Count summary</option>
                </param>
            </when>
        </conditional>
    </inputs>
    
    <outputs>
        <!-- Single tabular result. The label lists the history id and name of every input sample after the tool name. -->
        <data format="tabular" name="fuma_overview" label="${tool.name} on ${', '.join([ str(d['sample'].hid)+': '+d['sample'].name for d in $datasets ])}" />
    </outputs>
    
    <tests>
        <!-- Large tests, take long
        <test>
            <param name="datasets_0|sample" value="edgren_chimerascan.txt" ftype="tabular" />
            <param name="datasets_0|format" value="chimerascan" />
            <param name="datasets_0|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="datasets_1|sample" value="edgren_defuse.txt" ftype="tabular" />
            <param name="datasets_1|format" value="defuse" />
            <param name="datasets_1|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="datasets_2|sample" value="edgren_fusion-map.txt" ftype="tabular" />
            <param name="datasets_2|format" value="fusionmap" />
            <param name="datasets_2|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="datasets_3|sample" value="edgren_true_positives.txt" ftype="tabular" />
            <param name="datasets_3|format" value="fusionmap" />
            <param name="datasets_3|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="settingsType" value="full" />

            <param name="matching_method" value="subset" />
            <param name="strand_specific_matching" value="true" />
            <param name="acceptor_donor_order_specific_matchig" value="true" />
            <param name="long_gene_size" value="200000" />
            <param name="output_format" value="list_boolean" />

            <output name="fuma_overview" file="edgren_test_01_specifc_matching_output.txt" />
        </test>
        <test>
            <param name="datasets_0|sample" value="edgren_fusion-map.txt" ftype="tabular" />
            <param name="datasets_0|format" value="fusionmap" />
            <param name="datasets_0|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="datasets_1|sample" value="edgren_true_positives.txt" ftype="tabular" />
            <param name="datasets_1|format" value="fusionmap" />
            <param name="datasets_1|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="datasets_2|sample" value="edgren_chimerascan.txt" ftype="tabular" />
            <param name="datasets_2|format" value="chimerascan" />
            <param name="datasets_2|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="datasets_3|sample" value="edgren_defuse.txt" ftype="tabular" />
            <param name="datasets_3|format" value="defuse" />
            <param name="datasets_3|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="settingsType" value="full" />

            <param name="matching_method" value="subset" />
            <param name="strand_specific_matching" value="false" />
            <param name="acceptor_donor_order_specific_matchig" value="false" />
            <param name="long_gene_size" value="200000" />
            <param name="output_format" value="list_boolean" />

            <output name="fuma_overview" file="edgren_test_02_unspecifc_matching_output.txt" />
        </test>
        -->
        <!-- Boolean parameters are given as true/false; Galaxy substitutes the truevalue or falsevalue flag itself -->
        <test>
            <param name="datasets_0|sample" value="edgren_fusion-map.txt" ftype="tabular" />
            <param name="datasets_0|format" value="fusionmap" />
            <param name="datasets_0|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="datasets_1|sample" value="edgren_true_positives.txt" ftype="tabular" />
            <param name="datasets_1|format" value="fusionmap" />
            <param name="datasets_1|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="settingsType" value="full" />

            <param name="matching_method" value="subset" />
            <param name="strand_specific_matching" value="true" /><!-- Set to true, automatically sets the true value -->
            <param name="acceptor_donor_order_specific_matchig" value="true" /><!-- Set to true, automatically sets the true value -->
            <param name="long_gene_size" value="200000" />
            <param name="output_format" value="list_boolean" />

            <output name="fuma_overview" file="edgren_test_03_specific_matching_output.txt" />
        </test>
        <test>
            <param name="datasets_0|sample" value="edgren_fusion-map.txt" ftype="tabular" />
            <param name="datasets_0|format" value="fusionmap" />
            <param name="datasets_0|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="datasets_1|sample" value="edgren_true_positives.txt" ftype="tabular" />
            <param name="datasets_1|format" value="fusionmap" />
            <param name="datasets_1|gene_annotation" value="refseq_genes_hg19.bed" ftype="bed" />

            <param name="settingsType" value="full" />

            <param name="matching_method" value="subset" />
            <param name="strand_specific_matching" value="false" /><!-- Set to false, automatically sets the false value -->
            <param name="acceptor_donor_order_specific_matchig" value="false" /><!-- Set to false, automatically sets the false value -->
            <param name="long_gene_size" value="200000" />
            <param name="output_format" value="list_boolean" />

            <output name="fuma_overview" file="edgren_test_04_unspecific_matching_output.txt" />
        </test>
    </tests>
    
    <help><![CDATA[
============
Introduction
============

A new generation of tools that identify fusion genes in RNA-seq data is limited in sensitivity and/or specificity. To allow further downstream analysis and to estimate performance, predicted fusion genes from different tools have to be compared. However, the transcriptomic context complicates genomic location-based matching. FusionMatcher (FuMa) is a program that reports identical fusion genes based on gene-name annotations. FuMa automatically compares and summarizes all combinations of two or more datasets in a single run, without additional programming necessary. FuMa uses one gene annotation, avoiding mismatches caused by tool specific gene annotations. FuMa matches 10% more fusion genes compared to exact gene matching (EGM) due to overlapping genes and accepts intermediate output files that allow a step wise analysis of corresponding tools.

=====
Usage
=====

After you have uploaded the results of your Fusion Gene detection experiment, and selected the format to be *tabular*, you can start the FuMa wrapper. For each dataset you simply have to add another repeat. Then you have to select a corresponding format:

*******
Formats
*******

+-------------------+-----------------------+-------------------------------------+
|Tools              | File                  | Format string                       |
+===================+=======================+=====================================+
|Chimera            | prettyPrint() output  | chimera                             |
+-------------------+-----------------------+-------------------------------------+
|ChimeraScan        | chimeras.bedpe        | chimerascan                         |
+-------------------+-----------------------+-------------------------------------+
|Complete Genomics  | highConfidenceJu*.tsv | complete-genomics                   |
+-------------------+-----------------------+-------------------------------------+
|Complete Genomics  | allJunctionsBeta*.tsv | complete-genomics                   |
+-------------------+-----------------------+-------------------------------------+
|DeFuse             | results.txt           | defuse                              |
+-------------------+-----------------------+-------------------------------------+
|DeFuse             | results.classify.txt  | defuse                              |
+-------------------+-----------------------+-------------------------------------+
|DeFuse             | results.filtered.txt  | defuse                              |
+-------------------+-----------------------+-------------------------------------+
|EricScript         | .results.total.txt    | ericscript *************            |
+-------------------+-----------------------+-------------------------------------+
|Fusion Catcher     | final-list_cand*.txt  | fusion-catcher_final                |
+-------------------+-----------------------+-------------------------------------+
|FusionMap          |                       | fusionmap                           |
+-------------------+-----------------------+-------------------------------------+
|Trinity + GMAP     |                       | trinity-gmap                        |
+-------------------+-----------------------+-------------------------------------+
|OncoFuse           |                       | oncofuse                            |
+-------------------+-----------------------+-------------------------------------+
|RNA STAR           | Chimeric.out.junction | rna-star_chimeric                   |
+-------------------+-----------------------+-------------------------------------+
|SOAPFuse           | final.*.for.genes.txt | soapfuse-final-gene                 |
+-------------------+-----------------------+-------------------------------------+
|SOAPFuse           | final.*.for.trans.txt | soapfuse-final-transcript           |
+-------------------+-----------------------+-------------------------------------+
|STAR Fusion        | _candidates.final     | star-fusion_final                   |
+-------------------+-----------------------+-------------------------------------+
|TopHat Fusion pre  | fusions.out           | tophat-fusion_pre                   |
+-------------------+-----------------------+-------------------------------------+
|TopHat Fusion post | potential_fusion.txt  | tophat-fusion_post_potential_fusion |
+-------------------+-----------------------+-------------------------------------+
|TopHat Fusion post | result.txt            | tophat-fusion_post_result           |
+-------------------+-----------------------+-------------------------------------+
|TopHat Fusion post | result.html           | tophat-fusion_post_result_html      |
+-------------------+-----------------------+-------------------------------------+

************* EricScript often contains entries with unknown breakpoints.
Because no genomic coordinates are given those fusion genes can not be
imported into FuMa and only those with breakpoints will be taken into account.



To annotate genes upon the breakpoints you must provide a BED file that contains gene annotations for the genome build that was used. Make sure **your BED file contains one gene per line**. You should use BED files that contain one exon per line only if you want to restrict your analysis to fusion genes detected within exons.

UCSC genome browser provides a very simple way of obtaining BED files with one gene per line by selecting their *RefSeq Genes*-track and *knownGene*-table and putting the export format to BED. Galaxy should have a built-in UCSC table browser.

    ]]></help>
    
    <citations>
        <!-- DOI of the publication offered to users as the citation for this tool -->
        <citation type="doi">10.1093/bioinformatics/btv721</citation>
    </citations>
</tool>