diff featurecounts.xml @ 10:46cccc52be5f draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/featurecounts commit cf1ae941d02bff8848f05c4e4039457656e3a4e8
author iuc
date Sun, 14 Jan 2018 09:23:49 -0500
parents e6a2a912677a
children e803ca6407c0
line wrap: on
line diff
--- a/featurecounts.xml	Fri Nov 17 06:02:56 2017 -0500
+++ b/featurecounts.xml	Sun Jan 14 09:23:49 2018 -0500
@@ -1,17 +1,27 @@
-<tool id="featurecounts" name="featureCounts" version="1.6.0.1" profile="16.04">
+<tool id="featurecounts" name="featureCounts" version="1.6.0.2" profile="16.04">
     <description>Measure gene expression in RNA-Seq experiments from SAM or BAM files.</description>
     <requirements>
         <requirement type="package" version="1.6.0">subread</requirement>
     </requirements>
 
     <version_command>featureCounts -v 2&gt;&amp;1 | grep .</version_command>
-    <command><![CDATA[
+    <command detect_errors="exit_code"><![CDATA[
+        ## Export fc path for its built-in annotation
+        export FC_PATH=\$(command -v featureCounts | sed 's@/bin/featureCounts$@@') &&
+
         ## Check whether all alignments are from the same type (bam || sam)
         featureCounts
-            #if $gtf_source.ref_source=="history":
-                -a '$gtf_source.reference_gene_sets'
-            #else:
-                -a '$gtf_source.reference_gene_sets_builtin.fields.path'
+
+            #if $anno.anno_select=="gtf":
+                #if $anno.gtf_source.ref_source=="history":
+                    -a '$anno.gtf_source.reference_gene_sets'
+                #else:
+                    -a '$anno.gtf_source.reference_gene_sets_builtin.fields.path'
+                #end if
+                -F "GTF"
+            #elif $anno.anno_select=="builtin":
+                -a \${FC_PATH}/annotation/${anno.genome}_RefSeq_exon.txt
+                -F "SAF"
             #end if
 
             -o "output"
@@ -24,13 +34,13 @@
             -s  $extended_parameters.strand_specificity
                 $extended_parameters.multimapping_enabled.multimapping_counts
 
-                #if str($extended_parameters.multimapping_enabled.multimapping_counts) == " -M"
+                #if str($extended_parameters.multimapping_enabled.multimapping_counts) == " -M":
                     $extended_parameters.multimapping_enabled.fraction
                 #end if
 
                 $extended_parameters.exon_exon_junction_read_counting_enabled.count_exon_exon_junction_reads
-                #if str($extended_parameters.exon_exon_junction_read_counting_enabled.count_exon_exon_junction_reads) == "-J"
-                    #if $extended_parameters.exon_exon_junction_read_counting_enabled.genome
+                #if str($extended_parameters.exon_exon_junction_read_counting_enabled.count_exon_exon_junction_reads) == "-J":
+                    #if $extended_parameters.exon_exon_junction_read_counting_enabled.genome:
                         -G '$extended_parameters.exon_exon_junction_read_counting_enabled.genome'
                     #end if
                 #end if
@@ -48,18 +58,18 @@
                 $extended_parameters.primary
                 $extended_parameters.ignore_dup
 
-                #if str($extended_parameters.read_extension_5p) != "0"
+                #if str($extended_parameters.read_extension_5p) != "0":
                     --readExtension5 $extended_parameters.read_extension_5p
                 #end if
 
-                #if str($extended_parameters.read_extension_3p) != "0"
+                #if str($extended_parameters.read_extension_3p) != "0":
                     --readExtension3 $extended_parameters.read_extension_3p
                 #end if
 
                 $pe_parameters.fragment_counting_enabled.fragment_counting
-                #if str($pe_parameters.fragment_counting_enabled.fragment_counting) == " -p"
+                #if str($pe_parameters.fragment_counting_enabled.fragment_counting) == " -p":
                     $pe_parameters.fragment_counting_enabled.check_distance_enabled.check_distance
-                    #if str($pe_parameters.fragment_counting_enabled.check_distance_enabled.check_distance) == " -P"
+                    #if str($pe_parameters.fragment_counting_enabled.check_distance_enabled.check_distance) == " -P":
                         -d $pe_parameters.fragment_counting_enabled.check_distance_enabled.minimum_fragment_length
                         -D $pe_parameters.fragment_counting_enabled.check_distance_enabled.maximum_fragment_length
                     #end if
@@ -70,11 +80,19 @@
 
         '${alignment}'
 
-        ## Removal of comment and column-header line
-        && grep -v "^#" "output" | tail -n+2 > body.txt
+        ## Removal of comment
+        && grep -v "^#" "output"
 
+        #if $format.value != "tabdel_short":
+            ## every format except tabdel_short: also drop the column-header line
+            | tail -n+2
+        #else:
+            ## tabdel_short keeps the header; swap the dataset path in it for the element identifier (NOTE(review): unescaped in sed - breaks if the identifier contains a pipe, ampersand or single quote; confirm Galaxy sanitises it)
+            | sed --expression='s|${alignment}|${alignment.element_identifier}|g'
+        #end if
+        > body.txt
         ## Set the right columns for the tabular formats
-        #if $format.value == "tabdel_medium"
+        #if $format.value == "tabdel_medium":
             && cut -f 1,7 body.txt > expression_matrix.txt
 
             ## Paste doesn't allow a non ordered list of columns: -f 1,7,8,6 will only return columns 1,7 and 8
@@ -82,23 +100,30 @@
             && cut -f 6 body.txt > gene_lengths.txt
             && paste expression_matrix.txt gene_lengths.txt > expression_matrix.txt.bak
             && mv -f expression_matrix.txt.bak '${output_medium}'
-        #elif $format.value == "tabdel_short"
+        #elif $format.value == "tabdel_short" or $format.value == "tabdel_short_noheader":
             && cut -f 1,7 body.txt > '${output_short}'
-        #else
+        #else:
             && cp body.txt '${output_full}'
         #end if
 
-
-        #if str($include_feature_length_file) == "true"
+        #if str($include_feature_length_file) == "true":
             && cut -f 1,6 body.txt > '${output_feature_lengths}'
         #end if
 
-        #if str($extended_parameters.exon_exon_junction_read_counting_enabled.count_exon_exon_junction_reads) == "-J"
-            && tail -n+2 'output.jcounts' > '${output_jcounts}'
+        #if str($extended_parameters.exon_exon_junction_read_counting_enabled.count_exon_exon_junction_reads) == "-J":
+            #if $format.value != "tabdel_short":
+              && tail -n+2 'output.jcounts' > '${output_jcounts}'
+            #else:
+
+              && sed --expression='s|${alignment}|${alignment.element_identifier}|g' 'output.jcounts' > '${output_jcounts}'
+            #end if
         #end if
 
-        && tail -n+2 'output.summary' > '${output_summary}'
-
+        #if $format.value != "tabdel_short":
+            && tail -n+2 'output.summary' > '${output_summary}'
+        #else:
+            && sed --expression='s|${alignment}|${alignment.element_identifier}|g' 'output.summary' > '${output_summary}'
+        #end if
     ]]></command>
     <inputs>
         <param name="alignment"
@@ -107,26 +132,41 @@
                format="bam,sam"
                label="Alignment file"
                help="The input alignment file(s) where the gene expression has to be counted. The file can have a SAM or BAM format; but ALL files must be in the same format" />
-
-        <conditional name="gtf_source">
-            <param name="ref_source" type="select" label="Gene annotation file">
-                <option value="cached">locally cached</option>
-                <option value="history">in your history</option>
+        <conditional name="anno">
+            <param name="anno_select" type="select" label="Gene annotation file">
+                <option value="builtin">featureCounts built-in</option>
+                <option value="gtf">GTF file</option>
             </param>
-            <when value="cached">
-                <param name="reference_gene_sets_builtin" type="select" label="Using locally cached annotation" help="If the annotation file you require is not listed here, please contact the Galaxy administrator">
-                    <options from_data_table="gene_sets">
-                        <filter type="sort_by" column="1" />
-                        <validator type="no_options" message="No annotations are available." />
-                    </options>
+            <when value="builtin">
+                <param name="genome" type="select" label="Select built-in genome" help="Built-in gene annotations for genomes hg38, hg19, mm10 and mm9 are included in featureCounts">
+                    <option value="hg38">hg38</option>
+                    <option value="hg19">hg19</option>
+                    <option value="mm10">mm10</option>
+                    <option value="mm9">mm9</option>
                 </param>
             </when>
-            <when value="history">
-                <param name="reference_gene_sets"
-                       format="gff,gtf,gff3"
-                       type="data"
-                       label="Gene annotation file"
-                       help="The program assumes that the provided annotation file is in GTF format. Make sure that the gene annotation file corresponds to the same reference genome as used for the alignment" />
+            <when value="gtf">
+                <conditional name="gtf_source">
+                    <param name="ref_source" type="select" label="Gene annotation file">
+                        <option value="cached">locally cached</option>
+                        <option value="history">in your history</option>
+                    </param>
+                    <when value="cached">
+                        <param name="reference_gene_sets_builtin" type="select" label="Using locally cached annotation" help="If the annotation file you require is not listed here, please contact the Galaxy administrator">
+                            <options from_data_table="gene_sets">
+                                <filter type="sort_by" column="1" />
+                                <validator type="no_options" message="No annotations are available." />
+                            </options>
+                        </param>
+                    </when>
+                    <when value="history">
+                        <param name="reference_gene_sets"
+                               format="gff,gtf,gff3"
+                               type="data"
+                               label="Gene annotation file"
+                               help="The program assumes that the provided annotation file is in GTF format. Make sure that the gene annotation file corresponds to the same reference genome as used for the alignment" />
+                    </when>
+                </conditional>
             </when>
         </conditional>
 
@@ -134,7 +174,8 @@
                type="select"
                label="Output format"
                help="The output format will be tabular, select the preferred columns here">
-            <option value="tabdel_short" selected="true">Gene-ID "\t" read-count (DESeq2 IUC wrapper compatible)</option>
+            <option value="tabdel_short_noheader" selected="true">Gene-ID "\t" read-count (DESeq2 IUC wrapper compatible)</option>
+            <option value="tabdel_short">Gene-ID "\t" read-count (MultiQC/edgeR/limma-voom compatible, includes header in output)</option>
             <option value="tabdel_medium">Gene-ID "\t" read-count "\t" gene-length</option>
             <option value="tabdel_full">featureCounts 1.4.0+ default (includes regions provided by the GTF file)</option>
         </param>
@@ -291,7 +332,7 @@
                    label="Long reads"
                    help="If specified, long reads such as Nanopore and PacBio reads will be counted. Long read counting can only run in one thread and only reads (not read-pairs) can be counted." />
 
-           <param name="by_read_group" argument="--byReadGroup" type="boolean" truevalue="--byReadGroup" falsevalue=""
+            <param name="by_read_group" argument="--byReadGroup" type="boolean" truevalue="--byReadGroup" falsevalue=""
                   label="Count reads by read group"
                   help="If specified, reads are counted for each read group separately. The 'RG' tag must be present in the input BAM/SAM alignment files." />
 
@@ -311,7 +352,7 @@
                    label="Minimum bases of overlap"
                    help="Specify the minimum required number of overlapping bases between a read (or a fragment) and a feature. 1 by default. If a negative value is provided, the read will be extended from both ends." />
 
-           <param name="frac_overlap"
+            <param name="frac_overlap"
                   type="integer"
                   value="0"
                   min="0"
@@ -320,7 +361,7 @@
                   label="Minimum fraction (of read) overlapping a feature"
                   help="Specify the minimum required fraction of overlapping bases between a read (or a fragment) and a feature. Value should be within range [0,1]. 0 by default. Number of overlapping bases is counted from both reads if paired end. Both this option and '--minOverlap' need to be satisfied for read assignment." />
 
-              <param name="frac_overlap_feature"
+            <param name="frac_overlap_feature"
                      type="integer"
                      value="0"
                      min="0"
@@ -391,7 +432,7 @@
         <data format="tabular"
               name="output_short"
               label="${tool.name} on ${on_string}">
-            <filter>format == "tabdel_short"</filter>
+            <filter>format == "tabdel_short_noheader" or format == "tabdel_short"</filter>
             <actions>
                 <action name="column_names" type="metadata" default="Geneid,${alignment.element_identifier}" />
             </actions>
@@ -408,7 +449,6 @@
 
         <data format="tabular"
               name="output_summary"
-              hidden="true"
               label="${tool.name} on ${on_string}: summary">
             <actions>
                 <action name="column_names" type="metadata" default="Status,${alignment.element_identifier}" />
@@ -428,15 +468,17 @@
               label="${tool.name} on ${on_string}: junction counts">
             <filter>extended_parameters['exon_exon_junction_read_counting_enabled']['count_exon_exon_junction_reads']</filter>
             <actions>
-                <action name="column_names" type="metadata" default="PrimaryGene,SecondaryGene,Site1_chr,Site1_location,Site1_strand,Site2_chr,Site2_location,Site2_strand,${alignment.element_identifier}" />
+                <action name="column_names" type="metadata"
+                    default="PrimaryGene,SecondaryGene,Site1_chr,Site1_location,Site1_strand,Site2_chr,Site2_location,Site2_strand,${alignment.element_identifier}" />
             </actions>
         </data>
     </outputs>
     <tests>
         <test expect_num_outputs="4">
             <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
+            <param name="anno_select" value="gtf"/>
             <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
-            <param name="format" value="tabdel_short" />
+            <param name="format" value="tabdel_short_noheader" />
             <param name="include_feature_length_file" value="true"/>
             <param name="ref_source" value="history" />
             <param name="count_exon_exon_junction_reads" value="-J"/>
@@ -452,6 +494,7 @@
         </test>
         <test expect_num_outputs="3">
             <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
+            <param name="anno_select" value="gtf"/>
             <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
             <param name="format" value="tabdel_medium" />
             <param name="include_feature_length_file" value="true"/>
@@ -465,6 +508,7 @@
         </test>
         <test expect_num_outputs="3">
             <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
+            <param name="anno_select" value="gtf"/>
             <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
             <param name="format" value="tabdel_full" />
             <param name="include_feature_length_file" value="true"/>
@@ -479,7 +523,35 @@
                 <metadata name="column_names" value="Feature,Length"/>
             </output>
         </test>
-
+        <test expect_num_outputs="4">
+            <param name="alignment" value="featureCounts_input1.bam" ftype="bam" />
+            <param name="anno_select" value="gtf"/>
+            <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
+            <param name="format" value="tabdel_short" />
+            <param name="include_feature_length_file" value="true"/>
+            <param name="ref_source" value="history" />
+            <param name="count_exon_exon_junction_reads" value="-J"/>
+            <output name="output_short" file="output_1_short_with_header.tab">
+                <metadata name="column_names" value="Geneid,featureCounts_input1.bam"/>
+            </output>
+            <output name="output_summary" file="output_1_summary_with_header.tab">
+                <metadata name="column_names" value="Status,featureCounts_input1.bam"/>
+            </output>
+            <output name="output_jcounts" file="output_1_jcounts_with_header.tab">
+                <metadata name="column_names" value="PrimaryGene,SecondaryGene,Site1_chr,Site1_location,Site1_strand,Site2_chr,Site2_location,Site2_strand,featureCounts_input1.bam"/>
+            </output>
+        </test>
+        <!-- Ensure built-in annotation works -->
+        <test expect_num_outputs="2">
+            <param name="alignment" value="pairend_strandspecific_51mer_hg19_chr1_1-100000.bam" ftype="bam" />
+            <param name="anno_select" value="builtin"/>
+            <param name="format" value="tabdel_short" />
+            <param name="genome" value="hg19" />
+            <output name="output_short" file="output_builtin_hg19.tab">
+                <metadata name="column_names" value="Geneid,pairend_strandspecific_51mer_hg19_chr1_1-100000.bam"/>
+            </output>
+            <output name="output_summary" file="output_summary_builtin_hg19.tab"/>
+        </test>
     </tests>
 
     <help><![CDATA[
@@ -488,7 +560,7 @@
 
 Overview
 --------
-FeatureCounts is a light-weight read counting program written entirely in the C programming language. It can be used to count both gDNA-seq and RNA-seq reads for genomic features in in SAM/BAM files.
+FeatureCounts is a light-weight read counting program written entirely in the C programming language. It can be used to count both gDNA-seq and RNA-seq reads for genomic features in SAM/BAM files. FeatureCounts is part of the Subread_ package.
 
 Input formats
 -------------
@@ -497,14 +569,18 @@
  - SAM format, http://samtools.sourceforge.net/samtools.shtml#5
  - BAM format
 
-Gene regions should be provided in the GFF/GTF format:
+Annotations for gene regions should be provided in the GFF/GTF format:
 
  - http://genome.ucsc.edu/FAQ/FAQformat.html#format3
  - http://www.ensembl.org/info/website/upload/gff.html
 
+Alternatively, the featureCounts built-in annotations for genomes hg38, hg19, mm10 and mm9 can be used by selecting the built-in option above. These annotations were downloaded from the NCBI RefSeq database and then adapted by merging overlapping exons from the same gene to form a set of disjoint exons for each gene. Genes with the same Entrez gene identifier were also merged into one gene. See the Subread_ User's Guide for more information.
+
 Output format
 -------------
 FeatureCounts produces a table containing counted reads, per gene, per row. Optionally the last column can be set to be the effective gene-length. These tables are compatible with the DESeq2 Galaxy wrapper by IUC. Column names are added as metadata object.
+
+.. _Subread: http://bioinf.wehi.edu.au/subread-package/SubreadUsersGuide.pdf
     ]]></help>
     <citations>
         <citation type="doi">10.1093/bioinformatics/btt656</citation>