Mercurial > repos > bgruening > sailfish

--- a/README.rst	Sun Sep 18 06:05:57 2016 -0400
+++ b/README.rst	Wed Nov 02 10:30:36 2016 -0400
@@ -1,11 +1,10 @@
 Galaxy wrappers for sailfish
 =====================================

-These wrappers are copyright 2014 by Björn Grüning and additional contributors.
+These wrappers are copyright 2014 by Björn Grüning, Mhd Ramez Alrawas and additional contributors.
 All rights reserved. See the licence text below.

-Currently tested with sailfish 0.6.3.
-
+Currently tested with sailfish 0.10.1.

 Automated Installation
 ======================
@@ -20,7 +19,8 @@
 Version  Changes
 -------- ----------------------------------------------------------------------
 v0.6.3.0  - First release
-
+-------- ----------------------------------------------------------------------
+v0.10.1   - Second release
 ======== ======================================================================
Binary file sailfish.tar.bz2 has changed
--- a/sailfish.xml	Sun Sep 18 06:05:57 2016 -0400
+++ b/sailfish.xml	Wed Nov 02 10:30:36 2016 -0400
@@ -1,9 +1,5 @@
-<tool id="sailfish" name="Sailfish" version="0.7.6.1">
+<tool id="sailfish" name="Sailfish" version="0.10.1">
     <description>transcript quantification from RNA-seq data</description>
-    <requirements>
-        <requirement type="package" version="0.7.6">sailfish</requirement>
-        <requirement type="package" version="1.57.0">boost</requirement>
-    </requirements>
     <macros>
         <xml name="strandedness">
             <param name="strandedness" type="select" label="Specify the strandedness of the reads">
@@ -13,6 +9,9 @@
             </param>
         </xml>
     </macros>
+    <requirements>
+        <requirement type="package" version="0.10.1">sailfish</requirement>
+    </requirements>
     <stdio>
         <exit_code range="1:" />
         <exit_code range=":-1" />
@@ -23,7 +22,6 @@
     <version_command>sailfish -version</version_command>
     <command>
 <![CDATA[
-
         #if $refTranscriptSource.TranscriptSource == "history":
             sailfish index
                 --transcripts $refTranscriptSource.ownFile
@@ -34,35 +32,26 @@
         #else:
             #set $index_path = $refTranscriptSource.index.fields.path
         #end if
-
         &&
-
         #if $single_or_paired.single_or_paired_opts == 'single':
-
             #if $single_or_paired.input_singles.ext == 'fasta':
                 #set $ext = 'fasta'
             #else:
                 #set $ext = 'fastq'
             #end if
-
             ln -s $single_or_paired.input_singles ./single.$ext &&
         #else:
-
             #if $single_or_paired.input_mate1.ext == 'fasta':
                 #set $ext = 'fasta'
             #else:
                 #set $ext = 'fastq'
             #end if
-
             ln -s $single_or_paired.input_mate1 ./mate1.$ext &&
             ln -s $single_or_paired.input_mate2 ./mate2.$ext &&
         #end if
-
-
         #if $geneMap:
             ln -s "$geneMap" ./geneMap.$geneMap.ext &&
         #end if
-
         sailfish quant
             --index $index_path
             #if $single_or_paired.single_or_paired_opts == 'single':
@@ -73,34 +62,51 @@
                 --mates2 ./mate2.$ext
                 --libType "${single_or_paired.orientation}${single_or_paired.strandedness}"
             #end if
-            --output ./
+            --output ./results
             $biasCorrect
+            $gcBiasCorrect
             --threads "\${GALAXY_SLOTS:-4}"
-
-            #if $fldMean:
+            $dumpEq
+            #if str($gcSizeSamp):
+                --gcSizeSamp $gcSizeSamp
+            #end if
+            #if str($gcSpeedSamp):
+                --gcSpeedSamp $gcSpeedSamp
+            #end if
+            #if str($fldMean):
                 --fldMean $fldMean
             #end if
-
-            #if $fldSD:
+            #if str($fldSD):
                 --fldSD $fldSD
             #end if
-
             #if $maxReadOcc:
                 --maxReadOcc $maxReadOcc
             #end if
-
             #if $geneMap:
                 --geneMap ./geneMap.${geneMap.ext}
             #end if
-
+            $strictIntersect
             $noEffectiveLengthCorrection
             $useVBOpt
-            $allowOrphans
-
+            $discardOrphans
             $unsmoothedFLD
             --maxFragLen ${maxFragLen}
-            --txpAggregationKey "${txpAggregationKey}"
-
+            --txpAggregationKey '${txpAggregationKey}'
+            $ignoreLibCompat
+            $enforceLibCompat
+            $allowDovetail
+            #if str($numBiasSamples):
+                --numBiasSamples $numBiasSamples
+            #end if
+            #if str($numFragSamples):
+                --numFragSamples $numFragSamples
+            #end if
+            #if str($numGibbsSamples):
+                --numGibbsSamples $numGibbsSamples
+            #end if
+            #if str($numBootstraps):
+                --numBootstraps $numBootstraps
+            #end if
 ]]>
     </command>
     <inputs>
@@ -118,7 +124,7 @@
                 </param>
             </when>  <!-- build-in -->
             <when value="history">
-                <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference transcriptome" help="in FASTA format" />
+                <param name="ownFile" type="data" format="fasta"  label="Select the reference transcriptome" help="in FASTA format" />
                 <param argument="kmerSize" type="integer" value="21" max="32" label="The size of the k-mer on which the index is built"
                     help="There is a tradeoff here between the distinctiveness of the k-mers and their robustness to errors.
                         The shorter the k-mers, the more robust they will be to errors in the reads, but the longer the k-mers,
@@ -152,47 +158,109 @@
             where each line contains the name of a transcript and the gene to which it belongs separated by a tab." />

         <param argument="--biasCorrect" type="boolean" truevalue="--biasCorrect" falsevalue="" checked="False"
-                    label="Perform bias correction" help=""/>
+            label="Perform sequence-specific bias correction" help=""/>
+
+        <param argument="--gcBiasCorrect" type="boolean" truevalue="--gcBiasCorrect" falsevalue="" checked="False"
+            label="Perform fragment GC bias correction" help=""/>
+
+        <param argument="--dumpEq" type="boolean" truevalue="--dumpEq" falsevalue="" checked="False"
+            label="Dump the equivalence class counts that were computed during quasi-mapping." help=""/>
+
+        <param argument="--gcSizeSamp" type="integer" value="1" optional="True"
+            label="The value by which to down-sample transcripts when representing the GC content"
+            help="Larger values will reduce memory usage, but may decrease the fidelity of bias modeling results."/>
+
+        <param argument="--gcSpeedSamp" type="integer" value="1" optional="True"
+            label="The value at which the fragment length PMF is down-sampled when evaluating GC fragment bias."
+            help="Larger values speed up effective length correction, but may decrease the fidelity of bias modeling results."/>
+
+        <param argument="--strictIntersect" type="boolean" truevalue="--strictIntersect" falsevalue="" checked="False"
+            label="Strict Intersect." help="When this flag is set, if the intersection of the
+            quasi-mappings for the left and right is empty, then all mappings for the left and all mappings
+            for the right read are reported as orphaned quasi-mappings."/>

         <param argument="--fldMean" type="integer" value="200" optional="True" label="Calculate effective lengths"
-            help="If single end reads are being used for quantification, or there are an insufficient number of uniquely mapping reads when performing paired-end quantification
-                    to estimate the empirical fragment length distribution, then use this value to calculate effective lengths."/>
+            help="If single end reads are being used for quantification, or there are an insufficient number of uniquely
+            mapping reads when performing paired-end quantification
+            to estimate the empirical fragment length distribution, then use this value to calculate effective lengths."/>

         <param argument="--fldSD" type="integer" value="80" optional="True" label="Standard deviation"
-            help="The standard deviation used in the fragment length distribution for single-end quantification or when an empirical distribution cannot be learned."/>
+            help="The standard deviation used in the fragment length distribution for single-end quantification or
+            when an empirical distribution cannot be learned."/>

         <param argument="--maxReadOcc" type="integer" value="200" optional="True" label="Maximal read mapping occurence"
             help="Reads mapping to more than this many places won't be considered."/>

         <param argument="--noEffectiveLengthCorrection" type="boolean" truevalue="--noEffectiveLengthCorrection" falsevalue="" checked="False"
-            label="Disable effective length correction" help="Disables effective length correction when computing the probability that a fragment was generated from a transcript.
+            label="Disable effective length correction" help="Disables effective length correction when computing the probability
+            that a fragment was generated from a transcript.
             If this flag is passed in, the fragment length distribution is not taken into account when computing this probability."/>

         <param argument="--useVBOpt" type="boolean" truevalue="--useVBOpt" falsevalue="" checked="False"
-            label="Use Variational Bayesian EM algorithm for optimization" help=""/>
+            label="Use Variational Bayesian EM algorithm for optimization" help="Use Variational Bayesian EM algorithm rather
+            than the traditional EM algorithm for optimization"/>

-        <param argument="--allowOrphans" type="boolean" truevalue="--allowOrphans" falsevalue="" checked="False"
-            label="Consider orphaned reads as valid hits when performing lightweight-alignment"
-            help="This option will increase sensitivity (allow more reads to map and more transcripts to be detected), but may decrease specificity as orphaned alignments are more likely to be spurious."/>
+        <param argument="--discardOrphans" type="boolean" truevalue="--discardOrphans" falsevalue="" checked="False"
+            label="Discard orphaned reads when performing lightweight-alignment"
+            help="This option will discard orphaned fragments. This only has an effect on paired-end input, but enabling this option will discard, rather than count, any reads where only one of the paired fragments maps to a transcript."/>

         <param argument="--unsmoothedFLD" type="boolean" truevalue="--unsmoothedFLD" falsevalue="" checked="False"
-            label="Use the un-smoothed approach to effective length correction" help="This traditional approach works by convolving the FLD with the characteristic function over each transcript."/>
+            label="Use the un-smoothed approach to effective length correction" help="This traditional approach works by convolving the FLD with the
+            characteristic function over each transcript."/>

         <param argument="--maxFragLen" type="integer" value="1000" optional="True"
             label="The maximum length of a fragment to consider when building the empirical fragment length distribution"
             help=""/>

-        <param argument="--txpAggregationKey" value="gene_id" type="text" label="The key for aggregating transcripts during gene-level estimates"
-            help="The default is the gene_id field, but other fields (e.g. gene_name) might be useful depending on the specifics of the annotation being used." />
+        <param argument="--txpAggregationKey" value="gene_id" type="text" label="The key for aggregating transcripts during gene-level estimates">
+            <help>
+              <![CDATA[
+              When generating the gene-level estimates, use the provided key for aggregating transcripts. The default is the "gene_id" field,
+              but other fields (e.g. "gene_name") might be useful depending on the specifics of the annotation being used. Note: this option only
+              affects aggregation when using a GTF annotation; not an annotation in "simple" format.]]>
+            </help>
+        </param>
+        <param argument="--ignoreLibCompat" type="boolean" truevalue="--ignoreLibCompat" falsevalue="" checked="False"
+                label="Disables strand-aware processing completely.">
+            <help>
+                <![CDATA[
+                All hits are considered "Valid".]]>
+            </help>
+        </param>
+        <param argument="--enforceLibCompat" type="boolean" truevalue="--enforceLibCompat" falsevalue="" checked="False"
+                label="Enforces strict library compatibility.">
+            <help>
+                <![CDATA[
+                Fragments that map in a manner other than what is specified by the expected library type will be discarded,
+                even if there are no mappings that agree with the expected library type.]]>
+            </help>
+        </param>
+        <param argument="--allowDovetail" type="boolean" truevalue="--allowDovetail" falsevalue="" checked="False"
+                label="Allow paired-end reads from the same fragment to dovetail.">
+            <help>
+                <![CDATA[
+                Allow paired-end reads from the same fragment to "dovetail", such that the ends of the mapped reads can extend past each other.]]>
+            </help>
+        </param>
+        <param argument="--numBiasSamples" type="integer" value="1000000" optional="True"
+          label="Number of fragment mappings to use when learning the sequence-specific bias model"
+          help=""/>
+        <param argument="--numFragSamples" type="integer" value="10000" optional="True"
+          label="Number of fragments from unique alignments to sample when building the fragment length distribution"
+          help=""/>
+        <param argument="--numGibbsSamples" type="integer" value="0" optional="True"
+          label="Number of Gibbs sampling rounds to perform."
+          help=""/>
+        <param argument="--numBootstraps" type="integer" value="0" optional="True"
+          label="Number of bootstrap samples to generate."
+          help="This is mutually exclusive with Gibbs sampling."/>
+    </inputs>

-    </inputs>
+
     <outputs>
-        <data name="output_quant" format="tabular" from_work_dir="quant.sf" label="${tool.name} on ${on_string} (Quantification)" />
-        <data name="output_bias_corrected_quant" format="tabular" from_work_dir="quant_bias_corrected.sf" label="${tool.name} on ${on_string} (Bias corrected Quantification)">
-            <filter>biasCorrect is True</filter>
-        </data>
-        <data name="output_gene_quant" format="tabular" from_work_dir="quant.genes.sf" label="${tool.name} on ${on_string} (Gene Quantification)">
-            <filter>geneMap is True</filter>
+        <data name="output_quant" format="tabular" from_work_dir="results/quant.sf" label="${tool.name} on ${on_string} (Quantification)" />
+        <data name="output_gene_quant" format="tabular" from_work_dir="results/quant.genes.sf" label="${tool.name} on ${on_string} (Gene Quantification)">
+            <filter>geneMap</filter>
         </data>
     </outputs>
     <tests>
@@ -200,15 +268,33 @@
             <param name="single_or_paired_opts" value="paired" />
             <param name="input_mate1" value="reads_1.fastq" />
             <param name="input_mate2" value="reads_2.fastq" />
-            <param name="biasCorrect" value="True" />
+            <param name="biasCorrect" value="False" />
             <param name="TranscriptSource" value="history" />
             <param name="ownFile" value="transcripts.fasta" ftype="fasta" />
             <output file="sailfish_quant_result1.tab" ftype="tabular" name="output_quant" />
-            <output file="sailfish_bias_result1.tab" ftype="tabular" name="output_bias_corrected_quant" />
+        </test>
+        <test>
+            <param name="single_or_paired_opts" value="paired" />
+            <param name="input_mate1" value="reads_1.fastq" />
+            <param name="input_mate2" value="reads_2.fastq" />
+            <param name="biasCorrect" value="True" />
+            <param name="TranscriptSource" value="history" />
+            <param name="ownFile" value="transcripts.fasta" ftype="fasta" />
+            <output file="sailfish_bias_result1.tab" ftype="tabular" name="output_quant" />
+        </test>
+        <test>
+            <param name="single_or_paired_opts" value="paired" />
+            <param name="input_mate1" value="reads_1.fastq" />
+            <param name="input_mate2" value="reads_2.fastq" />
+            <param name="biasCorrect" value="True" />
+            <param name="TranscriptSource" value="history" />
+            <param name="ownFile" value="transcripts.fasta" ftype="fasta" />
+            <param name="geneMap" value="gene_map.tab" ftype="tabular" />
+            <output file="sailfish_bias_result1.tab" ftype="tabular" name="output_quant" />
+            <output file="sailfish_genMap_result1.tab" ftype="tabular" name="output_gene_quant" />
         </test>
     </tests>
-    <help>
-<![CDATA[
+    <help><![CDATA[

 **What it does**

@@ -336,6 +422,8 @@
    of the TopHat library types, and so there is no direct mapping for them.


-]]>
-    </help>
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1038/nbt.2862</citation>
+    </citations>
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gene_map.tab	Wed Nov 02 10:30:36 2016 -0400
@@ -0,0 +1,3 @@
+NM_174914	foo
+NM_001168316	bar
+NR_003084	baz
--- a/test-data/sailfish_bias_result1.tab	Sun Sep 18 06:05:57 2016 -0400
+++ b/test-data/sailfish_bias_result1.tab	Wed Nov 02 10:30:36 2016 -0400
@@ -1,32 +1,16 @@
-# sailfish (quasi-mapping-based) v0.7.6
-# [ program ] => sailfish
-# [ command ] => quant
-# [ index ] => { ./index_dir }
-# [ mates1 ] => { ./mate1.fastq }
-# [ mates2 ] => { ./mate2.fastq }
-# [ libType ] => { IU }
-# [ output ] => { ./ }
-# [ biasCorrect ] => { }
-# [ threads ] => { 1 }
-# [ fldMean ] => { 200 }
-# [ fldSD ] => { 80 }
-# [ maxReadOcc ] => { 200 }
-# [ maxFragLen ] => { 1000 }
-# [ txpAggregationKey ] => { gene_id }
-# [ mapping rate ] => { 100% }
-# Name	Length	TPM	NumReads
-NM_022658	2288	378838	4881
-NM_174914	2385	111257	1500.04
-NM_017410	2396	3099.5	42
-NM_018953	1612	26168.3	228
-NM_001168316	2283	12398.5	159.361
-NM_004503	1681	36198.8	330.806
-NR_003084	1640	0	0
-NM_173860	849	240218	962
-NM_006897	1541	80244.3	664
-NM_153693	2072	6430.57	74.2815
-NR_031764	1853	10254.5	104.595
-NM_014620	2300	45132.7	584.838
-NM_153633	1666	40578.4	367.074
-NM_014212	2037	4852.08	55
-NM_017409	1959	4330.19	47
+Name	Length	EffectiveLength	TPM	NumReads
+NM_001168316	2283	1528.95	12702.4	158.926
+NM_174914	2385	1599.63	114719	1501.66
+NR_031764	1853	1214.33	10407.1	103.415
+NM_004503	1681	1085.83	37300.1	331.428
+NM_006897	1541	984.724	82401.9	664
+NM_014212	2037	1316.12	5106.81	55
+NM_014620	2300	1541.27	46908.6	591.628
+NM_017409	1959	1273.24	4510.99	47
+NM_017410	2396	1562.29	3285.28	42
+NM_018953	1612	1019.15	27338	227.994
+NM_022658	2288	1634.87	364846	4881
+NM_153633	1666	1082.85	40694.7	360.597
+NM_153693	2072	1374.67	6520.1	73.3448
+NM_173860	849	483.271	243258	962
+NR_003084	1640	1052.77	1.09566	0.00943897
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sailfish_genMap_result1.tab	Wed Nov 02 10:30:36 2016 -0400
@@ -0,0 +1,5 @@
+Name	Length	EffectiveLength	TPM	NumReads
+baz	1640	1052.77	1.09566	0.00943897
+NR_031764	1853	1214.33	10407.1	103.415
+foo	348.949	235.269	976889	9737.65
+bar	2283	1528.95	12702.4	158.926
--- a/test-data/sailfish_quant_result1.tab	Sun Sep 18 06:05:57 2016 -0400
+++ b/test-data/sailfish_quant_result1.tab	Wed Nov 02 10:30:36 2016 -0400
@@ -1,32 +1,16 @@
-# sailfish (quasi-mapping-based) v0.7.6
-# [ program ] => sailfish
-# [ command ] => quant
-# [ index ] => { ./index_dir }
-# [ mates1 ] => { ./mate1.fastq }
-# [ mates2 ] => { ./mate2.fastq }
-# [ libType ] => { IU }
-# [ output ] => { ./ }
-# [ biasCorrect ] => { }
-# [ threads ] => { 1 }
-# [ fldMean ] => { 200 }
-# [ fldSD ] => { 80 }
-# [ maxReadOcc ] => { 200 }
-# [ maxFragLen ] => { 1000 }
-# [ txpAggregationKey ] => { gene_id }
-# [ mapping rate ] => { 100% }
-# Name	Length	TPM	NumReads
-NM_001168316	2283	12398.5	159.361
-NM_174914	2385	111257	1500.04
-NR_031764	1853	10254.5	104.595
-NM_004503	1681	36198.8	330.806
-NM_006897	1541	80244.3	664
-NM_014212	2037	4852.08	55
-NM_014620	2300	45132.7	584.838
-NM_017409	1959	4330.19	47
-NM_017410	2396	3099.5	42
-NM_018953	1612	26168.3	228
-NM_022658	2288	378838	4881
-NM_153633	1666	40578.4	367.074
-NM_153693	2072	6430.57	74.2815
-NM_173860	849	240218	962
-NR_003084	1640	0	0
+Name	Length	EffectiveLength	TPM	NumReads
+NM_001168316	2283	2082.61	12552.5	161.366
+NM_174914	2385	2184.61	111020	1497.1
+NR_031764	1853	1652.61	10345.6	105.535
+NM_004503	1681	1480.61	36162.7	330.503
+NM_006897	1541	1340.61	80240.2	664
+NM_014212	2037	1836.61	4851.45	55
+NM_014620	2300	2099.61	45082	584.273
+NM_017409	1959	1758.61	4329.67	47
+NM_017410	2396	2195.61	3098.99	42
+NM_018953	1612	1411.61	26165.8	227.994
+NM_022658	2288	2087.61	378779	4881
+NM_153633	1666	1465.61	40626.6	367.539
+NM_153693	2072	1871.61	6464.46	74.683
+NM_173860	849	648.611	240280	962
+NR_003084	1640	1439.61	1.04309	0.00926914