Repository 'rgrnastar'
Changeset 30:4014de1b6daf (2024-08-27)
Previous changeset 29:b0f2be869d6d (2024-02-14)
Commit message:
planemo upload for repository commit 2b3fa63863a366beef057c7f75ccbcaf0c280151
diff -r b0f2be869d6d -r 4014de1b6daf macros.xml
--- a/macros.xml Wed Feb 14 09:03:31 2024 +0000
+++ b/macros.xml Tue Aug 27 14:11:16 2024 +0000
b'@@ -5,7 +5,7 @@\n     the index versions in sync, but you should manually update @IDX_VERSION_SUFFIX@ -->\n     <!-- STAR version to be used -->\n     <token name="@TOOL_VERSION@">2.7.11a</token>\n-    <token name="@VERSION_SUFFIX@">0</token>\n+    <token name="@VERSION_SUFFIX@">1</token>\n     <token name="@PROFILE@">21.01</token>\n     <!-- STAR index version compatible with this version of STAR\n     This is the STAR version that introduced the index structure expected\n@@ -17,16 +17,14 @@\n     <token name="@IDX_VERSION@">2.7.4a</token>\n     <token name="@IDX_VERSION_SUFFIX@">2</token>\n     <token name="@IDX_DATA_TABLE@">rnastar_index2x_versioned</token>\n-\n     <xml name="requirements">\n         <requirements>\n             <requirement type="package" version="@TOOL_VERSION@">star</requirement>\n             <requirement type="package" version="1.18">samtools</requirement>\n             <requirement type="package" version="1.13">gzip</requirement>\n-            <yield />\n+            <yield/>\n         </requirements>\n     </xml>\n-\n     <xml name="edam">\n         <edam_topics>\n             <edam_topic>topic_3170</edam_topic>\n@@ -36,20 +34,16 @@\n             <edam_operation>operation_0292</edam_operation>\n         </edam_operations>\n     </xml>\n-\n     <xml name="index_selection" token_with_gene_model="0">\n-        <param argument="--genomeDir" type="select"\n-        label="Select reference genome"\n-        help="If your genome of interest is not listed, contact the Galaxy team">\n+        <param argument="--genomeDir" type="select" label="Select reference genome" help="If your genome of interest is not listed, contact the Galaxy team">\n             <options from_data_table="@IDX_DATA_TABLE@">\n-                <filter type="static_value" column="4" value="@WITH_GENE_MODEL@" />\n-                <filter type="static_value" column="5" value="@IDX_VERSION@" />\n-                <filter type="sort_by" column="2" />\n-                <validator 
type="no_options" message="No indexes are available for the selected input dataset" />\n+                <filter type="static_value" column="4" value="@WITH_GENE_MODEL@"/>\n+                <filter type="static_value" column="5" value="@IDX_VERSION@"/>\n+                <filter type="sort_by" column="2"/>\n+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>\n             </options>\n         </param>\n     </xml>\n-\n     <token name="@FASTQ_GZ_OPTION@">\n         --readFilesCommand zcat\n     </token>\n@@ -59,9 +53,9 @@\n         </citations>\n     </xml>\n     <xml name="SJDBOPTIONS">\n-         <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/>\n-         <param argument="--sjdbGTFfeatureExon" type="text" value="exon" label="Elements to use from the gene model to use for splice junctions" help="By default and for almost all cases: \'exon\', referring to finding junctions at the RNA splice sites. This can optionally be changed to allow splicing at other levels, such as \'gene\', \'transcript\', \'CDS\'."/>\n-         <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>\n+        <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/>\n+        <param argument="--sjdbGTFfeatureExon" type="text" value="exon" label="Elements to use from the gene model to use for splice junctions" help="By default and for almost all cases: \'exon\', referring to finding junctions at the RNA splice sites. 
This can optionally be changed to allow splicing at other levels, such as \'gene\', \'transcript\', \'CDS\'."/>\n+        <param argument="--sjdbOverhang" type="inte'..b'           <option value="-">No per gene or transcript output as no GTF was provided</option>\n             </param>\n-            <when value="-" />\n+            <when value="-"/>\n         </conditional>\n     </xml>\n     <xml name="outSAMmapqUnique">\n         <!-- MAPQ 255 is the default in STAR (coming from tophat behaviour and compatibility for Cufflinks) but it is a problematic value\n         - according to SAM/BAM specs it means "undefined".\n         - Using 255 as the max mapq causes problem with modern downstream tools like mutect2: and 60 has become an inofficial replacement for 255. -->\n-        <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255"\n-        label="MAPQ value for unique mappers"\n-        help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is\n-used: >=5 mappings => MAPQ=0; 3-4 mappings => MAPQ=1; 2 mappings => MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. Set to 255 for compatibility with Cufflink (default in STAR) but keep to 60 for modern downstream tools like mutect2." />\n+        <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255" label="MAPQ value for unique mappers" help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is used: &gt;=5 mappings =&gt; MAPQ=0; 3-4 mappings =&gt; MAPQ=1; 2 mappings =&gt; MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. 
Set to 255 for compatibility with Cufflink (default in STAR) but keep to 60 for modern downstream tools like mutect2."/>\n+    </xml>\n+    <xml name="wasp">\n+        <!--\n+            This is a re-implementation of the original WASP algorithm by Bryce van de Geijn, Graham McVicker,\n+            Yoav Gilad and Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12,\n+            1061\xe2\x80\x931063 (2015). WASP filtering is activated\n+            with "waspOutputMode SAMtag".\n+            -->\n+        <conditional name="wasp_conditional">\n+            <param argument="--waspOutputMode" type="select" label="Activate WASP filtering">\n+                <help><![CDATA[This is a reimplementation of the original WASP algorithm by Bryce van de Geijn, Graham McVicker,\n+                    Yoav Gilad and Jonathan K Pritchard. This option will add the vW tag to the SAM output. vW:i:1 means\n+                    alignment passed WASP filtering, and all other values mean it did not:<br/>\n+                    - vW:i:2 = multi-mapping read<br/>\n+                    - vW:i:3 = variant base in the read is N (non-ACGT)<br/>\n+                    - vW:i:4 = remapped read did not map <br/>\n+                    - vW:i:5 = remapped read multi-maps <br/>\n+                    - vW:i:6 = remapped read maps to a different locus <br/>\n+                    - vW:i:7 = read overlaps too many variants <br/>\n+                    ]]>\n+                </help>\n+                <option value="" selected="true">No WASP filtering</option>\n+                <option value="wasp_mode">Activate WASP filtering</option>\n+            </param>\n+            <when value="wasp_mode">\n+                <param argument="--varVCFfile" type="data" format="vcf" label="VCF file with personal variants" help="Each variant is expected to have a genotype with two alleles. 
The VCF file needs to have the 10th column with genotype recorded as 0/1, 1/0, 1/1 (or | instead of /)"/>\n+            </when>\n+            <when value=""/>\n+        </conditional>\n     </xml>\n </macros>\n'
diff -r b0f2be869d6d -r 4014de1b6daf rg_rnaStar.xml
--- a/rg_rnaStar.xml Wed Feb 14 09:03:31 2024 +0000
+++ b/rg_rnaStar.xml Tue Aug 27 14:11:16 2024 +0000
@@ -47,9 +47,11 @@
         ## Two pass mode
         --twopassMode ${twopass.twopassMode} ${twopass.twopass_read_subset}
-        #for $sj_input in $twopass.sj_precalculated:
-            '$sj_input'
-        #end for
+        ## Emit each precalculated splice junction input (single-quoted), but only when the parameter's string form is non-blank
+        #if str($twopass.sj_precalculated).strip():
+            #for $sj_input in $twopass.sj_precalculated:
+                '$sj_input'
+            #end for
+        #end if
         #if str($twopass.twopassMode) != 'None':
             #if str($refGenomeSource.GTFconditional.GTFselect) == 'with-gtf':
                 ## need to check first if its a cached index or from history
@@ -215,7 +217,7 @@
             #end if
             ## Limits
-                @LIMITS@
+            @LIMITS@
             ## Go with STAR's default algorithmic settings,
             ## but we need to provide a reasonable default
@@ -232,12 +234,16 @@
                 #end if
             #end if
         #end if
         --outBAMsortingThreadN \${GALAXY_SLOTS:-4}
         --outBAMsortingBinsN $perf.outBAMsortingBinsN
         --winAnchorMultimapNmax $perf.winAnchorMultimapNmax
         --limitBAMsortRAM \$((\${GALAXY_MEMORY_MB:-0}*1000000))
+        ## WASP filtering: request the vW SAM tag and pass the VCF file with personal variants
+        #if str($oformat.wasp_conditional.waspOutputMode) == "wasp_mode":
+            --waspOutputMode SAMtag
+            --varVCFfile '$oformat.wasp_conditional.varVCFfile'
+        #end if
         ## Handle chimeric options and output
         #if str($chimOutType):
             --chimOutType $chimOutType
@@ -408,6 +414,7 @@
             primary?"/> -->
             <param name="outSAMprimaryFlag" type="hidden" value="OneBestScore" />
             <expand macro="outSAMmapqUnique"/>
+            <expand macro="wasp"/>
         <section name="filter" title="Output filter criteria" expanded="true">
             <param name="basic_filters" type="select" display="checkboxes" multiple="true" optional="true"
@@ -565,20 +572,24 @@
         <expand macro="outWigOutputs"/>
         <test expect_num_outputs="3">
             <conditional name="singlePaired">
-                <param name="sPaired" value="single" />
-                <param name="input1" value="tophat_in2.fastqsanger" ftype="fastqsanger" />
+                <param name="sPaired" value="paired" />
+                <param name="input1" value="pbmc_1k_v2_L001.R1.10k.fastq.gz" ftype="fastqsanger.gz" />
+                <param name="input2" value="pbmc_1k_v2_L001.R2.10k.fastq.gz" ftype="fastqsanger.gz" />
             <conditional name="refGenomeSource">
                 <param name="geneSource" value="history" />
-                <param name="genomeFastaFiles" value="tophat_test.fa.gz" />
+                <param name="genomeFastaFiles" value="filtered3.Homo_sapiens.GRCh38.dna.chromosome.21.fa.gz" />
                 <param name="genomeSAindexNbases" value="5" />
             <section name="oformat">
                 <param name="outSAMattributes" value="NH,HI,AS,nM,NM,MD,jM,jI,MC,ch" />
+                <conditional name="wasp_conditional">
+                    <param name="waspOutputMode" value="wasp_mode"/>
+                    <param name="varVCFfile" value="filtered3.vcf" ftype="vcf" />
+                </conditional>
             <section name="algo">
                 <conditional name="params">
@@ -586,8 +597,7 @@
             <output name="output_log" file="rnastar_test.log" compare="re_match_multiline" />
-            <output name="splice_junctions" file="rnastar_test_splicejunctions.bed"/>
-            <output name="mapped_reads" file="rnastar_test_mapped_reads.bam" compare="sim_size" delta="634" />
+            <output name="splice_junctions" file="rnastar_test_splicejunctions_wasp.bed"/>
         <!-- test with cached genome index -->
         <test expect_num_outputs="3">
diff -r b0f2be869d6d -r 4014de1b6daf test-data/filtered3.vcf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered3.vcf Tue Aug 27 14:11:16 2024 +0000
diff -r b0f2be869d6d -r 4014de1b6daf test-data/filtered4.bam
Binary file test-data/filtered4.bam has changed
diff -r b0f2be869d6d -r 4014de1b6daf test-data/rnastar_test.log
--- a/test-data/rnastar_test.log Wed Feb 14 09:03:31 2024 +0000
+++ b/test-data/rnastar_test.log Tue Aug 27 14:11:16 2024 +0000
@@ -3,32 +3,35 @@
                                     Finished on | .*
        Mapping speed, Million of reads per hour | .*
-                          Number of input reads | 100
-                      Average input read length | 75
+                          Number of input reads | 15447
+                      Average input read length | 119
                                     UNIQUE READS:
-                   Uniquely mapped reads number | 99
-                        Uniquely mapped reads % | 99.00%
-                          Average mapped length | 74.65
-                       Number of splices: Total | 52
+                   Uniquely mapped reads number | 150
+                        Uniquely mapped reads % | 0.97%
+                          Average mapped length | 105.11
+                       Number of splices: Total | 131
             Number of splices: Annotated (sjdb) | 0
-                       Number of splices: GT/AG | 52
-                       Number of splices: GC/AG | 0
-                       Number of splices: AT/AC | 0
-               Number of splices: Non-canonical | 0
-                      Mismatch rate per base, % | 2.00%
-                         Deletion rate per base | 0.00%
-                        Deletion average length | 0.00
-                        Insertion rate per base | 0.00%
-                       Insertion average length | 0.00
+                       Number of splices: GT/AG | 100
+                       Number of splices: GC/AG | 6
+                       Number of splices: AT/AC | 1
+               Number of splices: Non-canonical | 24
+                      Mismatch rate per base, % | 6.68%
+                         Deletion rate per base | 0.04%
+                        Deletion average length | 1.17
+                        Insertion rate per base | 0.04%
+                       Insertion average length | 1.50
                              MULTI-MAPPING READS:
-        Number of reads mapped to multiple loci | 1
-             % of reads mapped to multiple loci | 1.00%
-        Number of reads mapped to too many loci | 0
-             % of reads mapped to too many loci | 0.00%
+        Number of reads mapped to multiple loci | 451
+             % of reads mapped to multiple loci | 2.92%
+        Number of reads mapped to too many loci | 58
+             % of reads mapped to too many loci | 0.38%
                                   UNMAPPED READS:
+  Number of reads unmapped: too many mismatches | 0
        % of reads unmapped: too many mismatches | 0.00%
-                 % of reads unmapped: too short | 0.00%
-                     % of reads unmapped: other | 0.00%
+            Number of reads unmapped: too short | 7912
+                 % of reads unmapped: too short | 51.22%
+                Number of reads unmapped: other | 6876
+                     % of reads unmapped: other | 44.51%
                                   CHIMERIC READS:
                        Number of chimeric reads | 0
                             % of chimeric reads | 0.00%
diff -r b0f2be869d6d -r 4014de1b6daf test-data/rnastar_test_splicejunctions_wasp.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rnastar_test_splicejunctions_wasp.bed Tue Aug 27 14:11:16 2024 +0000
@@ -0,0 +1,193 @@
