Mercurial > repos > iuc > data_manager_star_index_builder

--- a/data_manager/macros.xml	Sun Apr 16 08:31:33 2023 +0000
+++ b/data_manager/macros.xml	Thu Dec 05 06:49:40 2024 +0000
@@ -4,8 +4,8 @@
     The data manager uses a symlink to this macro file to keep the STAR and
     the index versions in sync, but you should manually update @IDX_VERSION_SUFFIX@ -->
     <!-- STAR version to be used -->
-    <token name="@TOOL_VERSION@">2.7.10b</token>
-    <token name="@VERSION_SUFFIX@">3</token>
+    <token name="@TOOL_VERSION@">2.7.11a</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">21.01</token>
     <!-- STAR index version compatible with this version of STAR
     This is the STAR version that introduced the index structure expected
@@ -15,18 +15,16 @@
     or by looking for the versionGenome parameter in source/parametersDefault
     of STAR's source code -->
     <token name="@IDX_VERSION@">2.7.4a</token>
-    <token name="@IDX_VERSION_SUFFIX@">1</token>
+    <token name="@IDX_VERSION_SUFFIX@">3</token>
     <token name="@IDX_DATA_TABLE@">rnastar_index2x_versioned</token>
-
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">star</requirement>
-            <requirement type="package" version="1.16.1">samtools</requirement>
-            <requirement type="package" version="1.12">gzip</requirement>
-            <yield />
+            <requirement type="package" version="1.18">samtools</requirement>
+            <requirement type="package" version="1.13">gzip</requirement>
+            <yield/>
         </requirements>
     </xml>
-
     <xml name="edam">
         <edam_topics>
             <edam_topic>topic_3170</edam_topic>
@@ -36,20 +34,16 @@
             <edam_operation>operation_0292</edam_operation>
         </edam_operations>
     </xml>
-
     <xml name="index_selection" token_with_gene_model="0">
-        <param argument="--genomeDir" type="select"
-        label="Select reference genome"
-        help="If your genome of interest is not listed, contact the Galaxy team">
+        <param argument="--genomeDir" type="select" label="Select reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
             <options from_data_table="@IDX_DATA_TABLE@">
-                <filter type="static_value" column="4" value="@WITH_GENE_MODEL@" />
-                <filter type="static_value" column="5" value="@IDX_VERSION@" />
-                <filter type="sort_by" column="2" />
-                <validator type="no_options" message="No indexes are available for the selected input dataset" />
+                <filter type="static_value" column="4" value="@WITH_GENE_MODEL@"/>
+                <filter type="static_value" column="5" value="@IDX_VERSION@"/>
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
             </options>
         </param>
     </xml>
-
     <token name="@FASTQ_GZ_OPTION@">
         --readFilesCommand zcat
     </token>
@@ -59,8 +53,9 @@
         </citations>
     </xml>
     <xml name="SJDBOPTIONS">
-         <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/>
-         <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>
+        <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/>
+        <param argument="--sjdbGTFfeatureExon" type="text" value="exon" label="Elements from the gene model to use for splice junctions" help="By default and for almost all cases: 'exon', referring to finding junctions at the RNA splice sites. This can optionally be changed to allow splicing at other levels, such as 'gene', 'transcript', 'CDS'."/>
+        <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>
     </xml>
     <xml name="dbKeyActions">
         <actions>
@@ -79,7 +74,7 @@
             </when>
             <when value="history">
                 <action type="metadata" name="dbkey">
-                    <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey" />
+                    <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey"/>
                 </action>
             </when>
         </conditional>
@@ -103,15 +98,17 @@
                 #if str($refGenomeSource.GTFconditional.GTFselect) == 'with-gtf':
                     --sjdbOverhang '${refGenomeSource.GTFconditional.sjdbOverhang}'
                     --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
+                    --sjdbGTFfeatureExon '${refGenomeSource.GTFconditional.sjdbGTFfeatureExon}'
                     #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
                         --sjdbGTFtagExonParentTranscript Parent
                     #end if
                 #end if
             #else:
-                ## ref genome selection is less complex for STARsolo cause
+                ## ref genome selection is less complex for STARsolo because
                 ## with-gtf is mandatory there
                 --sjdbOverhang '${refGenomeSource.sjdbOverhang}'
                 --sjdbGTFfile '${refGenomeSource.sjdbGTFfile}'
+                --sjdbGTFfeatureExon '${refGenomeSource.sjdbGTFfeatureExon}'
                 #if str($refGenomeSource.sjdbGTFfile.ext) == 'gff3':
                     --sjdbGTFtagExonParentTranscript Parent
                 #end if
@@ -119,13 +116,20 @@
             #if str($refGenomeSource.genomeSAindexNbases):
                 --genomeSAindexNbases ${refGenomeSource.genomeSAindexNbases}
             #end if
+            ## Diploid mode
+            #if 'diploidconditional' in $refGenomeSource:
+                #if str($refGenomeSource.diploidconditional.diploid) == 'Yes':
+                    --genomeTransformVCF '${refGenomeSource.diploidconditional.genomeTransformVCF}'
+                    --genomeTransformType Diploid
+                #end if
+            #end if
             --runThreadN \${GALAXY_SLOTS:-4}
             ## in bytes
             --limitGenomeGenerateRAM \$((\${GALAXY_MEMORY_MB:-31000} * 1000000))
         &&
     #end if
     ]]></token>
-    <token name="@REFGENOMEHANDLING@" ><![CDATA[
+    <token name="@REFGENOMEHANDLING@"><![CDATA[
     --runThreadN \${GALAXY_SLOTS:-4}
     --genomeLoad NoSharedMemory
     --genomeDir
@@ -137,13 +141,14 @@
         #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf-with-gtf':
             --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang
             --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
+            --sjdbGTFfeatureExon '${refGenomeSource.GTFconditional.sjdbGTFfeatureExon}'
             #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
                 --sjdbGTFtagExonParentTranscript Parent
             #end if
         #end if
     #end if
     ]]></token>
-    <token name="@READSHANDLING@" ><![CDATA[
+    <token name="@READSHANDLING@"><![CDATA[
     ## Check that the input pairs are of the same type
     ## otherwise STARsolo will run for a long time and then error out.
     ## We consume either repeats of two inputs R1 + R2
@@ -172,59 +177,57 @@
         @FASTQ_GZ_OPTION@
     #end if
     ]]></token>
-    <token name="@LIMITS@" ><![CDATA[
+    <token name="@LIMITS@"><![CDATA[
         --limitOutSJoneRead $getVar('algo.params.junction_limits.limitOutSJoneRead', $getVar('solo.junction_limits.limitOutSJoneRead', 1000))
         --limitOutSJcollapsed $getVar('algo.params.junction_limits.limitOutSJcollapsed', $getVar('solo.junction_limits.limitOutSJcollapsed', 1000000))
         --limitSjdbInsertNsj $getVar('algo.params.junction_limits.limitSjdbInsertNsj', $getVar('solo.junction_limits.limitSjdbInsertNsj', 1000000))
     ]]></token>
     <xml name="ref_selection">
-        <param argument="--genomeFastaFiles" type="data" format="fasta,fasta.gz" label="Select a reference genome" />
-          <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/>
+        <param argument="--genomeFastaFiles" type="data" format="fasta,fasta.gz" label="Select a reference genome"/>
+        <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/>
     </xml>
-    <xml name="stdio" >
+    <xml name="stdio">
         <stdio>
             <regex match="FATAL error" source="both" level="fatal"/>
             <regex match="EXITING: FATAL INPUT ERROR:" source="both" level="fatal"/>
             <regex match="EXITING: fatal error trying to allocate genome arrays, exception thrown: std::bad_alloc" source="both" level="fatal"/>
             <regex match="\[sam_read1\] missing header\? Abort!" source="both" level="fatal"/>
-            <yield />
+            <yield/>
         </stdio>
     </xml>
     <xml name="input_selection">
-        <conditional name="input_types" >
-            <param name="use" type="select" label="Input Type" >
-                <option value="repeat" >Separate barcode and cDNA reads</option>
-                <option value="list_paired" >Paired collection of barcode and cDNA reads</option>
+        <conditional name="input_types">
+            <param name="use" type="select" label="Input Type">
+                <option value="repeat">Separate barcode and cDNA reads</option>
+                <option value="list_paired">Paired collection of barcode and cDNA reads</option>
             </param>
             <when value="repeat">
-                <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input1" type="data"  multiple="true"
-                label="RNA-Seq FASTQ/FASTA file, Barcode reads" />
-                <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input2" type="data"  multiple="true"
-                label="RNA-Seq FASTQ/FASTA file, cDNA reads"/>
+                <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input1" type="data" multiple="true" label="RNA-Seq FASTQ/FASTA file, Barcode reads"/>
+                <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input2" type="data" multiple="true" label="RNA-Seq FASTQ/FASTA file, cDNA reads"/>
             </when>
             <when value="list_paired">
-                <param name="input_collection" collection_type="paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="Collection of Pairs" />
+                <param name="input_collection" collection_type="paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="Collection of Pairs"/>
             </when>
         </conditional>
     </xml>
     <xml name="input_selection_smart_seq">
-        <conditional name="input_types_smart_seq" >
-            <param name="use" type="select" label="Input Type" >
-                <option value="list_single_end" >Single-end FASTQ collection</option>
-                <option value="list_paired_end" >Paired FASTQ collection</option>
+        <conditional name="input_types_smart_seq">
+            <param name="use" type="select" label="Input Type">
+                <option value="list_single_end">Single-end FASTQ collection</option>
+                <option value="list_paired_end">Paired FASTQ collection</option>
             </param>
             <when value="list_single_end">
-                <param name="single_end_collection" collection_type="list" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of single-end FASTQ files" />
+                <param name="single_end_collection" collection_type="list" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of single-end FASTQ files"/>
             </when>
             <when value="list_paired_end">
-                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of paired-end FASTQ files" />
+                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of paired-end FASTQ files"/>
             </when>
         </conditional>
     </xml>
     <xml name="umidedup_options">
         <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other (1MM_All)</option>
-        <option value="1MM_Directional_UMItools" >Directional method from the UMI-tool</option>
-        <option value="1MM_Directional" >Directional with stringent UMI deduplication</option>
+        <option value="1MM_Directional_UMItools">Directional method from the UMI-tool</option>
+        <option value="1MM_Directional">Directional with stringent UMI deduplication</option>
     </xml>
     <xml name="anchor_types">
         <option value="0">Read start</option>
@@ -233,16 +236,16 @@
         <option value="3">Adapter end</option>
     </xml>
     <xml name="cb_match_wl_common">
-        <option value="Exact" >Exact</option>
-        <option value="1MM" >Single match (1MM)</option>
+        <option value="Exact">Exact</option>
+        <option value="1MM">Single match (1MM)</option>
     </xml>
     <xml name="cb_match_wl_cellranger">
-        <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2, 1MM_multi)</option>
-        <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3, 1MM_multi_pseudocounts)</option>
-        <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3, 1MM_multi_Nbase_pseudocounts)</option>
+        <option value="1MM_multi" selected="true">Multiple matches (CellRanger 2, 1MM_multi)</option>
+        <option value="1MM_multi_pseudocounts">Multiple matches (CellRanger 3, 1MM_multi_pseudocounts)</option>
+        <option value="1MM_multi_Nbase_pseudocounts">Multimatching to WL is allowed for CBs with N-bases (CellRanger 3, 1MM_multi_Nbase_pseudocounts)</option>
     </xml>
     <xml name="solo_adapter_params">
-        <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." >
+        <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes.">
             <sanitizer>
                 <valid initial="string.digits">
                     <add value="-"/>
@@ -254,11 +257,11 @@
                 </valid>
             </sanitizer>
         </param>
-        <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" />
-        <param argument="--clipAdapterType" type="select" >
-            <option value="Hamming" selected="true" >Adapter clipping based on Hamming distance</option>
-            <option value="CellRanger4" >5p and 3p adapter clipping similar to CellRanger4</option>
-            <option value="None" >No adapter clipping</option>
+        <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence"/>
+        <param argument="--clipAdapterType" type="select">
+            <option value="Hamming" selected="true">Adapter clipping based on Hamming distance</option>
+            <option value="CellRanger4">5p and 3p adapter clipping similar to CellRanger4</option>
+            <option value="None">No adapter clipping</option>
         </param>
     </xml>
     <xml name="common_SAM_attributes">
@@ -273,14 +276,14 @@
     </xml>
     <xml name="limits">
         <section name="junction_limits" title="Junction Limits" expanded="false">
-            <param argument="--limitOutSJoneRead" type="integer" min="1" value="1000" label="Maximum number of junctions for one read (including all multimappers)" />
-            <param argument="--limitOutSJcollapsed" type="integer" min="1" value="1000000" label="Maximum number of collapsed junctions" />
-            <param argument="--limitSjdbInsertNsj" type="integer" min="0" value="1000000" label="Maximum number of inserts to be inserted into the genome on the fly." />
+            <param argument="--limitOutSJoneRead" type="integer" min="1" value="1000" label="Maximum number of junctions for one read (including all multimappers)"/>
+            <param argument="--limitOutSJcollapsed" type="integer" min="1" value="1000000" label="Maximum number of collapsed junctions"/>
+            <param argument="--limitSjdbInsertNsj" type="integer" min="0" value="1000000" label="Maximum number of inserts to be inserted into the genome on the fly."/>
         </section>
     </xml>
     <xml name="outCountActions">
         <actions>
-            <action name="column_names" type="metadata" default="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand" />
+            <action name="column_names" type="metadata" default="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand"/>
             <expand macro="dbKeyAction"/>
         </actions>
     </xml>
@@ -293,7 +296,7 @@
             </param>
             <when value="None">
                 <!-- This is necessary for the filtering of output -->
-                <param name="outWigStrand" type="hidden" value="false" />
+                <param name="outWigStrand" type="hidden" value="false"/>
             </when>
             <when value="bedGraph">
                 <expand macro="outWigParams"/>
@@ -341,73 +344,92 @@
     <xml name="outWigOutputs">
         <data format="bedgraph" name="signal_unique_str1" label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 1" from_work_dir="Signal.Unique.str1.out">
             <filter>outWig['outWigType'] != "None"</filter>
-            <expand macro="dbKeyActions" />
+            <expand macro="dbKeyActions"/>
             <change_format>
-                <when input="outWig.outWigType" value="wiggle" format="wig" />
+                <when input="outWig.outWigType" value="wiggle" format="wig"/>
             </change_format>
         </data>
         <data format="bedgraph" name="signal_uniquemultiple_str1" label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 1" from_work_dir="Signal.UniqueMultiple.str1.out">
             <filter>outWig['outWigType'] != "None"</filter>
-            <expand macro="dbKeyActions" />
+            <expand macro="dbKeyActions"/>
             <change_format>
-                <when input="outWig.outWigType" value="wiggle" format="wig" />
+                <when input="outWig.outWigType" value="wiggle" format="wig"/>
             </change_format>
         </data>
         <data format="bedgraph" name="signal_unique_str2" label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 2" from_work_dir="Signal.Unique.str2.out">
             <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter>
-            <expand macro="dbKeyActions" />
+            <expand macro="dbKeyActions"/>
             <change_format>
-                <when input="outWig.outWigType" value="wiggle" format="wig" />
+                <when input="outWig.outWigType" value="wiggle" format="wig"/>
             </change_format>
         </data>
         <data format="bedgraph" name="signal_uniquemultiple_str2" label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 2" from_work_dir="Signal.UniqueMultiple.str2.out">
             <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter>
-            <expand macro="dbKeyActions" />
+            <expand macro="dbKeyActions"/>
             <change_format>
-                <when input="outWig.outWigType" value="wiggle" format="wig" />
+                <when input="outWig.outWigType" value="wiggle" format="wig"/>
             </change_format>
         </data>
     </xml>
     <xml name="quantMode">
         <conditional name="quantmode_output">
-            <param argument="--quantMode" type="select"
-            label="Per gene/transcript output"
-            help="STAR can provide analysis results not only with respect to the reference genome, but also with respect to genes and transcripts described by a gene model. Note: This functionality requires either the selection above of a cached index with a gene model, or a gene model provided alongside the index/reference genome in GTF or GFF3 format!">
+            <param argument="--quantMode" type="select" label="Per gene/transcript output" help="STAR can provide analysis results not only with respect to the reference genome, but also with respect to genes and transcripts described by a gene model. Note: This functionality requires either the selection above of a cached index with a gene model, or a gene model provided alongside the index/reference genome in GTF or GFF3 format!">
                 <option value="-">No per gene or transcript output</option>
                 <option value="GeneCounts">Per gene read counts (GeneCounts)</option>
                 <option value="TranscriptomeSAM">Transcript-based BAM output (TranscriptomeSAM)</option>
                 <option value="TranscriptomeSAM GeneCounts">Both per gene read counts and transcript-based BAM output (TranscriptomeSAM GeneCounts)</option>
             </param>
-            <when value="-" />
-            <when value="GeneCounts" />
+            <when value="-"/>
+            <when value="GeneCounts"/>
             <when value="TranscriptomeSAM">
-                <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend"
-                label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?"
-                help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled." />
+                <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled."/>
             </when>
             <when value="TranscriptomeSAM GeneCounts">
-                <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend"
-                label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?"
-                help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled." />
+                <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled."/>
             </when>
         </conditional>
     </xml>
     <xml name="quantModeNoGTF">
         <conditional name="quantmode_output">
-            <param argument="--quantMode" type="select"
-            label="Per gene/transcript output">
+            <param argument="--quantMode" type="select" label="Per gene/transcript output">
                 <option value="-">No per gene or transcript output as no GTF was provided</option>
             </param>
-            <when value="-" />
+            <when value="-"/>
         </conditional>
     </xml>
     <xml name="outSAMmapqUnique">
         <!-- MAPQ 255 is the default in STAR (coming from tophat behaviour and compatibility for Cufflinks) but it is a problematic value
         - according to SAM/BAM specs it means "undefined".
         - Using 255 as the max mapq causes problems with modern downstream tools like mutect2: https://sites.duke.edu/workblog/2021/08/18/star-rnaseq-gatk-mutect2/ and 60 has become an unofficial replacement for 255. -->
-        <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255"
-        label="MAPQ value for unique mappers"
-        help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is
-used: >=5 mappings => MAPQ=0; 3-4 mappings => MAPQ=1; 2 mappings => MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. Set to 255 for compatibility with Cufflink (default in STAR) but keep to 60 for modern downstream tools like mutect2." />
+        <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255" label="MAPQ value for unique mappers" help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is used: &gt;=5 mappings =&gt; MAPQ=0; 3-4 mappings =&gt; MAPQ=1; 2 mappings =&gt; MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. Set to 255 for compatibility with Cufflinks (default in STAR) but keep to 60 for modern downstream tools like mutect2."/>
+    </xml>
+    <xml name="wasp">
+        <!--
+            This is a re-implementation of the original WASP algorithm by Bryce van de Geijn, Graham McVicker,
+            Yoav Gilad and Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12,
+            1061–1063 (2015) https://www.nature.com/articles/nmeth.3582. WASP filtering is activated
+            with "waspOutputMode SAMtag".
+            -->
+        <conditional name="wasp_conditional">
+            <param argument="--waspOutputMode" type="select" label="Activate WASP filtering">
+                <help><![CDATA[This is a reimplementation of the original WASP algorithm by Bryce van de Geijn, Graham McVicker,
+                    Yoav Gilad and Jonathan K Pritchard. https://doi.org/10.1038/nmeth.3582. This option will add the vW tag to the SAM output. vW:i:1 means
+                    alignment passed WASP filtering, and all other values mean it did not:<br/>
+                    - vW:i:2 = multi-mapping read<br/>
+                    - vW:i:3 = variant base in the read is N (non-ACGT)<br/>
+                    - vW:i:4 = remapped read did not map <br/>
+                    - vW:i:5 = remapped read multi-maps <br/>
+                    - vW:i:6 = remapped read maps to a different locus <br/>
+                    - vW:i:7 = read overlaps too many variants <br/>
+                    ]]>
+                </help>
+                <option value="" selected="true">No WASP filtering</option>
+                <option value="wasp_mode">Activate WASP filtering</option>
+            </param>
+            <when value="wasp_mode">
+                <param argument="--varVCFfile" type="data" format="vcf" label="VCF file with personal variants" help="Each variant is expected to have a genotype with two alleles. The VCF file needs to have the 10th column with genotype recorded as 0/1, 1/0, 1/1 (or | instead of /)"/>
+            </when>
+            <when value=""/>
+        </conditional>
     </xml>
 </macros>
--- a/data_manager/rna_star_index_builder.py	Sun Apr 16 08:31:33 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-import json
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config-file')
-    parser.add_argument('--value')
-    parser.add_argument('--dbkey')
-    parser.add_argument('--name')
-    parser.add_argument('--subdir')
-    parser.add_argument('--data-table')
-    parser.add_argument('--with-gene-model', action='store_true')
-    parser.add_argument('--index-version')
-
-    args = parser.parse_args()
-
-    if args.dbkey in [None, '', '?']:
-        raise Exception(
-            '"%s" is not a valid dbkey. You must specify a valid dbkey.'
-            % (args.dbkey)
-        )
-
-    with_gene_model = "0"
-    if args.with_gene_model:
-        with_gene_model = "1"
-
-    data_manager_dict = {
-        'data_tables': {
-            args.data_table: [
-                {
-                    "value": args.value,
-                    "dbkey": args.dbkey,
-                    "name": args.name,
-                    "path": args.subdir,
-                    "with_gene_model": with_gene_model,
-                    "version": args.index_version
-                }
-            ]
-        }
-    }
-    with open(args.config_file, 'w') as fh:
-        json.dump(data_manager_dict, fh, sort_keys=True)
-
-
-if __name__ == "__main__":
-    main()
--- a/data_manager/rna_star_index_builder.xml	Sun Apr 16 08:31:33 2023 +0000
+++ b/data_manager/rna_star_index_builder.xml	Thu Dec 05 06:49:40 2024 +0000
@@ -1,15 +1,15 @@
-<tool id="rna_star_index_builder_data_manager" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@+galaxy@IDX_VERSION_SUFFIX@" profile="19.05">
+<tool id="rna_star_index_builder_data_manager" name="RNAStar index versioned" tool_type="manage_data" version="@IDX_VERSION@+galaxy@IDX_VERSION_SUFFIX@" profile="23.0">
     <description>builder</description>

     <macros>
         <import>macros.xml</import>
     </macros>

-    <expand macro="requirements">
-        <requirement type="package" version="3.7">python</requirement>
-    </expand>
+    <expand macro="requirements"/>

     <command><![CDATA[
+. '$dmfxns' &&
+
 if [ -z "\$GALAXY_MEMORY_MB" ] ; then
     GALAXY_MEMORY_BYTES=31000000000 ;
 else
@@ -22,6 +22,27 @@

 mkdir '${target_directory}' &&

+## Count the reference bases once; both automatic estimates below start from this total.
+#if $auto_sa_index_nbases or $auto_chr_bin_nbits:
+nbases="\$(grep -v '^>' '${all_fasta_source.fields.path}' | tr -d '\n' | wc -c)" &&
+echo "Bases in reference: \$nbases" &&
+#end if
+## genomeSAindexNbases = min(14, log2(GenomeLength)/2 - 1); log2 is the integer helper sourced from dmfxns.
+#if $auto_sa_index_nbases:
+saindex_nbases=\$((\$(log2 \$nbases) / 2 - 1)) &&
+[[ \$saindex_nbases -lt 14 ]] || saindex_nbases=14 &&
+#else if $advanced_options.advanced_options_selector == "advanced":
+saindex_nbases=${advanced_options.genomeSAindexNbases} &&
+#end if
+## genomeChrBinNbits = min(18, log2(GenomeLength/NumberOfReferences)): log of the quotient, not a quotient of logs.
+#if $auto_chr_bin_nbits:
+nseqs="\$(grep -c '^>' '${all_fasta_source.fields.path}')" &&
+echo "Sequences in reference: \$nseqs" &&
+chr_bin_nbits=\$(log2 \$((\$nbases / \$nseqs))) &&
+[[ \$chr_bin_nbits -lt 18 ]] || chr_bin_nbits=18 &&
+#else if $advanced_options.advanced_options_selector == "advanced":
+chr_bin_nbits=${advanced_options.genomeChrBinNbits} &&
+#end if
 STAR
 --runMode genomeGenerate
 --genomeFastaFiles '${all_fasta_source.fields.path}'
@@ -31,29 +52,52 @@
     --sjdbGTFfile '${GTFconditional.sjdbGTFfile}'
     --sjdbOverhang ${GTFconditional.sjdbOverhang}
 #end if
+#if $advanced_options.advanced_options_selector == "advanced" or $auto_sa_index_nbases:
+    --genomeSAindexNbases "\$saindex_nbases"
+#end if
+#if $advanced_options.advanced_options_selector == "advanced" or $auto_chr_bin_nbits:
+    --genomeChrBinNbits "\$chr_bin_nbits"
+#end if
 #if $advanced_options.advanced_options_selector == "advanced":
-    --genomeSAindexNbases ${advanced_options.genomeSAindexNbases}
-    --genomeChrBinNbits ${advanced_options.genomeChrBinNbits}
     --genomeSAsparseD ${advanced_options.genomeSAsparseD}
 #end if
 --runThreadN \${GALAXY_SLOTS:-2} &&

-python '${__tool_directory__}/rna_star_index_builder.py'
---config-file '${out_file}'
---value '${all_fasta_source.fields.value}'
---dbkey '${all_fasta_source.fields.dbkey}'
---index-version '@IDX_VERSION@'
-#if $name:
-    --name '$name'
-#else
-    --name '${all_fasta_source.fields.name}'
-#end if
-#if str($GTFconditional.GTFselect) == "withGTF":
-    --with-gene-model
-#end if
---data-table @IDX_DATA_TABLE@
---subdir '${subdir}'
+cp '$dmjson' '$out_file'
     ]]></command>
+    <configfiles>
+        <configfile name="dmfxns"><![CDATA[
+function log2() {
+    ## Integer floor(log2) of the first argument by repeated halving; results below 1 are reported as 1.
+    local remaining=\$1 bits=0
+    while [[ \$remaining -gt 1 ]]; do
+        remaining=\$((remaining / 2))
+        bits=\$((bits + 1))
+    done
+    if [[ \$bits -gt 0 ]]; then echo \$bits; else echo 1; fi
+}
+]]></configfile>
+        <configfile name="dmjson"><![CDATA[#slurp
+## Display name: a non-empty user-supplied name wins, otherwise the name of the selected all_fasta entry is reused.
+#set $name = $name or $all_fasta_source.fields.name
+## Written to the data table as the string "1" when GTFselect is withGTF and "0" otherwise.
+#set $with_gene_model = 1 if str($GTFconditional.GTFselect) == "withGTF" else 0
+{
+  "data_tables":{
+    "@IDX_DATA_TABLE@":[
+      {
+        "value": "${all_fasta_source.fields.value}",
+        "dbkey": "${all_fasta_source.fields.dbkey}",
+        "name": "${name}",
+        "path": "SA",
+        "with_gene_model": "${with_gene_model}",
+        "version": "@IDX_VERSION@"
+      }
+    ]
+  }
+}
+]]></configfile>
+    </configfiles>
     <inputs>
         <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
             <options from_data_table="all_fasta"/>
@@ -71,6 +115,12 @@
             </when>
             <when value="withoutGTF" />
         </conditional>
+        <param name="auto_sa_index_nbases" type="boolean" checked="true"
+            label="Automatically calculate --genomeSAindexNbases"
+            help="The value specified for --genomeSAindexNbases in advanced options will be ignored if this option is selected"/>
+        <param name="auto_chr_bin_nbits" type="boolean" checked="true"
+            label="Automatically calculate --genomeChrBinNbits"
+            help="The value specified for --genomeChrBinNbits in advanced options will be ignored if this option is selected"/>
         <conditional name="advanced_options">
             <param name="advanced_options_selector" type="select" label="Advanced options">
                 <option value="default" selected="true">Use default options</option>
@@ -90,7 +140,8 @@
                         of contigs, it is recommended to scale this parameter as min(18,
                         log2[max(GenomeLength/NumberOfReferences,ReadLength)]). For example, for 3 gigaBase
                         genome with 100,000 chromosomes/scaffolds, this is equal to 15."/>
-                <param argument="--genomeSAsparseD" type="integer" min="1" value="1" label="Suffix array sparsity"
+                <param argument="--genomeSAsparseD" type="integer" min="1" value="1"
+                    label="Suffix array sparsity"
                     help="The distance between indices: use bigger numbers to decrease needed RAM at the cost of
                         mapping speed reduction"/>
             </when>
@@ -104,11 +155,16 @@
     <tests>
         <test>
             <param name="all_fasta_source" value="phiX174"/>
-            <param name="sequence_name" value="phiX"/>
-            <param name="sequence_id" value="minimal-settings"/>
-            <param name="modelformat" value="None"/>
-
-            <output name="out_file" file="test_star_01.data_manager_json" compare="re_match"/>
+            <output name="out_file" file="test_star_01.data_manager_json"/>
+        </test>
+        <test>
+            <param name="all_fasta_source" value="phiX174"/>
+            <param name="name" value="phiX"/>
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text='"name": "phiX"'/>
+                </assert_contents>
+            </output>
         </test>
     </tests>
--- a/data_manager_conf.xml	Sun Apr 16 08:31:33 2023 +0000
+++ b/data_manager_conf.xml	Thu Dec 05 06:49:40 2024 +0000
@@ -12,9 +12,9 @@
                             out_file.extra_files_path is used as base by default
                             if no source, eg for type=directory, then refers to base
                         -->
-                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">rnastar/${version}/${dbkey}/${value}/${path}</target>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">genomes/${dbkey}/rnastar_index/v${version}/${value}</target>
                     </move>
-                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/rnastar/${version}/${dbkey}/${value}/${path}</value_translation>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/genomes/${dbkey}/rnastar_index/v${version}/${value}</value_translation>
                     <value_translation type="function">abspath</value_translation>
                 </column>
                 <column name="with_gene_model" />
--- a/test-data/test_star_01.data_manager_json	Sun Apr 16 08:31:33 2023 +0000
+++ b/test-data/test_star_01.data_manager_json	Thu Dec 05 06:49:40 2024 +0000
@@ -1,1 +1,14 @@
-{"data_tables": {"rnastar_index2x_versioned": \[{"dbkey": "phiX174", "name": "phiX174", "path": ".*", "value": "phiX174", "version": "2.7.4a", "with_gene_model": "0"}\]}}
+{
+  "data_tables":{
+    "rnastar_index2x_versioned":[
+      {
+        "value": "phiX174",
+        "dbkey": "phiX174",
+        "name": "phiX174",
+        "path": "SA",
+        "with_gene_model": "0",
+        "version": "2.7.4a"
+      }
+    ]
+  }
+}
--- a/tool_data_table_conf.xml.test	Sun Apr 16 08:31:33 2023 +0000
+++ b/tool_data_table_conf.xml.test	Thu Dec 05 06:49:40 2024 +0000
@@ -5,8 +5,8 @@
         <file path="${__HERE__}/test-data/all_fasta.loc" />
     </table>
     <!-- Locations of STAR indexes -->
-    <table name="rnastar_index2_versioned" comment_char="#" allow_duplicate_entries="False">
+    <table name="rnastar_index2x_versioned" comment_char="#" allow_duplicate_entries="False">
         <columns>value, dbkey, name, path, with_gene_model, version</columns>
-        <file path="${__HERE__}/test-data/rnastar_index2_versioned.loc" />
+        <file path="${__HERE__}/test-data/rnastar_index2x_versioned.loc" />
     </table>
 </tables>