Mercurial > repos > iuc > rna_starsolo

--- a/macros.xml	Thu Dec 05 06:50:56 2024 +0000
+++ b/macros.xml	Sat May 31 19:53:27 2025 +0000
@@ -4,8 +4,8 @@
     The data manager uses a symlink to this macro file to keep the STAR and
     the index versions in sync, but you should manually update @IDX_VERSION_SUFFIX@ -->
     <!-- STAR version to be used -->
-    <token name="@TOOL_VERSION@">2.7.11a</token>
-    <token name="@VERSION_SUFFIX@">1</token>
+    <token name="@TOOL_VERSION@">2.7.11b</token>
+    <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE@">21.01</token>
     <!-- STAR index version compatible with this version of STAR
     This is the STAR version that introduced the index structure expected
@@ -121,7 +121,7 @@
                 #if str($refGenomeSource.diploidconditional.diploid) == 'Yes':
                     --genomeTransformVCF '${refGenomeSource.diploidconditional.genomeTransformVCF}'
                     --genomeTransformType Diploid
-                #end if
+                #end if
             #end if
             --runThreadN \${GALAXY_SLOTS:-4}
             ## in bytes
@@ -371,6 +371,13 @@
             </change_format>
         </data>
     </xml>
+    <xml name="quantTranscriptomeSAMoutput_param"> <!-- shared quantTranscriptomeSAMoutput select; expanded in both TranscriptomeSAM branches of the quantmode_output conditional -->
+        <param argument="--quantTranscriptomeSAMoutput" type="select" label="Alignment filtering for TranscriptomeSAM output">
+            <option value="BanSingleEnd_BanIndels_ExtendSoftclip" selected="true">prohibit indels and single-end alignments, extend softclips - compatible with RSEM</option>
+            <option value="BanSingleEnd">prohibit single-end alignments, allow indels and softclips</option>
+            <option value="BanSingleEnd_ExtendSoftclip">prohibit single-end alignments, extend softclips, allow indels</option>
+        </param>
+    </xml>
     <xml name="quantMode">
         <conditional name="quantmode_output">
             <param argument="--quantMode" type="select" label="Per gene/transcript output" help="STAR can provide analysis results not only with respect to the reference genome, but also with respect to genes and transcripts described by a gene model. Note: This functionality requires either the selection above of a cached index with a gene model, or a gene model provided alongside the index/reference genome in GTF or GFF3 format!">
@@ -382,10 +389,10 @@
             <when value="-"/>
             <when value="GeneCounts"/>
             <when value="TranscriptomeSAM">
-                <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled."/>
+                <expand macro="quantTranscriptomeSAMoutput_param"/>
             </when>
             <when value="TranscriptomeSAM GeneCounts">
-                <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled."/>
+                <expand macro="quantTranscriptomeSAMoutput_param"/>
             </when>
         </conditional>
     </xml>
@@ -432,4 +439,149 @@
             <when value=""/>
         </conditional>
     </xml>
+    <xml name="full_algo_params"> <!-- inputs of the extended parameter list: seed, align and chim_settings sections plus the limits macro; consumed by the ALGO_FULL token -->
+        <section name="seed" title="Seed parameters" expanded="false">
+            <param argument="--seedSearchStartLmax" type="integer" min="1" value="50" label="Search start point through the read"/>
+            <param argument="--seedSearchStartLmaxOverLread" type="float" min="0" value="1.0" label="Search start point through the read, normalized to read length"/>
+            <param argument="--seedSearchLmax" type="integer" min="0" value="0" label="Maximum length of seeds" help="Default of 0 indicates no maximum length"/>
+            <param argument="--seedMultimapNmax" type="integer" min="1" value="10000" label="Maximum number of mappings to use a piece in stitching"/>
+            <param argument="--seedPerReadNmax" type="integer" min="1" value="1000" label="Maximum number of seeds per read"/>
+            <param argument="--seedPerWindowNmax" type="integer" min="1" value="50" label="Maximum number of seeds per window"/>
+            <param argument="--seedNoneLociPerWindow" type="integer" min="1" value="10" label="Maximum number of one seed loci per window"/>
+        </section>
+        <section name="align" title="Alignment parameters" expanded="false">
+            <param argument="--alignIntronMin" type="integer" min="0" value="21" label="Minimum intron size"/>
+            <param argument="--alignIntronMax" type="integer" min="0" value="0" label="Maximum intron size"/>
+            <param argument="--alignMatesGapMax" type="integer" min="0" value="0" label="Maximum gap between two mates"/>
+            <param argument="--alignSJoverhangMin" type="integer" min="1" value="5" label="Minimum overhang for spliced alignments"/>
+            <section name="alignSJstitchMismatchNmax" title="Maximum number of mismatches for stitching of the splice junctions (-1: no limit)" expanded="true"> <!-- the four values below are written, in order, after a single alignSJstitchMismatchNmax option on the command line -->
+                <param argument="--alignSJstitchMismatchNmax" name="alignSJstitchMismatchNmax1" type="integer" min="-1" value="0" label="Non-canonical motifs"/>
+                <param argument="--alignSJstitchMismatchNmax" name="alignSJstitchMismatchNmax2" type="integer" min="-1" value="-1" label="GT/AG and CT/AC motif"/>
+                <param argument="--alignSJstitchMismatchNmax" name="alignSJstitchMismatchNmax3" type="integer" min="-1" value="0" label="GC/AG and CT/GC motif"/>
+                <param argument="--alignSJstitchMismatchNmax" name="alignSJstitchMismatchNmax4" type="integer" min="-1" value="0" label="AT/AC and GT/AT motif"/>
+            </section>
+            <param argument="--alignSJDBoverhangMin" type="integer" min="1" value="3" label="Minimum overhang for annotated spliced alignments"/>
+            <param argument="--alignSplicedMateMapLmin" type="integer" min="0" value="0" label="Minimum mapped length for a read mate that is spliced"/>
+            <param argument="--alignSplicedMateMapLminOverLmate" type="float" min="0" value="0.66" label="Minimum mapped length for a read mate that is spliced, normalized to mate length"/>
+            <param argument="--alignWindowsPerReadNmax" type="integer" min="1" value="10000" label="Maximum number of windows per read"/>
+            <param argument="--alignTranscriptsPerWindowNmax" type="integer" min="1" value="100" label="Maximum number of transcripts per window"/>
+            <param argument="--alignTranscriptsPerReadNmax" type="integer" min="1" value="10000" label="Maximum number of different alignments per read to consider"/>
+            <param argument="--alignEndsType" type="select" label="type of read ends alignment">
+                <option value="Local">standard local alignment with soft-clipping allowed</option>
+                <option value="EndToEnd">force end-to-end read alignment, do not soft-clip</option>
+                <option value="Extend5pOfRead1">fully extend only the 5p of the read1, all other ends: local alignment</option>
+                <option value="Extend5pOfReads12">fully extend only the 5p of the both read1 and read2, all other ends: local alignment</option>
+            </param>
+            <param argument="--peOverlapNbasesMin" type="integer" min="0" value="0"
+            label="minimum number of overlap bases to trigger mates merging and realignment" />
+            <param argument="--peOverlapMMp" type="float" min="0" max="1" value="0.01"
+            label="maximum proportion of mismatched bases in the overlap area" />
+        </section>
+        <section name="chim_settings" title="Chimeric alignment parameters" expanded="false"> <!-- these values only reach the command line when chimOutType is non-empty -->
+            <param argument="--chimSegmentMin" type="integer" min="1" value="12"
+            label="Minimum length of chimeric segment"
+            help="For small numbers this will cause large number of chimeric alignments. A value of 12 is commonly used." />
+            <param argument="--chimScoreMin" type="integer" min="0" value="0"
+            label="Minimum total (summed) score of chimeric segments"/>
+            <param argument="--chimScoreDropMax" type="integer" min="0" value="20"
+            label="Maximum difference of chimeric score from read length"/>
+            <param argument="--chimScoreSeparation" type="integer" min="0" value="10"
+            label="Minimum difference between the best chimeric score and the next one"/>
+            <param argument="--chimScoreJunctionNonGTAG" type="integer" value="-1"
+            label="Penalty for a non-GT/AG chimeric junction"/>
+            <param argument="--chimJunctionOverhangMin" type="integer" min="0" value="20"
+            label="Minimum overhang for a chimeric junction"/>
+            <param argument="--chimSegmentReadGapMax" type="integer" min="0" value="0"
+            label="Maximum gap in the read sequence between chimeric segments" />
+            <param argument="--chimFilter" type="boolean" truevalue="banGenomicN" falsevalue="None" checked="true"
+            label="Discard chimeric alignments with Ns in the genome sequence around the chimeric junction" />
+            <param argument="--chimMainSegmentMultNmax" type="integer" min="1" value="10"
+            label="Maximum number of multi-alignments for the main chimeric segment."
+            help="A value of 1 prohibits multimapping main segments"/>
+            <param argument="--chimMultimapNmax" type="integer" min="1" value="1"
+            label="Maximum number of chimeric multi-alignments"
+            help="The default value of 1 only considers unique alignments. If you chose to report chimeric alignments alongside regular ones in the BAM output, this setting is ignored and only uniquely mapping chimeric reads get reported. " />
+            <param argument="--chimMultimapScoreRange" type="integer" min="0" value="1"
+            label="Score range for multi-mapping chimeras"
+            help="The threshold below the best chimeric score that a multimapping chimera must have to be output. This is ignored unless --chimMultimapNmax is above 1" />
+        </section>
+        <expand macro="limits" />
+    </xml>
+    <xml name="chim_params"> <!-- chimOutType select; an empty value suppresses every chimeric option in the ALGO_FULL and ALGO_DEFAULT command tokens -->
+        <param argument="--chimOutType" type="select"
+        label="Report chimeric alignments?"
+        help="Choose if and how chimeric alignments should be reported. STAR-Fusion users should select the 'Junctions' option and use the resulting tabular dataset as input to STAR-Fusion. Everyone else: note that selecting any of the 'WithinBAM' options disables the --chimMultimapNmax setting in the algorithmic settings section (the tool will only consider uniquely mapped reads in the search for chimeric alignments). If you disable the reporting of chimeric alignments here, then all chimeric alignment settings in the algorithmic settings section will be ignored.">
+            <option value="">Don't report chimeric alignments</option>
+            <option value="Junctions">As separate tabular "Junctions" output (Junctions)</option>
+            <option value="WithinBAM">Within the BAM output (together with regular alignments; WithinBAM)</option>
+            <option value="WithinBAM HardClip">Within the BAM output (together with regular alignments; WithinBAM HardClip) hard-clipping in the CIGAR for supplemental chimeric alignments</option>
+            <option value="WithinBAM SoftClip">Within the BAM output (together with regular alignments; WithinBAM SoftClip) soft-clipping in the CIGAR for supplemental chimeric alignments</option>
+        </param>
+    </xml>
+    <token name="@ALGO_FULL@"><![CDATA[
+            ## Command-line options built from the extended parameter list
+
+            ## Seed section
+            --seedSearchStartLmax ${algo.params.seed.seedSearchStartLmax}
+            --seedSearchStartLmaxOverLread ${algo.params.seed.seedSearchStartLmaxOverLread}
+            --seedSearchLmax ${algo.params.seed.seedSearchLmax}
+            --seedMultimapNmax ${algo.params.seed.seedMultimapNmax}
+            --seedPerReadNmax ${algo.params.seed.seedPerReadNmax}
+            --seedPerWindowNmax ${algo.params.seed.seedPerWindowNmax}
+            --seedNoneLociPerWindow ${algo.params.seed.seedNoneLociPerWindow}
+
+            ## Alignment section (the four alignSJstitchMismatchNmax values share one option)
+            --alignIntronMin ${algo.params.align.alignIntronMin}
+            --alignIntronMax ${algo.params.align.alignIntronMax}
+            --alignMatesGapMax ${algo.params.align.alignMatesGapMax}
+            --alignSJoverhangMin ${algo.params.align.alignSJoverhangMin}
+            --alignSJstitchMismatchNmax ${algo.params.align.alignSJstitchMismatchNmax.alignSJstitchMismatchNmax1} ${algo.params.align.alignSJstitchMismatchNmax.alignSJstitchMismatchNmax2} ${algo.params.align.alignSJstitchMismatchNmax.alignSJstitchMismatchNmax3} ${algo.params.align.alignSJstitchMismatchNmax.alignSJstitchMismatchNmax4}
+            --alignSJDBoverhangMin ${algo.params.align.alignSJDBoverhangMin}
+            --alignSplicedMateMapLmin ${algo.params.align.alignSplicedMateMapLmin}
+            --alignSplicedMateMapLminOverLmate ${algo.params.align.alignSplicedMateMapLminOverLmate}
+            --alignWindowsPerReadNmax ${algo.params.align.alignWindowsPerReadNmax}
+            --alignTranscriptsPerWindowNmax ${algo.params.align.alignTranscriptsPerWindowNmax}
+            --alignTranscriptsPerReadNmax ${algo.params.align.alignTranscriptsPerReadNmax}
+            --alignEndsType ${algo.params.align.alignEndsType}
+            --peOverlapNbasesMin ${algo.params.align.peOverlapNbasesMin}
+            --peOverlapMMp ${algo.params.align.peOverlapMMp}
+            ## Chimeric section: skipped entirely when chimeric reporting is off; the user's --chimMultimapNmax is honoured only for Junctions output, otherwise 0 is forced
+            #if str($chimOutType):
+                --chimSegmentMin ${algo.params.chim_settings.chimSegmentMin}
+                --chimScoreMin ${algo.params.chim_settings.chimScoreMin}
+                --chimScoreDropMax ${algo.params.chim_settings.chimScoreDropMax}
+                --chimScoreSeparation ${algo.params.chim_settings.chimScoreSeparation}
+                --chimScoreJunctionNonGTAG ${algo.params.chim_settings.chimScoreJunctionNonGTAG}
+                --chimSegmentReadGapMax ${algo.params.chim_settings.chimSegmentReadGapMax}
+                --chimFilter ${algo.params.chim_settings.chimFilter}
+                --chimJunctionOverhangMin ${algo.params.chim_settings.chimJunctionOverhangMin}
+                --chimMainSegmentMultNmax ${algo.params.chim_settings.chimMainSegmentMultNmax}
+                #if str($chimOutType) != 'Junctions':
+                    --chimMultimapNmax 0
+                #else:
+                    --chimMultimapNmax ${algo.params.chim_settings.chimMultimapNmax}
+                #end if
+                --chimMultimapScoreRange ${algo.params.chim_settings.chimMultimapScoreRange}
+            #end if
+
+            ## Output limits (expanded from the LIMITS token)
+            @LIMITS@
+    ]]></token>
+    <token name="@ALGO_DEFAULT@"><![CDATA[
+            ## Defaults mode: rely on STAR's built-in algorithmic settings.
+            ## The only overrides happen when chimeric reporting is enabled
+            ## (chimOutType is non-empty):
+            ## - --chimSegmentMin is set to 12 (value taken from STAR-Fusion),
+            ##   because the STAR default of 0 disables chimeric alignments.
+            ## - --chimMultimapNmax is set to 1, for consistency, only when
+            ##   chimeric alignments are reported in Junctions format; for the
+            ##   WithinBAM variants the option is not passed at all.
+            #if str($chimOutType):
+                --chimSegmentMin 12
+                #if str($chimOutType) == 'Junctions':
+                    --chimMultimapNmax 1
+                #end if
+            #end if
+    ]]></token>
+
 </macros>
--- a/rg_rnaStarSolo.xml	Thu Dec 05 06:50:56 2024 +0000
+++ b/rg_rnaStarSolo.xml	Sat May 31 19:53:27 2025 +0000
@@ -131,8 +131,12 @@
     $solo.outSAMunmapped
     ## Read MAPQ
     --outSAMmapqUnique ${solo.outSAMmapqUnique}
-    ## Limits
-    @LIMITS@
+
+    #if str( $algo.params.settingsType ) == 'full':
+    @ALGO_FULL@
+    #else:
+    @ALGO_DEFAULT@
+    #end if

     ##outWig:
     @OUTWIG@
@@ -273,7 +277,7 @@
                             <option value="MultiGeneUMI" >Remove lower-count UMIs that map to more than one gene</option>
                             <option value="MultiGeneUMI_All" >Remove all UMIs that map to more than one gene</option>
                             <option value="MultiGeneUMI_CR" >Remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0</option>
-                        </param>
+                        </param>
                     </when>
                 </conditional>
                 <param argument="--soloCBmatchWLtype" type="select" label="Matching the Cell Barcodes to the WhiteList" help="Exact: only exact matches allowed; 1MM: only one match in whitelist with 1 mismatched base allowed. Allowed
@@ -320,7 +324,7 @@
                             <option value="MultiGeneUMI" >Remove lower-count UMIs that map to more than one gene</option>
                             <option value="MultiGeneUMI_All" >Remove all UMIs that map to more than one gene</option>
                             <option value="MultiGeneUMI_CR" >Remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0</option>
-                        </param>
+                        </param>
                     </when>
                 </conditional>
                 <param argument="--soloCBmatchWLtype" type="select" label="Matching the Cell Barcodes to the WhiteList" help="Exact: only exact matches allowed; 1MM: only one match in whitelist with 1 mismatched base allowed. Allowed
@@ -414,8 +418,20 @@
             <param name="quantModeGene" type="boolean" truevalue="GeneCounts" falsevalue="" checked="false" label="Output global gene count" help="Can be used by MultiQC" />
             <param argument="--outSAMunmapped" type="boolean" truevalue="--outSAMunmapped Within" falsevalue="--outSAMunmapped None" checked="false" label="Output unmapped reads in the BAM" />
             <expand macro="outSAMmapqUnique"/>
-            <expand macro="limits" />
         </section>
+        <section name="algo" title="Algorithmic settings" expanded="true">
+            <conditional name="params"> <!-- settingsType decides whether the command uses the ALGO_FULL token (value full) or the ALGO_DEFAULT token (any other value) -->
+                <param name="settingsType" type="select" label="Configure seed, alignment and limits options">
+                    <option value="default" selected="true">Use Defaults</option>
+                    <option value="full">Extended parameter list</option>
+                </param>
+                <when value="default"/>
+                <when value="full">
+                    <expand macro="full_algo_params"/>
+                </when>
+            </conditional>
+        </section>
+        <expand macro="chim_params"/>
         <expand macro="outWig"/>
     </inputs>
     <outputs>
@@ -1381,6 +1397,93 @@
                 <metadata name="column_names" value="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand" />
             </output>
         </test>
+        <test expect_num_outputs="7">
+            <!-- test 14: extended algorithmic settings (settingsType=full) with non-default seed and align values -->
+            <conditional name="refGenomeSource">
+                <param name="geneSource" value="history" />
+                <param name="genomeFastaFiles" value="filtered3.Homo_sapiens.GRCh38.dna.chromosome.21.fa.gz" />
+                <param name="genomeSAindexNbases" value="4" />
+                <param name="sjdbOverhang" value="100" />
+                <param name="sjdbGTFfile" value="filtered3.Homo_sapiens.GRCh38.100.chr21.gtf" ftype="gtf"/>
+            </conditional>
+            <conditional name="sc" >
+                <param name="solo_type" value="CB_UMI_Simple" />
+                <conditional name="input_types">
+                    <param name="use" value="repeat" />
+                    <param name="input1" value="pbmc_1k_v2_L001.R1.10k.fastq.gz" ftype="fastqsanger.gz" />
+                    <param name="input2" value="pbmc_1k_v2_L001.R2.10k.fastq.gz" ftype="fastqsanger.gz" />
+                </conditional>
+                <param name="soloCBwhitelist" value="filtered.barcodes.txt" />
+                <conditional name="params">
+                    <param name="chemistry" value="Cv3" />
+                </conditional>
+                <conditional name="umidedup">
+                    <param name="soloUMIdedup" value="1MM_All" />
+                </conditional>
+            </conditional>
+            <section name="solo" >
+                <conditional name="filter">
+                    <param name="filter_type" value="no_filter" />
+                </conditional>
+                <param name="soloStrand" value="Forward" />
+                <param name="soloFeatures" value="Gene" />
+                <param name="quantModeGene" value="true" />
+                <conditional name="wasp_conditional">
+                    <param name="waspOutputMode" value="wasp_mode"/>
+                    <param name="varVCFfile" value="filtered3.vcf" ftype="vcf" />
+                </conditional>
+            </section>
+            <section name="algo">
+                <conditional name="params">
+                    <param name="settingsType" value="full" />
+                    <section name="seed">
+                        <!-- only inputs declared in the seed section are set here, each exactly once;
+                             every other seed value keeps its default -->
+                        <param name="seedSearchStartLmax" value="25" />
+                    </section>
+                    <section name="align">
+                        <param name="alignIntronMax" value="100" />
+                        <param name="alignEndsType" value="EndToEnd" />
+                    </section>
+                </conditional>
+            </section>
+            <output name="output_barcodes" >
+                <assert_contents>
+                    <!-- first and last line -->
+                    <has_line line="AAACCTGAGCGCTCCA" />
+                    <has_line line="TTTGGTTAGTGGGCTA" />
+                    <has_n_lines n="394" />
+                </assert_contents>
+            </output>
+            <output name="output_genes">
+                <assert_contents>
+                    <has_line_matching expression="ENSG00000279493\s+FP565260\.4\s+Gene\s+Expression" />
+                    <has_line_matching expression="ENSG00000279064\s+FP236315\.1\s+Gene\s+Expression" />
+                    <has_n_lines n="14" />
+                </assert_contents>
+            </output>
+            <output name="output_matrix" >
+                <assert_contents>
+                    <has_line_matching expression="14\s+394\s+6" />
+                    <has_line_matching expression="4\s+284\s+1" />
+                    <has_n_lines n="9" />
+                </assert_contents>
+            </output>
+            <output name="output_stats" >
+                <assert_contents>
+                    <has_line_matching expression="\s+noUnmapped\s+6040" />
+                    <has_line_matching expression="\s+yesUMIs\s+6" />
+                </assert_contents>
+            </output>
+            <output name="output_BAM" value="filtered4_algo_full.bam" ftype="bam" lines_diff="6"/>
+            <output name="reads_per_gene" >
+                <assert_contents>
+                    <has_line_matching expression="ENSG00000279493\s+0\s+0\s+0" />
+                    <has_line_matching expression="ENSG00000275464\s+5\s+0\s+5" />
+                </assert_contents>
+                <metadata name="column_names" value="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand" />
+            </output>
+        </test>
     </tests>
     <help><![CDATA[
 **What it does**
Binary file test-data/filtered4_algo_full.bam has changed