<!-- Mercurial > repos > iuc > rgrnastar -->

<macros>
    <!-- REMEMBER to bump the version of @IDX_VERSION_SUFFIX@
    whenever you make changes to the @TOOL_VERSION@ token!
    The data manager uses a symlink to this macro file to keep the STAR and
    the index versions in sync, but you should manually update @IDX_VERSION_SUFFIX@ -->
    <!-- STAR version to be used; it is also the version pinned for the "star"
    package in the "requirements" macro of this file -->
    <token name="@TOOL_VERSION@">2.7.11a</token>
    <!-- NOTE(review): not referenced in this file; presumably appended to
    @TOOL_VERSION@ in the version attribute of the importing tools - confirm there -->
    <token name="@VERSION_SUFFIX@">0</token>
    <!-- NOTE(review): not referenced in this file; presumably the Galaxy profile
    declared by the importing tools - confirm there -->
    <token name="@PROFILE@">21.01</token>
    <!-- STAR index version compatible with this version of STAR
    This is the STAR version that introduced the index structure expected
    by the current version.
    It can be found for any specific version of STAR with:
    STAR -h | grep versionGenome
    or by looking for the versionGenome parameter in source/parametersDefault
    of STAR's source code.
    The "index_selection" macro keeps only data table rows whose column 5
    equals this value. -->
    <token name="@IDX_VERSION@">2.7.4a</token>
    <!-- Has to be bumped by hand whenever @TOOL_VERSION@ changes (see the reminder
    at the top of this file). NOTE(review): not referenced in this file;
    presumably consumed by the data manager - confirm there -->
    <token name="@IDX_VERSION_SUFFIX@">2</token>
    <!-- Name of the Galaxy data table that lists the cached STAR indexes; it is
    read by the "index_selection" and "dbKeyAction" macros -->
    <token name="@IDX_DATA_TABLE@">rnastar_index2x_versioned</token>

    <!-- Software dependencies shared by every tool importing these macros:
    STAR at the version given by the tool version token, plus pinned samtools
    and gzip. The yield placeholder lets a caller append further requirements. -->
    <xml name="requirements">
        <requirements>
            <requirement version="@TOOL_VERSION@" type="package">star</requirement>
            <requirement version="1.18" type="package">samtools</requirement>
            <requirement version="1.13" type="package">gzip</requirement>
            <yield/>
        </requirements>
    </xml>

    <!-- EDAM ontology annotations attached to the tools: two topics and one
    operation. NOTE(review): the identifiers presumably stand for RNA-Seq
    (topic_3170), Transcriptomics (topic_3308) and Sequence alignment
    (operation_0292) - confirm against the EDAM ontology. -->
    <xml name="edam">
        <edam_topics>
            <edam_topic>topic_3170</edam_topic>
            <edam_topic>topic_3308</edam_topic>
        </edam_topics>
        <edam_operations>
            <edam_operation>operation_0292</edam_operation>
        </edam_operations>
    </xml>

    <!-- Select box for a cached STAR index taken from the index data table.
    The macro takes one token, with_gene_model (default "0"), which is matched
    against column 4 of the table; column 5 has to equal the index version
    token, and the remaining rows are sorted by column 2. An empty option list
    is reported to the user through the no_options validator. -->
    <xml name="index_selection" token_with_gene_model="0">
        <param argument="--genomeDir"
               type="select"
               label="Select reference genome"
               help="If your genome of interest is not listed, contact the Galaxy team">
            <options from_data_table="@IDX_DATA_TABLE@">
                <filter type="static_value" column="4" value="@WITH_GENE_MODEL@"/>
                <filter type="static_value" column="5" value="@IDX_VERSION@"/>
                <filter type="sort_by" column="2"/>
                <validator type="no_options"
                           message="No indexes are available for the selected input dataset"/>
            </options>
        </param>
    </xml>

    <!-- Command-line fragment telling STAR to read its input files through
    zcat. Within this file it is inserted by the reads-handling token when the
    barcode read dataset is of type fastq.gz or fastqsanger.gz. -->
    <token name="@FASTQ_GZ_OPTION@">
        --readFilesCommand zcat
    </token>
    <!-- Citation block shared by the tools: a single DOI reference. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1093/bioinformatics/bts635</citation>
        </citations>
    </xml>
    <!-- Parameters describing the gene model used to build the splice junction
    database: the mandatory GTF/GFF3 dataset, the feature type to take junctions
    from (default "exon") and the junction overhang (default 100, at least 1). -->
    <xml name="SJDBOPTIONS">
        <param argument="--sjdbGTFfile"
               type="data"
               format="gff3,gtf"
               optional="false"
               label="Gene model (gff3,gtf) file for splice junctions"
               help="Exon junction information for mapping splices"/>
        <param argument="--sjdbGTFfeatureExon"
               type="text"
               value="exon"
               label="Elements to use from the gene model to use for splice junctions"
               help="By default and for almost all cases: 'exon', referring to finding junctions at the RNA splice sites. This can optionally be changed to allow splicing at other levels, such as 'gene', 'transcript', 'CDS'."/>
        <param argument="--sjdbOverhang"
               type="integer"
               min="1"
               value="100"
               label="Length of the genomic sequence around annotated junctions"
               help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>
    </xml>
    <!-- Complete actions element for an output that only needs its dbkey set;
    it wraps the "dbKeyAction" macro. -->
    <xml name="dbKeyActions">
        <actions>
            <expand macro="dbKeyAction"/>
        </actions>
    </xml>
    <!-- Sets the dbkey metadata of an output depending on where the reference
    came from. For a cached index ("indexed") the dbkey is read from column 1 of
    the index data table, using the row whose column 0 matches the selected
    genomeDir value and dropping rows whose column 0 starts with "#". For a
    history reference ("history") the dbkey is copied from the FASTA dataset. -->
    <xml name="dbKeyAction">
        <conditional name="refGenomeSource.geneSource">
            <when value="indexed">
                <action name="dbkey" type="metadata">
                    <option name="@IDX_DATA_TABLE@"
                            type="from_data_table"
                            column="1"
                            offset="0">
                        <filter type="param_value"
                                column="0"
                                value="#"
                                compare="startswith"
                                keep="False"/>
                        <filter type="param_value"
                                column="0"
                                ref="refGenomeSource.GTFconditional.genomeDir"/>
                    </option>
                </action>
            </when>
            <when value="history">
                <action name="dbkey" type="metadata">
                    <option name="refGenomeSource.genomeFastaFiles"
                            type="from_param"
                            param_attribute="dbkey"/>
                </action>
            </when>
        </conditional>
    </xml>
    <!-- Cheetah snippet that builds a throw-away STAR index when the reference
    comes from the history (refGenomeSource.geneSource == 'history'); for any
    other source it renders nothing.
    Steps, in order:
      1. expose the FASTA as refgenome.fa: symlink it when the dataset extension
         is exactly "fasta", otherwise decompress it with gunzip
      2. create the directory tempstargenomedir
      3. run STAR in genomeGenerate mode into that directory
    Splice junction options: when refGenomeSource has a GTFconditional entry,
    the sjdb options are only added for GTFselect == 'with-gtf'; without that
    entry (STARsolo, per the inline note) they are always added, read directly
    from refGenomeSource. A gff3 gene model additionally sets the exon parent
    transcript tag to "Parent".
    genomeSAindexNbases is passed when its string value is non-empty, and the
    diploid VCF options only when a diploidconditional entry exists and its
    diploid value is 'Yes'.
    Threads come from GALAXY_SLOTS (default 4); the RAM limit is GALAXY_MEMORY_MB
    (default 31000) multiplied by 1000000 to obtain bytes.
    The rendered text ends with a shell AND operator, so another command has to
    follow it. -->
    <token name="@TEMPINDEX@"><![CDATA[
    ## Create temporary index for custom reference
    #if str($refGenomeSource.geneSource) == 'history':
        #if $refGenomeSource.genomeFastaFiles.ext == "fasta"
            ln -s '$refGenomeSource.genomeFastaFiles' refgenome.fa &&
        #else
            gunzip -c '$refGenomeSource.genomeFastaFiles' > refgenome.fa &&
        #end if
        mkdir -p tempstargenomedir &&
        STAR
            --runMode genomeGenerate
            --genomeDir 'tempstargenomedir'
            --genomeFastaFiles refgenome.fa
            ## Handle difference between indices with/without annotations
            #if 'GTFconditional' in $refGenomeSource:
                ## GTFconditional exists only in STAR, but not STARsolo
                #if str($refGenomeSource.GTFconditional.GTFselect) == 'with-gtf':
                    --sjdbOverhang '${refGenomeSource.GTFconditional.sjdbOverhang}'
                    --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
                    --sjdbGTFfeatureExon '${refGenomeSource.GTFconditional.sjdbGTFfeatureExon}'
                    #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
                        --sjdbGTFtagExonParentTranscript Parent
                    #end if
                #end if
            #else:
                ## ref genome selection is less complex for STARsolo because
                ## with-gtf is mandatory there
                --sjdbOverhang '${refGenomeSource.sjdbOverhang}'
                --sjdbGTFfile '${refGenomeSource.sjdbGTFfile}'
                --sjdbGTFfeatureExon '${refGenomeSource.sjdbGTFfeatureExon}'
                #if str($refGenomeSource.sjdbGTFfile.ext) == 'gff3':
                    --sjdbGTFtagExonParentTranscript Parent
                #end if
            #end if
            #if str($refGenomeSource.genomeSAindexNbases):
                --genomeSAindexNbases ${refGenomeSource.genomeSAindexNbases}
            #end if
            ## Diploid mode
            #if 'diploidconditional' in $refGenomeSource:
                #if str($refGenomeSource.diploidconditional.diploid) == 'Yes':
                    --genomeTransformVCF '${refGenomeSource.diploidconditional.genomeTransformVCF}'
                    --genomeTransformType Diploid
                #end if
            #end if
            --runThreadN \${GALAXY_SLOTS:-4}
            ## in bytes
            --limitGenomeGenerateRAM \$((\${GALAXY_MEMORY_MB:-31000} * 1000000))
        &&
    #end if
    ]]></token>
    <!-- Cheetah snippet with the reference-related options of the mapping run.
    It always emits the thread count (GALAXY_SLOTS, default 4), disables shared
    genome memory (NoSharedMemory) and then the genomeDir option, whose value is
    either the temporary directory tempstargenomedir (history reference) or the
    "path" field of the selected cached index.
    For a cached index selected with GTFselect == 'without-gtf-with-gtf', the
    user-supplied gene model is added on the fly through the sjdb options, with
    the exon parent transcript tag set to "Parent" for gff3 input.
    NOTE(review): the cached-index branch reads refGenomeSource.GTFconditional
    without the existence check used in the temporary-index token; confirm that
    every tool expanding this token defines that conditional. -->
    <token name="@REFGENOMEHANDLING@" ><![CDATA[
    --runThreadN \${GALAXY_SLOTS:-4}
    --genomeLoad NoSharedMemory
    --genomeDir
    #if str($refGenomeSource.geneSource) == 'history':
        tempstargenomedir
    #else:
        '${refGenomeSource.GTFconditional.genomeDir.fields.path}'
        ## Handle difference between indices with/without annotations
        #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf-with-gtf':
            --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang
            --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
            --sjdbGTFfeatureExon '${refGenomeSource.GTFconditional.sjdbGTFfeatureExon}'
            #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
                --sjdbGTFtagExonParentTranscript Parent
            #end if
        #end if
    #end if
    ]]></token>
    <!-- Cheetah snippet that assembles the read inputs for STARsolo from
    sc.input_types.
    "repeat" mode: input1 (barcode reads) and input2 (cDNA reads) are walked in
    pairs; each pair must have equal datatypes (enforced with an assert) and the
    paths are joined with commas into reads1 and reads2.
    "list_paired" mode: reads1 / reads2 are the forward / reverse elements of
    the paired collection.
    The cDNA reads (reads2) are written first on the command line, followed by
    the barcode reads (reads1), and soloCBmatchWLtype is taken from the sc
    section.
    The zcat option token is appended when r1 is of type fastq.gz or
    fastqsanger.gz. In "repeat" mode r1 is the loop variable left over from the
    last pair, so only that dataset is inspected.
    NOTE(review): zip() stops at the shorter list, so a length mismatch between
    input1 and input2 is not detected here, and r1 would be unset if no pair
    were iterated - confirm the inputs are validated elsewhere. -->
    <token name="@READSHANDLING@" ><![CDATA[
    ## Check that the input pairs are of the same type
    ## otherwise STARsolo will run for a long time and then error out.
    ## We consume either repeats of two inputs R1 + R2
    ## or a collection of paired reads.
    #if str($sc.input_types.use) == "repeat":
        #set $reads1 = []
        #set $reads2 = []
        #for $r1, $r2 in zip($sc.input_types.input1, $sc.input_types.input2):
            #assert $r1.datatype == $r2.datatype
            #silent $reads1.append(str($r1))
            #silent $reads2.append(str($r2))
        #end for
        #set $reads1 = ','.join($reads1)
        #set $reads2 = ','.join($reads2)
    #elif str($sc.input_types.use) == "list_paired":
        #set $r1 = $sc.input_types.input_collection.forward
        #set $r2 = $sc.input_types.input_collection.reverse
        #set $reads1 = $r1
        #set $reads2 = $r2
    #end if
    ## cDNA sequence(s) [R2] always go first, then barcode(s) [R1]
    ## see: Section 3.2 of STAR manual for multiple inputs, and Section 13 for STARsolo inputs
    --readFilesIn $reads2 $reads1
    --soloCBmatchWLtype $sc.soloCBmatchWLtype
    #if $r1.is_of_type('fastq.gz', 'fastqsanger.gz'):
        @FASTQ_GZ_OPTION@
    #end if
    ]]></token>
    <!-- Cheetah snippet emitting the three junction limit options. Each value
    is looked up with getVar under algo.params.junction_limits first, then under
    solo.junction_limits, and finally falls back to a literal default
    (1000, 1000000 and 1000000 respectively). -->
    <token name="@LIMITS@" ><![CDATA[
        --limitOutSJoneRead $getVar('algo.params.junction_limits.limitOutSJoneRead', $getVar('solo.junction_limits.limitOutSJoneRead', 1000))
        --limitOutSJcollapsed $getVar('algo.params.junction_limits.limitOutSJcollapsed', $getVar('solo.junction_limits.limitOutSJcollapsed', 1000000))
        --limitSjdbInsertNsj $getVar('algo.params.junction_limits.limitSjdbInsertNsj', $getVar('solo.junction_limits.limitSjdbInsertNsj', 1000000))
    ]]></token>
    <!-- Inputs for a reference taken from the history: the FASTA dataset
    (plain or gzipped) and the suffix array pre-indexing length, an integer
    between 2 and 16 that defaults to 14. -->
    <xml name="ref_selection">
        <param argument="--genomeFastaFiles"
               type="data"
               format="fasta,fasta.gz"
               label="Select a reference genome"/>
        <param argument="--genomeSAindexNbases"
               type="integer"
               min="2"
               max="16"
               value="14"
               label="Length of the SA pre-indexing string"
               help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/>
    </xml>
    <!-- Error detection rules: each pattern is searched in both stdout and
    stderr and marks the job as failed (level "fatal") when it matches. The
    yield placeholder lets a caller add further rules. -->
    <xml name="stdio">
        <stdio>
            <regex level="fatal" source="both" match="FATAL error"/>
            <regex level="fatal" source="both" match="EXITING: FATAL INPUT ERROR:"/>
            <regex level="fatal" source="both" match="EXITING: fatal error trying to allocate genome arrays, exception thrown: std::bad_alloc"/>
            <regex level="fatal" source="both" match="\[sam_read1\] missing header\? Abort!"/>
            <yield/>
        </stdio>
    </xml>
    <!-- Read input chooser for barcode + cDNA data. "repeat" offers two
    multi-dataset inputs (input1 = barcode reads, input2 = cDNA reads);
    "list_paired" takes a single paired collection instead. -->
    <xml name="input_selection">
        <conditional name="input_types">
            <param name="use" type="select" label="Input Type">
                <option value="repeat">Separate barcode and cDNA reads</option>
                <option value="list_paired">Paired collection of barcode and cDNA reads</option>
            </param>
            <when value="repeat">
                <param name="input1"
                       type="data"
                       format="fastq,fasta,fastq.gz,fastqsanger.gz"
                       multiple="true"
                       label="RNA-Seq FASTQ/FASTA file, Barcode reads"/>
                <param name="input2"
                       type="data"
                       format="fastq,fasta,fastq.gz,fastqsanger.gz"
                       multiple="true"
                       label="RNA-Seq FASTQ/FASTA file, cDNA reads"/>
            </when>
            <when value="list_paired">
                <param name="input_collection"
                       type="data_collection"
                       collection_type="paired"
                       format="fastq,fasta,fastq.gz,fastqsanger.gz"
                       label="Collection of Pairs"/>
            </when>
        </conditional>
    </xml>
    <!-- Read input chooser for the Smart-seq style inputs: either a flat list
    collection of single-end files or a list of pairs. -->
    <xml name="input_selection_smart_seq">
        <conditional name="input_types_smart_seq">
            <param name="use" type="select" label="Input Type">
                <option value="list_single_end">Single-end FASTQ collection</option>
                <option value="list_paired_end">Paired FASTQ collection</option>
            </param>
            <when value="list_single_end">
                <param name="single_end_collection"
                       type="data_collection"
                       collection_type="list"
                       format="fastq,fasta,fastq.gz,fastqsanger.gz"
                       label="List of single-end FASTQ files"/>
            </when>
            <when value="list_paired_end">
                <param name="paired_end_collection"
                       type="data_collection"
                       collection_type="list:paired"
                       format="fastq,fasta,fastq.gz,fastqsanger.gz"
                       label="List of paired-end FASTQ files"/>
            </when>
        </conditional>
    </xml>
    <!-- Option list for a UMI deduplication select; 1MM_All is preselected. -->
    <xml name="umidedup_options">
        <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other (1MM_All)</option>
        <option value="1MM_Directional_UMItools">Directional method from the UMI-tool</option>
        <option value="1MM_Directional">Directional with stringent UMI deduplication</option>
    </xml>
    <!-- Option list for choosing an anchor position: numeric codes 0 to 3 for
    read start, read end, adapter start and adapter end. -->
    <xml name="anchor_types">
        <option value="0">Read start</option>
        <option value="1">Read end</option>
        <option value="2">Adapter start</option>
        <option value="3">Adapter end</option>
    </xml>
    <!-- Cell barcode whitelist matching modes offered everywhere: exact match
    or a single match with one mismatch. -->
    <xml name="cb_match_wl_common">
        <option value="Exact">Exact</option>
        <option value="1MM">Single match (1MM)</option>
    </xml>
    <!-- Additional whitelist matching modes labelled after CellRanger
    behaviour; 1MM_multi is preselected. -->
    <xml name="cb_match_wl_cellranger">
        <option value="1MM_multi" selected="true">Multiple matches (CellRanger 2, 1MM_multi)</option>
        <option value="1MM_multi_pseudocounts">Multiple matches (CellRanger 3, 1MM_multi_pseudocounts)</option>
        <option value="1MM_multi_Nbase_pseudocounts">Multimatching to WL is allowed for CBs with N-bases (CellRanger 3, 1MM_multi_Nbase_pseudocounts)</option>
    </xml>
    <!-- Adapter-related parameters for barcode anchoring:
      - soloAdapterSequence: free text, default "-"; the sanitizer only lets
        digits, "-" and the letters A, T, C, G, N through
      - soloAdapterMismatchesNmax: integer, at least 1, default 1
      - clipAdapterType: clipping strategy, Hamming preselected
    The clipping select previously carried no label, unlike every other
    parameter in this file, so the form had no human-readable title for it;
    a label has been added. Names, values and defaults are unchanged. -->
    <xml name="solo_adapter_params">
        <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." >
            <sanitizer>
                <valid initial="string.digits">
                    <add value="-"/>
                    <add value="A"/>
                    <add value="T"/>
                    <add value="C"/>
                    <add value="G"/>
                    <add value="N"/>
                </valid>
            </sanitizer>
        </param>
        <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" />
        <param argument="--clipAdapterType" type="select" label="Adapter clipping type">
            <option value="Hamming" selected="true" >Adapter clipping based on Hamming distance</option>
            <option value="CellRanger4" >5p and 3p adapter clipping similar to CellRanger4</option>
            <option value="None" >No adapter clipping</option>
        </param>
    </xml>
    <!-- Option list of SAM attributes shared by the tools. NH, HI, AS and nM
    are preselected; NM, MD, jM and jI are available but off by default. -->
    <xml name="common_SAM_attributes">
        <option value="NH" selected="true">NH (number of reported alignments/hits for the read)</option>
        <option value="HI" selected="true">HI (query hit index)</option>
        <option value="AS" selected="true">AS (local alignment score)</option>
        <option value="nM" selected="true">nM (number of mismatches per (paired) alignment)</option>
        <option value="NM">NM (edit distance of the aligned read to the reference)</option>
        <option value="MD">MD (string for mismatching positions)</option>
        <option value="jM">jM (intron motifs for all junctions)</option>
        <option value="jI">jI (1-based start and end of introns for all junctions)</option>
    </xml>
    <!-- Collapsed form section "junction_limits" holding the three junction
    limit parameters. Their defaults (1000, 1000000, 1000000) are the same
    values the limits command token falls back to. -->
    <xml name="limits">
        <section name="junction_limits" title="Junction Limits" expanded="false">
            <param argument="--limitOutSJoneRead"
                   type="integer"
                   min="1"
                   value="1000"
                   label="Maximum number of junctions for one read (including all multimappers)"/>
            <param argument="--limitOutSJcollapsed"
                   type="integer"
                   min="1"
                   value="1000000"
                   label="Maximum number of collapsed junctions"/>
            <param argument="--limitSjdbInsertNsj"
                   type="integer"
                   min="0"
                   value="1000000"
                   label="Maximum number of inserts to be inserted into the genome on the fly."/>
        </section>
    </xml>
    <!-- Actions for the gene count output: names its four columns and sets
    the dbkey through the "dbKeyAction" macro. -->
    <xml name="outCountActions">
        <actions>
            <action name="column_names"
                    type="metadata"
                    default="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand"/>
            <expand macro="dbKeyAction"/>
        </actions>
    </xml>
    <!-- Coverage track chooser. "None" disables coverage and defines a hidden
    outWigStrand parameter so that output filters can still read it; the
    bedGraph and wiggle branches both expose the "outWigParams" parameters. -->
    <xml name="outWig">
        <conditional name="outWig">
            <param name="outWigType" type="select" label="Compute coverage">
                <option value="None">No coverage</option>
                <option value="bedGraph">Yes in bedgraph format</option>
                <option value="wiggle">Yes in wiggle format</option>
            </param>
            <when value="None">
                <!-- This is necessary for the filtering of output -->
                <param name="outWigStrand" type="hidden" value="false"/>
            </when>
            <when value="bedGraph">
                <expand macro="outWigParams" />
            </when>
            <when value="wiggle">
                <expand macro="outWigParams" />
            </when>
        </conditional>
    </xml>
    <!-- Parameters shown when coverage output is enabled: which signal to
    count, stranded vs unstranded output (stranded by default), a reference
    name prefix (default "-") and RPM normalisation (on by default). -->
    <xml name="outWigParams">
        <param name="outWigTypeSecondWord" type="select" label="Input for coverage">
            <option value="">Default (everything that mapped)</option>
            <option value="read_5p">signal from only 5’ of the 1st read</option>
            <option value="read2">signal from only 2nd read</option>
        </param>
        <param argument="--outWigStrand"
               type="boolean"
               truevalue="Stranded"
               falsevalue="Unstranded"
               checked="true"
               label="Generate a coverage for each strand (stranded coverage)"/>
        <param argument="--outWigReferencesPrefix"
               type="text"
               value="-"
               label="prefix matching reference name"
               help="For example, set 'chr' if you mapped on an ensembl genome but you want to display on UCSC"/>
        <param argument="--outWigNorm"
               type="boolean"
               truevalue="RPM"
               falsevalue="None"
               checked="true"
               label="Normalize coverage to million of mapped reads (RPM)"/>
    </xml>
    <!-- Cheetah snippet adding the coverage options to the STAR command.
    Nothing is rendered when outWig.outWigType is 'None'; otherwise the type is
    passed together with the second word (which may be an empty string),
    followed by the strand, reference prefix and normalisation settings, each
    single-quoted. -->
    <token name="@OUTWIG@"><![CDATA[
        #if str($outWig.outWigType) != 'None':
            --outWigType '$outWig.outWigType' '$outWig.outWigTypeSecondWord'
            --outWigStrand '$outWig.outWigStrand'
            --outWigReferencesPrefix '$outWig.outWigReferencesPrefix'
            --outWigNorm '$outWig.outWigNorm'
        #end if
    ]]></token>
    <!-- Cheetah snippet appended after the STAR command that renames the
    coverage files to extension-less names (Signal.*.out), which are the paths
    the coverage outputs collect via from_work_dir. The .bg files are renamed
    for bedGraph and the .wig files for wiggle; the strand 2 files are only
    renamed when outWigStrand is "Stranded". Every rendered line starts with a
    shell AND operator, so the snippet has to follow another command. -->
    <token name="@OUTWIGOUTPUTS@"><![CDATA[
        #if str($outWig.outWigType) == "bedGraph":
            && mv Signal.Unique.str1.out.bg Signal.Unique.str1.out
            && mv Signal.UniqueMultiple.str1.out.bg Signal.UniqueMultiple.str1.out
            #if str($outWig.outWigStrand) == "Stranded":
                && mv Signal.Unique.str2.out.bg Signal.Unique.str2.out
                && mv Signal.UniqueMultiple.str2.out.bg Signal.UniqueMultiple.str2.out
            #end if
        #elif str($outWig.outWigType) == "wiggle":
            && mv Signal.Unique.str1.out.wig Signal.Unique.str1.out
            && mv Signal.UniqueMultiple.str1.out.wig Signal.UniqueMultiple.str1.out
            #if str($outWig.outWigStrand) == "Stranded":
                && mv Signal.Unique.str2.out.wig Signal.Unique.str2.out
                && mv Signal.UniqueMultiple.str2.out.wig Signal.UniqueMultiple.str2.out
            #end if
        #end if
    ]]></token>
    <!-- The four coverage outputs (unique / unique+multiple, strand 1 / 2).
    All are declared as bedgraph and switched to wig when the wiggle type is
    selected. Strand 1 outputs exist whenever coverage is enabled; strand 2
    outputs additionally require outWigStrand to be truthy. Each output gets
    its dbkey through the "dbKeyActions" macro. -->
    <xml name="outWigOutputs">
        <data name="signal_unique_str1"
              format="bedgraph"
              label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 1"
              from_work_dir="Signal.Unique.str1.out">
            <filter>outWig['outWigType'] != "None"</filter>
            <expand macro="dbKeyActions"/>
            <change_format>
                <when input="outWig.outWigType" value="wiggle" format="wig"/>
            </change_format>
        </data>
        <data name="signal_uniquemultiple_str1"
              format="bedgraph"
              label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 1"
              from_work_dir="Signal.UniqueMultiple.str1.out">
            <filter>outWig['outWigType'] != "None"</filter>
            <expand macro="dbKeyActions"/>
            <change_format>
                <when input="outWig.outWigType" value="wiggle" format="wig"/>
            </change_format>
        </data>
        <data name="signal_unique_str2"
              format="bedgraph"
              label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 2"
              from_work_dir="Signal.Unique.str2.out">
            <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter>
            <expand macro="dbKeyActions"/>
            <change_format>
                <when input="outWig.outWigType" value="wiggle" format="wig"/>
            </change_format>
        </data>
        <data name="signal_uniquemultiple_str2"
              format="bedgraph"
              label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 2"
              from_work_dir="Signal.UniqueMultiple.str2.out">
            <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter>
            <expand macro="dbKeyActions"/>
            <change_format>
                <when input="outWig.outWigType" value="wiggle" format="wig"/>
            </change_format>
        </data>
    </xml>
    <!-- Quantification mode chooser for runs that have a gene model. The two
    modes that include TranscriptomeSAM expose an extra boolean controlling
    whether alignments with indels or soft clipping are banned from the
    transcriptome BAM (IndelSoftclipSingleend when checked, Singleend
    otherwise). -->
    <xml name="quantMode">
        <conditional name="quantmode_output">
            <param argument="--quantMode"
                   type="select"
                   label="Per gene/transcript output"
                   help="STAR can provide analysis results not only with respect to the reference genome, but also with respect to genes and transcripts described by a gene model. Note: This functionality requires either the selection above of a cached index with a gene model, or a gene model provided alongside the index/reference genome in GTF or GFF3 format!">
                <option value="-">No per gene or transcript output</option>
                <option value="GeneCounts">Per gene read counts (GeneCounts)</option>
                <option value="TranscriptomeSAM">Transcript-based BAM output (TranscriptomeSAM)</option>
                <option value="TranscriptomeSAM GeneCounts">Both per gene read counts and transcript-based BAM output (TranscriptomeSAM GeneCounts)</option>
            </param>
            <when value="-"/>
            <when value="GeneCounts"/>
            <when value="TranscriptomeSAM">
                <param argument="--quantTranscriptomeBan"
                       type="boolean"
                       truevalue="IndelSoftclipSingleend"
                       falsevalue="Singleend"
                       label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?"
                       help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled."/>
            </when>
            <when value="TranscriptomeSAM GeneCounts">
                <param argument="--quantTranscriptomeBan"
                       type="boolean"
                       truevalue="IndelSoftclipSingleend"
                       falsevalue="Singleend"
                       label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?"
                       help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled."/>
            </when>
        </conditional>
    </xml>
    <!-- Counterpart of the "quantMode" macro for runs without a gene model:
    same conditional and parameter names, but "-" (no output) is the only
    choice. -->
    <xml name="quantModeNoGTF">
        <conditional name="quantmode_output">
            <param argument="--quantMode"
                   type="select"
                   label="Per gene/transcript output">
                <option value="-">No per gene or transcript output as no GTF was provided</option>
            </param>
            <when value="-"/>
        </conditional>
    </xml>
    <xml name="outSAMmapqUnique">
        <!-- MAPQ assigned to uniquely mapped reads; this wrapper defaults to 60 and accepts 0 to 255.
        MAPQ 255 is the default in STAR (inherited from TopHat behaviour, for compatibility with Cufflinks) but it is a problematic value:
        - according to the SAM/BAM specs it means "undefined".
        - Using 255 as the max MAPQ causes problems with modern downstream tools like mutect2: https://sites.duke.edu/workblog/2021/08/18/star-rnaseq-gatk-mutect2/ and 60 has become an unofficial replacement for 255. -->
        <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255"
        label="MAPQ value for unique mappers"
        help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is
used: >=5 mappings => MAPQ=0; 3-4 mappings => MAPQ=1; 2 mappings => MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. Set to 255 for compatibility with Cufflink (default in STAR) but keep to 60 for modern downstream tools like mutect2." />
    </xml>
</macros>
<!-- author	iuc
date	Wed, 14 Feb 2024 09:03:31 +0000
parents	3e94726bfa9d
children -->