Mercurial > repos > iuc > rna_starsolo

--- a/macros.xml	Fri Jan 15 17:39:11 2021 +0000
+++ b/macros.xml	Mon Mar 15 13:46:45 2021 +0000
@@ -5,7 +5,7 @@
     the index versions in sync, but you should manually adjust the +galaxy
     version number. -->
     <!-- STAR version to be used -->
-    <token name="@VERSION@">2.7.7a</token>
+    <token name="@VERSION@">2.7.8a</token>
     <!-- STAR index version compatible with this version of STAR
     This is the STAR version that introduced the index structure expected
     by the current version.
@@ -163,10 +163,7 @@
     ]]></token>
     <xml name="ref_selection">
         <param argument="--genomeFastaFiles" type="data" format="fasta" label="Select a reference genome" />
-        <!-- Currently, this parameter is not exposed in the wrapper,
-             but used only in the tests to avoid excessive index sizes for
-             the tiny test genomes. -->
-        <param name="genomeSAindexNbases" type="hidden" value="" />
+        <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)." />
     </xml>
     <xml name="stdio" >
         <stdio>
@@ -209,8 +206,9 @@
         </conditional>
     </xml>
     <xml name="umidedup_options">
-        <option value="1MM_All" selected="true">All</option>
-        <option value="1MM_Directional" >Directional</option>
+        <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other</option>
+        <option value="1MM_Directional_UMItools" >Directional method from UMI-tools</option>
+        <option value="1MM_Directional" >Directional with stringent UMI deduplication</option>
     </xml>
     <xml name="anchor_types">
         <option value="0">Read start</option>
@@ -225,5 +223,26 @@
     <xml name="cb_match_wl_cellranger">
         <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2)</option>
         <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3)</option>
+        <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3)</option>
+    </xml>
+    <xml name="solo_adapter_params">
+        <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." >
+            <sanitizer>
+                <valid initial="string.digits">
+                    <add value="-"/>
+                    <add value="A"/>
+                    <add value="T"/>
+                    <add value="C"/>
+                    <add value="G"/>
+                    <add value="N"/>
+                </valid>
+            </sanitizer>
+        </param>
+        <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" />
+        <param argument="--clipAdapterType" type="select" label="Adapter clipping type" >
+            <option value="Hamming" selected="true" >Adapter clipping based on Hamming distance</option>
+            <option value="CellRanger4" >5p and 3p adapter clipping similar to CellRanger4</option>
+            <option value="None" >No adapter clipping</option>
+        </param>
     </xml>
 </macros>
--- a/rg_rnaStarSolo.xml	Fri Jan 15 17:39:11 2021 +0000
+++ b/rg_rnaStarSolo.xml	Mon Mar 15 13:46:45 2021 +0000
@@ -41,6 +41,17 @@
     --soloCBlen $sc.params.soloCBlen
     --soloUMIstart $sc.params.soloUMIstart
     --soloUMIlen $sc.params.soloUMIlen
+        #if str($sc.params.bccdna_mate.bc_location) == "same_mate":
+        --soloBarcodeMate $sc.params.bccdna_mate.soloBarcodeMate
+            #if str($sc.params.bccdna_mate.soloBarcodeMate) == "1":
+            --clip5pNbases $sc.params.bccdna_mate.clip_n_bases 0
+            #else if str($sc.params.bccdna_mate.soloBarcodeMate) == "2":
+            --clip3pNbases 0 $sc.params.bccdna_mate.clip_n_bases
+            #end if
+        #end if
+    --soloAdapterSequence '$sc.params.soloAdapterSequence'
+    --soloAdapterMismatchesNmax $sc.params.soloAdapterMismatchesNmax
+    --clipAdapterType $sc.params.clipAdapterType
     #end if

     #elif str($sc.solo_type) == "CB_UMI_Complex":
@@ -58,8 +69,9 @@
     --soloCBposition $cb_pos
     #set $umi_pos = '_'.join([str($sc.umi_start_anchor), str($sc.umi_start_anchor_pos), str($sc.umi_end_anchor), str($sc.umi_end_anchor_pos)])
     --soloUMIposition $umi_pos
-    --soloAdapterSequence $sc.soloAdapterSequence
+    --soloAdapterSequence '$sc.soloAdapterSequence'
     --soloAdapterMismatchesNmax $sc.soloAdapterMismatchesNmax
+    --clipAdapterType $sc.clipAdapterType

     #elif str($sc.solo_type) == "SmartSeq":
     ## Create a manifest file with fastq files and their corresponding cell-ids
@@ -87,6 +99,8 @@

     #if str($solo.filter.filter_type) == "cellranger2":
     --soloCellFilter CellRanger2.2 $solo.filter.n_expected $solo.filter.max_perc $solo.filter.max_min_ratio
+    #else if str($solo.filter.filter_type) == "emptydrops":
+    --soloCellFilter EmptyDrops_CR $solo.filter.nExpectedCells $solo.filter.maxPercentile $solo.filter.maxMinRatio $solo.filter.indMin $solo.filter.indMax $solo.filter.umiMin $solo.filter.umiMinFracMedian $solo.filter.candMaxN $solo.filter.FDR $solo.filter.simN
     #else if str($solo.filter.filter_type) == "topcells":
     --soloCellFilter TopCells $solo.filter.n_cells
     #else if str($solo.filter.filter_type) == "no_filter":
@@ -187,12 +201,28 @@
                         <param argument="--soloCBlen" type="integer" min="1" value="16" label="Cell Barcode Length" />
                         <param argument="--soloUMIstart" type="integer" min="1" value="17" label="UMI Start Base" />
                         <param argument="--soloUMIlen" type="integer" min="1" value="10" label="UMI Length" />
+                        <conditional name="bccdna_mate" >
+                            <param name="bc_location" type="select" label="Barcode and cDNA on the same mate?" >
+                                <option value="other_mate" selected="true">BC and cDNA are on different mates of paired-end read</option>
+                                <option value="same_mate">BC and cDNA are on the same mate of paired-end read</option>
+                            </param>
+                            <when value="other_mate" />
+                            <when value="same_mate" >
+                                <param argument="--soloBarcodeMate" type="select" label="Barcode sequence is a part of">
+                                    <option value="1" selected="true">mate 1</option>
+                                    <option value="2">mate 2</option>
+                                </param>
+                                <param name="clip_n_bases" type="integer" min="0" value="39" label="Number of bases to clip (=CB+UMI+adapter)"/>
+                            </when>
+                        </conditional>
+                        <expand macro="solo_adapter_params" />
                     </when>
                 </conditional>
                 <param argument="--soloBarcodeReadLength" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Barcode Size is same size of the Read" help="Disable this if your R1 barcodes contain poly-T bases after the barcode sequence." />
                 <param argument="--soloUMIdedup" type="select" label="UMI deduplication (collapsing) algorithm" help="All has all UMIs with 1 mismatch distance to each other collapsed, Directional follows the 'directional' method given in UMI-tools, Exact collapses only exactly matching UMIs.">
                     <expand macro="umidedup_options" />
                     <option value="Exact" >Exact</option>
+                    <option value="1MM_CR" >CellRanger2-4 algorithm</option>
                 </param>
                 <param argument="--soloCBmatchWLtype" type="select" label="Matching the Cell Barcodes to the WhiteList" help="Exact: only exact matches allowed; 1MM: only one match in whitelist with 1 mismatched base allowed. Allowed
     CBs have to have at least one read with exact match; 1MM_multi: multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches; 1MM_multi_pseudocounts: same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes.">
@@ -221,11 +251,11 @@
                     <expand macro="anchor_types" />
                 </param>
                 <param name="umi_end_anchor_pos" type="integer" value="0" label="0-based position of the UMI end with respect to the anchor base" />
-                <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." />
-                <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" />
+                <expand macro="solo_adapter_params" />
                 <param argument="--soloUMIdedup" type="select" label="UMI deduplication (collapsing) algorithm" help="All has all UMIs with 1 mismatch distance to each other collapsed, Directional follows the 'directional' method given in UMI-tools, Exact collapses only exactly matching UMIs.">
                     <expand macro="umidedup_options" />
                     <option value="Exact" >Exact</option>
+                    <option value="1MM_CR" >CellRanger2-4 algorithm</option>
                 </param>
                 <param argument="--soloCBmatchWLtype" type="select" label="Matching the Cell Barcodes to the WhiteList" help="Exact: only exact matches allowed; 1MM: only one match in whitelist with 1 mismatched base allowed. Allowed
     CBs have to have at least one read with exact match; 1MM_multi: multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches; 1MM_multi_pseudocounts: same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes.">
@@ -234,7 +264,7 @@
             </when>
             <when value="SmartSeq">
                 <expand macro="input_selection_smart_seq" />
-                <param name="cell_ids" type="data" label="File containing cell IDs of the samples. One ID per line in order of samples in the above collection."/>
+                <param name="cell_ids" format="txt,tsv" type="data" label="File containing cell IDs of the samples. One ID per line in order of samples in the above collection."/>
                 <param argument="--soloUMIdedup" type="select" label="UMI deduplication (collapsing) algorithm" help="All has all UMIs with 1 mismatch distance to each other collapsed, Directional follows the 'directional' method given in UMI-tools, Exact collapses only exactly matching UMIs.">
                     <option value="Exact" >Exact</option>
                     <option value="NoDedup">Do not deduplicate UMIs</option>
@@ -254,11 +284,13 @@
             </param>
             <param argument="--soloUMIfiltering" type="select" label="Type of UMI filtering" >
                 <option value="-" selected="true">Remove UMIs with N and homopolymers (similar to CellRanger 2.2.0)</option>
-                <option value="MultiGeneUMI" >Remove lower-count UMIs that map to more than one gene ((introduced in CellRanger 3.x.x)</option>
+                <option value="MultiGeneUMI" >Remove lower-count UMIs that map to more than one gene</option>
+                <option value="MultiGeneUMI_CR" >Remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0</option>
             </param>
             <conditional name="filter" >
                 <param name="filter_type" type="select" label="Cell filtering type and parameters" >
                     <option value="cellranger2" selected="true" >Simple filtering of CellRanger v2</option>
+                    <option value="emptydrops" >EmptyDrops filtering in CellRanger flavor</option>
                     <option value="topcells" >Filter top N cells</option>
                     <option value="no_filter" >Do not filter</option>
                 </param>
@@ -267,6 +299,18 @@
                     <param name="max_perc" type="float" min="0" max="1" value="0.99" label="Robust maximum percentile for UMI count" />
                     <param name="max_min_ratio" type="float" min="1" value="10" label="Maximum to minimum ratio for UMI count" />
                 </when>
+                <when value="emptydrops" >
+                    <param name="nExpectedCells" type="integer" min="1" value="3000" label="Number of expected cells" />
+                    <param name="maxPercentile" type="float" min="0" max="1" value="0.99" label="Robust maximum percentile for UMI count" />
+                    <param name="maxMinRatio" type="float" min="1" value="10" label="Maximum to minimum ratio for UMI count" />
+                    <param name="indMin" type="integer" min="0" value="45000" label="Minimum number of barcodes (used as partition parameter for ambient estimation)" />
+                    <param name="indMax" type="integer" min="0" value="90000" label="Maximum number of barcodes (used as partition parameter for ambient estimation)" />
+                    <param name="umiMin" type="integer" min="0" value="500" label="Consider at least these many UMIs per barcode after initial cell calling" />
+                    <param name="umiMinFracMedian" type="float" min="0" value="0.01" label="Minimum UMI:median ratio after initial cell calling" />
+                    <param name="candMaxN" type="integer" min="0" value="20000" label="Number of extra barcodes after initial cell calling" />
+                    <param name="FDR" type="float" min="0" max="1" value="0.01" label="Maximum adjusted p-value for determining a barcode as non-ambient" />
+                    <param name="simN" type="integer" min="1" value="10000" label="Number of log likelihood simulations" />
+                </when>
                 <when value="topcells" >
                     <param name="n_cells" type="integer" min="1" value="3000" label="Number of top cells to report sorted by UMI count" />
                 </when>
@@ -420,7 +464,6 @@
                     <param name="soloUMIstart" value="17" />
                     <param name="soloUMIlen" value="12" />
                 </conditional>
-                <param name="soloUMIdedup" value="1MM_All" />
             </conditional>
             <section name="solo" >
                 <param name="soloStrand" value="Forward" />
@@ -577,6 +620,72 @@
             </output>
         </test>
         <test expect_num_outputs="6">
+            <!-- Test soloType CB_UMI_Simple with EmptyDrops_CR cell filtering -->
+            <conditional name="refGenomeSource">
+                <param name="geneSource" value="history" />
+                <param name="genomeFastaFiles" value="filtered3.Homo_sapiens.GRCh38.dna.chromosome.21.fa.gz" />
+                <param name="genomeSAindexNbases" value="4" />
+                <param name="sjdbOverhang" value="100" />
+                <param name="sjdbGTFfile" value="filtered3.Homo_sapiens.GRCh38.100.chr21.gtf" ftype="gtf"/>
+            </conditional>
+            <conditional name="sc" >
+                <param name="solo_type" value="CB_UMI_Simple" />
+                <conditional name="input_types">
+                    <param name="use" value="repeat" />
+                    <param name="input1" value="pbmc_1k_v2_L001.R1.10k.fastq.gz" ftype="fastqsanger.gz" />
+                    <param name="input2" value="pbmc_1k_v2_L001.R2.10k.fastq.gz" ftype="fastqsanger.gz" />
+                </conditional>
+                <param name="soloCBwhitelist" value="filtered.barcodes.txt" />
+                <conditional name="params">
+                    <param name="chemistry" value="CR3" />
+                </conditional>
+                <param name="soloUMIdedup" value="1MM_All" />
+            </conditional>
+            <section name="solo" >
+                <conditional name="filter">
+                    <param name="filter_type" value="emptydrops" />
+                    <param name="nExpectedCells" value="5" />
+                    <param name="maxPercentile" value="0.99" />
+                    <param name="maxMinRatio" value="10" />
+                    <param name="indMin" value="45000" />
+                    <param name="indMax" value="90000" />
+                    <param name="umiMin" value="500" />
+                    <param name="umiMinFracMedian" value="0.01" />
+                    <param name="candMaxN" value="20000" />
+                    <param name="FDR" value="0.01" />
+                    <param name="simN" value="10000" />
+                </conditional>
+                <param name="soloStrand" value="Forward" />
+                <param name="soloFeatures" value="Gene" />
+            </section>
+            <output name="output_barcodes_filtered">
+                <assert_contents>
+                    <!-- first and last lines of the filtered barcodes output -->
+                    <has_line line="ACACCGGTCTAACGGT" />
+                    <has_line line="TTCTCAATCCACGTTC" />
+                </assert_contents>
+            </output>
+            <output name="output_genes_filtered">
+                <assert_contents>
+                    <has_line_matching expression="ENSG00000279493\s+FP565260\.4\s+Gene\s+Expression" />
+                    <has_line_matching expression="ENSG00000279064\s+FP236315\.1\s+Gene\s+Expression" />
+                </assert_contents>
+            </output>
+            <output name="output_matrix_filtered" >
+                <assert_contents>
+                    <has_line_matching expression="14\s+7\s+7" />
+                    <has_line_matching expression="4\s+7\s+1" />
+                </assert_contents>
+            </output>
+            <output name="output_stats" >
+                <assert_contents>
+                    <has_line_matching expression="\s+nUnmapped\s+5823" />
+                    <has_line_matching expression="\s+nUMIs\s+8" />
+                </assert_contents>
+            </output>
+            <output name="output_BAM" value="filtered3.bam" compare="sim_size" delta="600" />
+        </test>
+        <test expect_num_outputs="6">
             <!-- Test soloType CB_UMI_Complex -->
             <conditional name="refGenomeSource">
                 <param name="geneSource" value="history" />
@@ -612,6 +721,7 @@
                 <param name="umi_end_anchor_pos" value="14" />
                 <param name="soloAdapterSequence" value="GAGTGATTGCTTGTGACGCCTT"  />
                 <param name="soloAdapterMismatchesNmax" value="1" />
+                <param name="clipAdapterType" value="CellRanger4" />
                 <param name="soloUMIdedup" value="1MM_All" />
                 <param name="soloCBmatchWLtype" value="1MM" />
             </conditional>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rnastar_test_genomeSAindexNbases.bed	Mon Mar 15 13:46:45 2021 +0000
@@ -0,0 +1,2 @@
+test_chromosome	251	350	1	1	0	27	0	37
+test_chromosome	401	500	1	1	0	25	0	36
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rnastar_test_genomeSAindexNbases.log	Mon Mar 15 13:46:45 2021 +0000
@@ -0,0 +1,37 @@
+                                 Started job on |	Mar 08 19:43:44
+                             Started mapping on |	Mar 08 19:43:45
+                                    Finished on |	Mar 08 19:43:45
+       Mapping speed, Million of reads per hour |	inf
+
+                          Number of input reads |	100
+                      Average input read length |	75
+                                    UNIQUE READS:
+                   Uniquely mapped reads number |	99
+                        Uniquely mapped reads % |	99.00%
+                          Average mapped length |	74.65
+                       Number of splices: Total |	52
+            Number of splices: Annotated (sjdb) |	0
+                       Number of splices: GT/AG |	52
+                       Number of splices: GC/AG |	0
+                       Number of splices: AT/AC |	0
+               Number of splices: Non-canonical |	0
+                      Mismatch rate per base, % |	2.00%
+                         Deletion rate per base |	0.00%
+                        Deletion average length |	0.00
+                        Insertion rate per base |	0.00%
+                       Insertion average length |	0.00
+                             MULTI-MAPPING READS:
+        Number of reads mapped to multiple loci |	1
+             % of reads mapped to multiple loci |	1.00%
+        Number of reads mapped to too many loci |	0
+             % of reads mapped to too many loci |	0.00%
+                                  UNMAPPED READS:
+  Number of reads unmapped: too many mismatches |	0
+       % of reads unmapped: too many mismatches |	0.00%
+            Number of reads unmapped: too short |	0
+                 % of reads unmapped: too short |	0.00%
+                Number of reads unmapped: other |	0
+                     % of reads unmapped: other |	0.00%
+                                  CHIMERIC READS:
+                       Number of chimeric reads |	0
+                            % of chimeric reads |	0.00%
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rnastar_test_genomeSAindexNbases_02.bed	Mon Mar 15 13:46:45 2021 +0000
@@ -0,0 +1,2 @@
+test_chromosome	251	350	1	1	0	27	0	37
+test_chromosome	401	500	1	1	0	25	0	36
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rnastar_test_genomeSAindexNbases_02.log	Mon Mar 15 13:46:45 2021 +0000
@@ -0,0 +1,37 @@
+                                 Started job on |	Mar 08 19:43:59
+                             Started mapping on |	Mar 08 19:43:59
+                                    Finished on |	Mar 08 19:43:59
+       Mapping speed, Million of reads per hour |	inf
+
+                          Number of input reads |	100
+                      Average input read length |	75
+                                    UNIQUE READS:
+                   Uniquely mapped reads number |	99
+                        Uniquely mapped reads % |	99.00%
+                          Average mapped length |	74.65
+                       Number of splices: Total |	52
+            Number of splices: Annotated (sjdb) |	0
+                       Number of splices: GT/AG |	52
+                       Number of splices: GC/AG |	0
+                       Number of splices: AT/AC |	0
+               Number of splices: Non-canonical |	0
+                      Mismatch rate per base, % |	2.00%
+                         Deletion rate per base |	0.00%
+                        Deletion average length |	0.00
+                        Insertion rate per base |	0.00%
+                       Insertion average length |	0.00
+                             MULTI-MAPPING READS:
+        Number of reads mapped to multiple loci |	1
+             % of reads mapped to multiple loci |	1.00%
+        Number of reads mapped to too many loci |	0
+             % of reads mapped to too many loci |	0.00%
+                                  UNMAPPED READS:
+  Number of reads unmapped: too many mismatches |	0
+       % of reads unmapped: too many mismatches |	0.00%
+            Number of reads unmapped: too short |	0
+                 % of reads unmapped: too short |	0.00%
+                Number of reads unmapped: other |	0
+                     % of reads unmapped: other |	0.00%
+                                  CHIMERIC READS:
+                       Number of chimeric reads |	0
+                            % of chimeric reads |	0.00%
Binary file test-data/rnastar_test_mapped_reads_genomeSAindexNbases.bam has changed
Binary file test-data/rnastar_test_mapped_reads_genomeSAindexNbases_02.bam has changed