Mercurial > repos > iuc > rna_starsolo
changeset 10:a6fba3d92531 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/rgrnastar commit d0c9fa48df667ffad1abd71164e6bb1d9cb16bd9"
author | iuc |
---|---|
date | Mon, 15 Mar 2021 13:46:45 +0000 |
parents | ec9cbd6b9a49 |
children | eec9494fdafa |
files | macros.xml rg_rnaStarSolo.xml test-data/rnastar_test_genomeSAindexNbases.bed test-data/rnastar_test_genomeSAindexNbases.log test-data/rnastar_test_genomeSAindexNbases_02.bed test-data/rnastar_test_genomeSAindexNbases_02.log test-data/rnastar_test_mapped_reads_genomeSAindexNbases.bam test-data/rnastar_test_mapped_reads_genomeSAindexNbases_02.bam |
diffstat | 8 files changed, 220 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Fri Jan 15 17:39:11 2021 +0000 +++ b/macros.xml Mon Mar 15 13:46:45 2021 +0000 @@ -5,7 +5,7 @@ the index versions in sync, but you should manually adjust the +galaxy version number. --> <!-- STAR version to be used --> - <token name="@VERSION@">2.7.7a</token> + <token name="@VERSION@">2.7.8a</token> <!-- STAR index version compatible with this version of STAR This is the STAR version that introduced the index structure expected by the current version. @@ -163,10 +163,7 @@ ]]></token> <xml name="ref_selection"> <param argument="--genomeFastaFiles" type="data" format="fasta" label="Select a reference genome" /> - <!-- Currently, this parameter is not exposed in the wrapper, - but used only in the tests to avoid excessive index sizes for - the tiny test genomes. --> - <param name="genomeSAindexNbases" type="hidden" value="" /> + <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/> </xml> <xml name="stdio" > <stdio> @@ -209,8 +206,9 @@ </conditional> </xml> <xml name="umidedup_options"> - <option value="1MM_All" selected="true">All</option> - <option value="1MM_Directional" >Directional</option> + <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other</option> + <option value="1MM_Directional_UMItools" >Directional method from the UMI-tool</option> + <option value="1MM_Directional" >Directional with stringent UMI deduplication</option> </xml> <xml name="anchor_types"> <option value="0">Read start</option> @@ -225,5 +223,26 @@ <xml name="cb_match_wl_cellranger"> <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2)</option> <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3)</option> + <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3)</option> + </xml> + <xml name="solo_adapter_params"> + <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." > + <sanitizer> + <valid initial="string.digits"> + <add value="-"/> + <add value="A"/> + <add value="T"/> + <add value="C"/> + <add value="G"/> + <add value="N"/> + </valid> + </sanitizer> + </param> + <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" /> + <param argument="--clipAdapterType" type="select" > + <option value="Hamming" selected="true" >Adapter clipping based on Hamming distance</option> + <option value="CellRanger4" >5p and 3p adapter clipping similar to CellRanger4</option> + <option value="None" >No adapter clipping</option> + </param> </xml> </macros>
--- a/rg_rnaStarSolo.xml Fri Jan 15 17:39:11 2021 +0000 +++ b/rg_rnaStarSolo.xml Mon Mar 15 13:46:45 2021 +0000 @@ -41,6 +41,17 @@ --soloCBlen $sc.params.soloCBlen --soloUMIstart $sc.params.soloUMIstart --soloUMIlen $sc.params.soloUMIlen + #if $sc.params.bccdna_mate.bc_location == "same_mate": + --soloBarcodeMate $sc.params.bccdna_mate.soloBarcodeMate + #if $sc.params.bccdna_mate.soloBarcodeMate == "1": + --clip5pNbases $sc.params.bccdna_mate.clip_n_bases 0 + #else if $sc.params.bccdna_mate.soloBarcodeMate == "2": + --clip3pNbases 0 $sc.params.bccdna_mate.clip_n_bases + #end if + #end if + --soloAdapterSequence '$sc.params.soloAdapterSequence' + --soloAdapterMismatchesNmax $sc.params.soloAdapterMismatchesNmax + --clipAdapterType $sc.params.clipAdapterType #end if #elif str($sc.solo_type) == "CB_UMI_Complex": @@ -58,8 +69,9 @@ --soloCBposition $cb_pos #set $umi_pos = '_'.join([str($sc.umi_start_anchor), str($sc.umi_start_anchor_pos), str($sc.umi_end_anchor), str($sc.umi_end_anchor_pos)]) --soloUMIposition $umi_pos - --soloAdapterSequence $sc.soloAdapterSequence + --soloAdapterSequence '$sc.soloAdapterSequence' --soloAdapterMismatchesNmax $sc.soloAdapterMismatchesNmax + --clipAdapterType $sc.clipAdapterType #elif str($sc.solo_type) == "SmartSeq": ## Create a manifest file with fastq files and their corresponding cell-ids @@ -87,6 +99,8 @@ #if str($solo.filter.filter_type) == "cellranger2": --soloCellFilter CellRanger2.2 $solo.filter.n_expected $solo.filter.max_perc $solo.filter.max_min_ratio + #else if str($solo.filter.filter_type) == "emptydrops": + --soloCellFilter EmptyDrops_CR $solo.filter.nExpectedCells $solo.filter.maxPercentile $solo.filter.maxMinRatio $solo.filter.indMin $solo.filter.indMax $solo.filter.umiMin $solo.filter.umiMinFracMedian $solo.filter.candMaxN $solo.filter.FDR $solo.filter.simN #else if str($solo.filter.filter_type) == "topcells": --soloCellFilter TopCells $solo.filter.n_cells #else if str($solo.filter.filter_type) == "no_filter": @@ -187,12 +201,28 @@ <param argument="--soloCBlen" type="integer" min="1" value="16" label="Cell Barcode Length" /> <param argument="--soloUMIstart" type="integer" min="1" value="17" label="UMI Start Base" /> <param argument="--soloUMIlen" type="integer" min="1" value="10" label="UMI Length" /> + <conditional name="bccdna_mate" > + <param name="bc_location" type="select" label="Barcode and cDNA on the same mate\?" > + <option value="other_mate" selected="true">BC and cDNA are on different mates of paired-end read</option> + <option value="same_mate">BC and cDNA are on the same mate of paired-end read</option> + </param> + <when value="other_mate" /> + <when value="same_mate" > + <param argument="--soloBarcodeMate" type="select" label="Barcode sequence is a part of"> + <option value="1" selected="true">mate 1</option> + <option value="2">mate 2</option> + </param> + <param name="clip_n_bases" type="integer" value="39" label="Number of bases to clip (=CB+UMI+adapter)"/> + </when> + </conditional> + <expand macro="solo_adapter_params" /> </when> </conditional> <param argument="--soloBarcodeReadLength" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Barcode Size is same size of the Read" help="Disable this if your R1 barcodes contain poly-T bases after the barcode sequence." /> <param argument="--soloUMIdedup" type="select" label="UMI deduplication (collapsing) algorithm" help="All has all UMIs with 1 mismatch distance to each other collapsed, Directional follows the 'directional' method given in UMI-tools, Exact collapses only exactly matching UMIs."> <expand macro="umidedup_options" /> <option value="Exact" >Exact</option> + <option value="1MM_CR" >CellRanger2-4 algorithm</option> </param> <param argument="--soloCBmatchWLtype" type="select" label="Matching the Cell Barcodes to the WhiteList" help="Exact: only exact matches allowed; 1MM: only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match; 1MM_multi: multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches; 1MM_multi_pseudocounts: same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes."> @@ -221,11 +251,11 @@ <expand macro="anchor_types" /> </param> <param name="umi_end_anchor_pos" type="integer" value="0" label="0-based position of the UMI end with respect to the anchor base" /> - <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." /> - <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" /> + <expand macro="solo_adapter_params" /> <param argument="--soloUMIdedup" type="select" label="UMI deduplication (collapsing) algorithm" help="All has all UMIs with 1 mismatch distance to each other collapsed, Directional follows the 'directional' method given in UMI-tools, Exact collapses only exactly matching UMIs."> <expand macro="umidedup_options" /> <option value="Exact" >Exact</option> + <option value="1MM_CR" >CellRanger2-4 algorithm</option> </param> <param argument="--soloCBmatchWLtype" type="select" label="Matching the Cell Barcodes to the WhiteList" help="Exact: only exact matches allowed; 1MM: only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match; 1MM_multi: multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches; 1MM_multi_pseudocounts: same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes."> @@ -234,7 +264,7 @@ </when> <when value="SmartSeq"> <expand macro="input_selection_smart_seq" /> - <param name="cell_ids" type="data" label="File containing cell IDs of the samples. One ID per line in order of samples in the above collection."/> + <param name="cell_ids" format="txt,tsv" type="data" label="File containing cell IDs of the samples. One ID per line in order of samples in the above collection."/> <param argument="--soloUMIdedup" type="select" label="UMI deduplication (collapsing) algorithm" help="All has all UMIs with 1 mismatch distance to each other collapsed, Directional follows the 'directional' method given in UMI-tools, Exact collapses only exactly matching UMIs."> <option value="Exact" >Exact</option> <option value="NoDedup">Do not deduplicate UMIs</option> @@ -254,11 +284,13 @@ </param> <param argument="--soloUMIfiltering" type="select" label="Type of UMI filtering" > <option value="-" selected="true">Remove UMIs with N and homopolymers (similar to CellRanger 2.2.0)</option> - <option value="MultiGeneUMI" >Remove lower-count UMIs that map to more than one gene ((introduced in CellRanger 3.x.x)</option> + <option value="MultiGeneUMI" >Remove lower-count UMIs that map to more than one gene</option> + <option value="MultiGeneUMI_CR" >Remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0</option> </param> <conditional name="filter" > <param name="filter_type" type="select" label="Cell filtering type and parameters" > <option value="cellranger2" selected="true" >Simple filtering of CellRanger v2</option> + <option value="emptydrops" >EmptyDrops filtering in CellRanger flavor</option> <option value="topcells" >Filter top N cells</option> <option value="no_filter" >Do not filter</option> </param> @@ -267,6 +299,18 @@ <param name="max_perc" type="float" min="0" max="1" value="0.99" label="Robust maximum percentile for UMI count" /> <param name="max_min_ratio" type="float" min="1" value="10" label="Maximum to minimum ratio for UMI count" /> </when> + <when value="emptydrops" > + <param name="nExpectedCells" type="integer" min="1" value="3000" label="Number of expected cells" /> + <param name="maxPercentile" type="float" min="0" max="1" value="0.99" label="Robust maximum percentile for UMI count" /> + <param name="maxMinRatio" type="float" min="1" value="10" label="Maximum to minimum ratio for UMI count" /> + <param name="indMin" type="integer" value="45000" label="Minimum number of barcodes (used as partition parameter for ambient estimation)" /> + <param name="indMax" type="integer" value="90000" label="Maximum number of barcodes (used as partition parameter for ambient estimation)" /> + <param name="umiMin" type="integer" value="500" label="Consider at least these many UMIs per barcode after initial cell calling" /> + <param name="umiMinFracMedian" type="float" value="0.01" label="Minimum UMI:median ratio after initial cell calling" /> + <param name="candMaxN" type="integer" value="20000" label="Number of extra barcodes after initial cell calling" /> + <param name="FDR" type="float" value="0.01" label="Maximum adjusted p-value for determining a barcode as non-ambient" /> + <param name="simN" type="integer" value="10000" label="Number of log likelihood simulations" /> + </when> <when value="topcells" > <param name="n_cells" type="integer" min="1" value="3000" label="Number of top cells to report sorted by UMI count" /> </when> @@ -420,7 +464,6 @@ <param name="soloUMIstart" value="17" /> <param name="soloUMIlen" value="12" /> </conditional> - <param name="soloUMIdedup" value="1MM_All" /> </conditional> <section name="solo" > <param name="soloStrand" value="Forward" /> @@ -577,6 +620,72 @@ </output> </test> <test expect_num_outputs="6"> + <!-- Emptydrops filtering --> + <conditional name="refGenomeSource"> + <param name="geneSource" value="history" /> + <param name="genomeFastaFiles" value="filtered3.Homo_sapiens.GRCh38.dna.chromosome.21.fa.gz" /> + <param name="genomeSAindexNbases" value="4" /> + <param name="sjdbOverhang" value="100" /> + <param name="sjdbGTFfile" value="filtered3.Homo_sapiens.GRCh38.100.chr21.gtf" ftype="gtf"/> + </conditional> + <conditional name="sc" > + <param name="solo_type" value="CB_UMI_Simple" /> + <conditional name="input_types"> + <param name="use" value="repeat" /> + <param name="input1" value="pbmc_1k_v2_L001.R1.10k.fastq.gz" ftype="fastqsanger.gz" /> + <param name="input2" value="pbmc_1k_v2_L001.R2.10k.fastq.gz" ftype="fastqsanger.gz" /> + </conditional> + <param name="soloCBwhitelist" value="filtered.barcodes.txt" /> + <conditional name="params"> + <param name="chemistry" value="CR3" /> + </conditional> + <param name="soloUMIdedup" value="1MM_All" /> + </conditional> + <section name="solo" > + <conditional name="filter"> + <param name="filter_type" value="emptydrops" /> + <param name="nExpectedCells" value="5" /> + <param name="maxPercentile" value="0.99" /> + <param name="maxMinRatio" value="10" /> + <param name="indMin" value="45000" /> + <param name="indMax" value="90000" /> + <param name="umiMin" value="500" /> + <param name="umiMinFracMedian" value="0.01" /> + <param name="candMaxN" value="20000" /> + <param name="FDR" value="0.01" /> + <param name="simN" value="10000" /> + </conditional> + <param name="soloStrand" value="Forward" /> + <param name="soloFeatures" value="Gene" /> + </section> + <output name="output_barcodes_filtered"> + <assert_contents> + <!-- first and last line --> + <has_line line="ACACCGGTCTAACGGT" /> + <has_line line="TTCTCAATCCACGTTC" /> + </assert_contents> + </output> + <output name="output_genes_filtered"> + <assert_contents> + <has_line_matching expression="ENSG00000279493\s+FP565260\.4\s+Gene\s+Expression" /> + <has_line_matching expression="ENSG00000279064\s+FP236315\.1\s+Gene\s+Expression" /> + </assert_contents> + </output> + <output name="output_matrix_filtered" > + <assert_contents> + <has_line_matching expression="14\s+7\s+7" /> + <has_line_matching expression="4\s+7\s+1" /> + </assert_contents> + </output> + <output name="output_stats" > + <assert_contents> + <has_line_matching expression="\s+nUnmapped\s+5823" /> + <has_line_matching expression="\s+nUMIs\s+8" /> + </assert_contents> + </output> + <output name="output_BAM" value="filtered3.bam" compare="sim_size" delta="600" /> + </test> + <test expect_num_outputs="6"> <!-- Test soloType CB_UMI_Complex --> <conditional name="refGenomeSource"> <param name="geneSource" value="history" /> @@ -612,6 +721,7 @@ <param name="umi_end_anchor_pos" value="14" /> <param name="soloAdapterSequence" value="GAGTGATTGCTTGTGACGCCTT" /> <param name="soloAdapterMismatchesNmax" value="1" /> + <param name="clipAdapterType" value="CellRanger4" /> <param name="soloUMIdedup" value="1MM_All" /> <param name="soloCBmatchWLtype" value="1MM" /> </conditional>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rnastar_test_genomeSAindexNbases.bed Mon Mar 15 13:46:45 2021 +0000 @@ -0,0 +1,2 @@ +test_chromosome 251 350 1 1 0 27 0 37 +test_chromosome 401 500 1 1 0 25 0 36
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rnastar_test_genomeSAindexNbases.log Mon Mar 15 13:46:45 2021 +0000 @@ -0,0 +1,37 @@ + Started job on | Mar 08 19:43:44 + Started mapping on | Mar 08 19:43:45 + Finished on | Mar 08 19:43:45 + Mapping speed, Million of reads per hour | inf + + Number of input reads | 100 + Average input read length | 75 + UNIQUE READS: + Uniquely mapped reads number | 99 + Uniquely mapped reads % | 99.00% + Average mapped length | 74.65 + Number of splices: Total | 52 + Number of splices: Annotated (sjdb) | 0 + Number of splices: GT/AG | 52 + Number of splices: GC/AG | 0 + Number of splices: AT/AC | 0 + Number of splices: Non-canonical | 0 + Mismatch rate per base, % | 2.00% + Deletion rate per base | 0.00% + Deletion average length | 0.00 + Insertion rate per base | 0.00% + Insertion average length | 0.00 + MULTI-MAPPING READS: + Number of reads mapped to multiple loci | 1 + % of reads mapped to multiple loci | 1.00% + Number of reads mapped to too many loci | 0 + % of reads mapped to too many loci | 0.00% + UNMAPPED READS: + Number of reads unmapped: too many mismatches | 0 + % of reads unmapped: too many mismatches | 0.00% + Number of reads unmapped: too short | 0 + % of reads unmapped: too short | 0.00% + Number of reads unmapped: other | 0 + % of reads unmapped: other | 0.00% + CHIMERIC READS: + Number of chimeric reads | 0 + % of chimeric reads | 0.00%
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rnastar_test_genomeSAindexNbases_02.bed Mon Mar 15 13:46:45 2021 +0000 @@ -0,0 +1,2 @@ +test_chromosome 251 350 1 1 0 27 0 37 +test_chromosome 401 500 1 1 0 25 0 36
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rnastar_test_genomeSAindexNbases_02.log Mon Mar 15 13:46:45 2021 +0000 @@ -0,0 +1,37 @@ + Started job on | Mar 08 19:43:59 + Started mapping on | Mar 08 19:43:59 + Finished on | Mar 08 19:43:59 + Mapping speed, Million of reads per hour | inf + + Number of input reads | 100 + Average input read length | 75 + UNIQUE READS: + Uniquely mapped reads number | 99 + Uniquely mapped reads % | 99.00% + Average mapped length | 74.65 + Number of splices: Total | 52 + Number of splices: Annotated (sjdb) | 0 + Number of splices: GT/AG | 52 + Number of splices: GC/AG | 0 + Number of splices: AT/AC | 0 + Number of splices: Non-canonical | 0 + Mismatch rate per base, % | 2.00% + Deletion rate per base | 0.00% + Deletion average length | 0.00 + Insertion rate per base | 0.00% + Insertion average length | 0.00 + MULTI-MAPPING READS: + Number of reads mapped to multiple loci | 1 + % of reads mapped to multiple loci | 1.00% + Number of reads mapped to too many loci | 0 + % of reads mapped to too many loci | 0.00% + UNMAPPED READS: + Number of reads unmapped: too many mismatches | 0 + % of reads unmapped: too many mismatches | 0.00% + Number of reads unmapped: too short | 0 + % of reads unmapped: too short | 0.00% + Number of reads unmapped: other | 0 + % of reads unmapped: other | 0.00% + CHIMERIC READS: + Number of chimeric reads | 0 + % of chimeric reads | 0.00%