Mercurial > repos > iuc > scpipe
changeset 2:5c4bca9dd4a2 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scpipe commit 60e2a9e9129a22924c55b11b218b39d913c7e686
author | iuc |
---|---|
date | Mon, 14 Jan 2019 08:06:47 -0500 |
parents | 4ec6717872b1 |
children | 7397e6badc11 |
files | scpipe.R scpipe.xml test-data/aligned.mapped.bam test-data/barcode_anno.tsv test-data/plots.pdf |
diffstat | 5 files changed, 555 insertions(+), 149 deletions(-) [+] |
line wrap: on
line diff
--- a/scpipe.R Fri Aug 17 08:22:49 2018 -0400 +++ b/scpipe.R Mon Jan 14 08:06:47 2019 -0500 @@ -18,8 +18,10 @@ }) option_list <- list( + make_option(c("-bam","--bam"), type="character", help="BAM file"), make_option(c("-fasta","--fasta"), type="character", help="Genome fasta file"), make_option(c("-exons","--exons"), type="character", help="Exon annotation gff3 file"), + make_option(c("-organism","--organism"), type="character", help="Organism e.g. hsapiens_gene_ensembl"), make_option(c("-barcodes","--barcodes"), type="character", help="Cell barcodes csv file"), make_option(c("-read1","--read1"), type="character", help="Read 1 fastq.gz"), make_option(c("-read2","--read2"), type="character", help="Read 2 fastq.gz"), @@ -40,6 +42,8 @@ make_option(c("-gene_fl","--gene_fl"), type="logical", help="Remove low abundant genes"), make_option(c("-max_reads","--max_reads"), type="integer", help="Maximum reads processed"), make_option(c("-min_count","--min_count"), type="integer", help="Minimum count to keep"), + make_option(c("-metrics_matrix","--metrics_matrix"), type="logical", help="QC metrics matrix"), + make_option(c("-keep_outliers","--keep_outliers"), type="logical", help="Keep outlier cells"), make_option(c("-report","--report"), type="logical", help="HTML report of plots"), make_option(c("-rdata","--rdata"), type="logical", help="Output RData file"), make_option(c("-nthreads","--nthreads"), type="integer", help="Number of threads") @@ -48,6 +52,7 @@ parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) args = parse_args(parser) +bam = args$bam fa_fn = args$fasta anno_fn = args$exons fq_R1 = args$read1 @@ -71,42 +76,48 @@ # Outputs out_dir = "." -fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index")) -combined_fastq = file.path(out_dir, "combined.fastq") -aligned_bam = file.path(out_dir, "aligned.bam") mapped_bam = file.path(out_dir, "aligned.mapped.bam") +# if input is fastqs +if (!is.null(fa_fn)) { + fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index")) + combined_fastq = file.path(out_dir, "combined.fastq") + aligned_bam = file.path(out_dir, "aligned.bam") -print("Trimming barcodes") -sc_trim_barcode(combined_fastq, - fq_R1, - fq_R2, - read_structure=read_structure, - filter_settings=filter_settings) + print("Trimming barcodes") + sc_trim_barcode(combined_fastq, + fq_R1, + fq_R2, + read_structure=read_structure, + filter_settings=filter_settings) -print("Building genome index") -Rsubread::buildindex(basename=fasta_index, reference=fa_fn) + print("Building genome index") + Rsubread::buildindex(basename=fasta_index, reference=fa_fn) -print("Aligning reads to genome") -Rsubread::align(index=fasta_index, - readfile1=combined_fastq, - output_file=aligned_bam, - nthreads=args$nthreads) + print("Aligning reads to genome") + Rsubread::align(index=fasta_index, + readfile1=combined_fastq, + output_file=aligned_bam, + nthreads=args$nthreads) -if (!is.null(args$barcodes)) { - barcode_anno=args$barcodes + if (!is.null(args$barcodes)) { + barcode_anno=args$barcodes + } else { + print("Detecting barcodes") + # detect 10X barcodes and generate sample_index.csv file + barcode_anno = "sample_index.csv" + sc_detect_bc( + infq=combined_fastq, + outcsv=barcode_anno, + bc_len=read_structure$bl2, + max_reads=args$max_reads, + min_count=args$min_count, + max_mismatch=args$max_mis + ) + } } else { - print("Detecting barcodes") - # detect 10X barcodes and generate sample_index.csv file - barcode_anno = "sample_index.csv" - sc_detect_bc( - infq=combined_fastq, - outcsv=barcode_anno, - bc_len=read_structure$bl2, - max_reads=args$max_reads, - min_count=args$min_count, - max_mismatch=args$max_mis - ) + aligned_bam = file.path(out_dir, bam) + barcode_anno=args$barcodes } print("Assigning reads to exons") @@ -118,32 +129,59 @@ print("Counting genes") sc_gene_counting(out_dir, barcode_anno, UMI_cor=args$UMI_cor, gene_fl=args$gene_fl) -print("Creating SingleCellExperiment object") +print("Performing QC") sce <- create_sce_by_dir(out_dir) +pdf("plots.pdf") +plot_demultiplex(sce) +if (has_umi) { + p = plot_UMI_dup(sce) + print(p) +} +sce = calculate_QC_metrics(sce) +sce = detect_outlier(sce) +p = plot_mapping(sce, percentage=TRUE, dataname=args$samplename) +print(p) +p = plot_QC_pairs(sce) +print(p) +dev.off() + +print("Removing outliers") +if (is.null(args$keep_outliers)) { + sce = remove_outliers(sce) + gene_counts <- counts(sce) + write.table(data.frame("gene_id"=rownames(gene_counts), gene_counts), file="gene_count.tsv", sep="\t", quote=FALSE, row.names=FALSE) +} -if (!is.null(args$report)) { -print("Creating report") -create_report(sample_name=args$samplename, - outdir=out_dir, - r1=fq_R1, - r2=fq_R2, - outfq=combined_fastq, - read_structure=read_structure, - filter_settings=filter_settings, - align_bam=aligned_bam, - genome_index=fasta_index, - map_bam=mapped_bam, - exon_anno=anno_fn, - stnd=args$stnd, - fix_chr=FALSE, - barcode_anno=barcode_anno, - max_mis=args$max_mis, - UMI_cor=args$UMI_cor, - gene_fl=args$gene_fl) +if (!is.null(args$metrics_matrix)) { + metrics <- colData(sce, internal=TRUE) + write.table(data.frame("cell_id"=rownames(metrics), metrics), file="metrics_matrix.tsv", sep="\t", quote=FALSE, row.names=FALSE) +} + +if (!is.null(args$report) & (!is.null(fa_fn))) { + print("Creating report") + create_report(sample_name=args$samplename, + outdir=out_dir, + r1=fq_R1, + r2=fq_R2, + outfq=combined_fastq, + read_structure=read_structure, + filter_settings=filter_settings, + align_bam=aligned_bam, + genome_index=fasta_index, + map_bam=mapped_bam, + exon_anno=anno_fn, + stnd=args$stnd, + fix_chr=FALSE, + barcode_anno=barcode_anno, + max_mis=args$max_mis, + UMI_cor=args$UMI_cor, + gene_fl=args$gene_fl, + organism=args$organism, + gene_id_type="ensembl_gene_id") } if (!is.null(args$rdata) ) { - save(sce, file = file.path(out_dir,"scPipe_analysis.RData")) + save(sce, file = file.path(out_dir,"scPipe_analysis.RData")) } sessionInfo()
--- a/scpipe.xml Fri Aug 17 08:22:49 2018 -0400 +++ b/scpipe.xml Mon Jan 14 08:06:47 2019 -0500 @@ -1,8 +1,10 @@ -<tool id="scpipe" name="scPipe" version="1.0.0"> +<tool id="scpipe" name="scPipe" version="1.0.0+galaxy1"> <description>- preprocessing pipeline for single cell RNA-seq</description> <requirements> <requirement type="package" version="1.0.0">bioconductor-scpipe</requirement> <requirement type="package" version="1.28.1">bioconductor-rsubread</requirement> + <!-- rhtslib can be removed with a newer scpipe package --> + <requirement type="package" version="1.10.0">bioconductor-rhtslib</requirement> <requirement type="package" version="1.20">r-knitr</requirement> <requirement type="package" version="1.10">r-rmarkdown</requirement> <requirement type="package" version="1.1.1">r-readr</requirement> @@ -11,9 +13,9 @@ <requirement type="package" version="1.6.0">bioconductor-scater</requirement> <requirement type="package" version="1.6.2">bioconductor-scran</requirement> <requirement type="package" version="0.13">r-rtsne</requirement> - <!-- Using older version of ggplot2 as getting error like this with 3.0.0: + <!-- Using older version of ggplot2 as getting error like this with 3.0.0: https://github.com/ggobi/ggally/issues/263 --> - <requirement type="package" version="2.2.1">r-ggplot2</requirement> + <requirement type="package" version="2.2.1">r-ggplot2</requirement> <requirement type="package" version="1.6.0">r-optparse</requirement> </requirements> <version_command><![CDATA[ @@ -24,56 +26,78 @@ ## Link input files -#if $ref_fasta.fasta_source == "history": - #set $fasta_name = re.sub('[^\w\-\s]', '_', str($ref_fasta.ref_fa_hist.element_identifier)) - ln -s '$ref_fasta.ref_fa_hist' '$fasta_name' && +#if $samples.format_select == "bam": + #set $bam_name = re.sub('[^\w\-\s]', '_', str($samples.bam.element_identifier)) + ln -s '$samples.bam' '$bam_name' && + ln -s '$samples.bam.metadata.bam_index' '${bam_name}.bai' && #else: - #set $fasta_name = os.path.basename(str($ref_fasta.ref_fa_builtin.fields.path)) - ln -s '$ref_fasta.ref_fa_builtin.fields.path' '$fasta_name' && + + ## FASTA ## + + #if $samples.ref_fasta.fasta_source == "history": + #set $fasta_name = re.sub('[^\w\-\s]', '_', str($samples.ref_fasta.ref_fa_hist.element_identifier)) + ln -s '$samples.ref_fasta.ref_fa_hist' '$fasta_name' && + #else: + #set $fasta_name = os.path.basename(str($samples.ref_fasta.ref_fa_builtin.fields.path)) + ln -s '$samples.ref_fasta.ref_fa_builtin.fields.path' '$fasta_name' && + #end if + + ## Reads ## + + #if $samples.paired_format.paired_format_selector == 'paired_collection': + #set $in1 = $samples.paired_format.paired_input.forward + #set $in2 = $samples.paired_format.paired_input.reverse + #set $in1_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.paired_input.name)) + #set $in2_name = re.sub('[^\w\-\s]', '_', str("%s_%s" % ($samples.paired_format.paired_input.name, "R2"))) + ln -s '$in1' '$in1_name' && + ln -s '$in2' '$in2_name' && + #elif $samples.paired_format.paired_format_selector == 'paired': + #set $in1 = $samples.paired_format.in1 + #set $in2 = $samples.paired_format.in2 + #set $in1_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.in1.element_identifier)) + #set $in2_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.in2.element_identifier)) + ln -s '$in1' '$in1_name' && + ln -s '$in2' '$in2_name' && + #end if #end if +## GFF3 ## + #set $anno_name = re.sub('[^\w\-\s]', '_', str($exons.element_identifier)) #set $anno_name = $anno_name + ".gff3" ln -s '${exons}' '$anno_name' && -#if $paired_format.paired_format_selector == 'paired_collection': - #set $in1 = $paired_format.paired_input.forward - #set $in2 = $paired_format.paired_input.reverse - #set $in1_name = re.sub('[^\w\-\s]', '_', str($paired_format.paired_input.name)) - #set $in2_name = re.sub('[^\w\-\s]', '_', str("%s_%s" % ($paired_format.paired_input.name, "R2"))) - #set out1 = $output_paired_coll.forward - #set out2 = $output_paired_coll.reverse - ln -s '$in1' '$in1_name' && - ln -s '$in2' '$in2_name' && -#else - #set $in1_name = re.sub('[^\w\-\s]', '_', str($in1.element_identifier)) - ln -s '$in1' '$in1_name' && - - #if str($paired_format.paired_format_selector) == 'paired': - #set $in2_name = re.sub('[^\w\-\s]', '_', str($in2.element_identifier)) - ln -s '$in2' '$in2_name' && - #end if -#end if - -#if $rscript: +#if $out.rscript: cp '$__tool_directory__/scpipe.R' '$out_rscript' && #end if TAB=\$(printf '\t') && -#if $barcodes: - sed -i.bak -e "s/\${TAB}/,/g" '$barcodes' && +#if $samples.barcodes: + sed -i.bak -e "s/\${TAB}/,/g" '$samples.barcodes' && #end if ## Run scPipe Rscript '$__tool_directory__/scpipe.R' ---fasta '$fasta_name' +#if $samples.format_select == "bam": + --bam '$bam_name' + --samplename '$bam_name' + --barcodes '$samples.barcodes' +#else: + --fasta '$fasta_name' + --read1 '$in1_name' + --read2 '$in2_name' + --samplename '$in1_name' + #if $barcodes: + --barcodes '$samples.barcodes' + #end if +#end if + --exons '$anno_name' ---samplename '$in1_name' ---read1 '$in1_name' ---read2 '$in2_name' +--organism '$organism' + --bs1 $bs1 --bl1 $bl1 --bs2 $bs2 @@ -81,96 +105,129 @@ --us $us --ul $ul -#if $barcodes: - --barcodes '$barcodes' +#if $out.metrics_matrix: + --metrics_matrix '$out.metrics_matrix' #end if -#if $report: - --report '$report' +#if $out.report: + --report '$out.report' #end if -#if $rdata: - --rdata '$rdata' +#if $out.rdata: + --rdata '$out.rdata' #end if ---rmlow $adv.rmlow ---rmN $adv.rmN ---minq $adv.minq ---numbq $adv.numbq +--rmlow $adv.f.rmlow +--rmN $adv.f.rmN +--minq $adv.f.minq +--numbq $adv.f.numbq +--max_mis $adv.f.max_mis +--max_reads $adv.f.max_reads +--min_count $adv.f.min_count + +--UMI_cor $adv.UMI_cor --stnd $adv.stnd ---max_mis $adv.max_mis ---UMI_cor $adv.UMI_cor --gene_fl $adv.gene_fl ---max_reads $adv.max_reads ---min_count $adv.min_count + --nthreads \${GALAXY_SLOTS:-2} -&& -sed -e "s/,/\${TAB}/g" gene_count.csv > gene_count.tsv +#if $keep_outliers: + --keep_outliers '$keep_outliers' + && sed -e "s/,/\${TAB}/g" gene_count.csv > gene_count.tsv +#end if ]]></command> <inputs> - <conditional name="ref_fasta"> - <param name="fasta_source" type="select" label="Reference genome FASTA"> - <option value="cached" selected="true">Use a built-in genome</option> - <option value="history">Use a FASTA from history</option> + <conditional name="samples"> + <param name="format_select" type="select" label="FASTQs or BAM" help="Select the format of the input sample"> + <option value="fastq" selected="True">FASTQ</option> + <option value="bam">BAM</option> </param> - <when value="cached"> - <param name="ref_fa_builtin" type="select" label="Select a built-in FASTA" help="If your genome of interest is not listed, contact your Galaxy administrator"> - <options from_data_table="all_fasta"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No FASTA is available for the selected input dataset" /> - </options> - </param> + <when value="bam"> + <param name="bam" type="data" format="bam" label="BAM files"/> + <param name="barcodes" type="data" format="tabular,tsv" label="Cell barcodes file" help="File of cell barcodes. Should contain at least two columns, where the first column has the cell id and the second column contains the barcode sequence."/> </when> - <when value="history"> - <param name="ref_fa_hist" type="data" format="fasta" label="Select a history FASTA" /> + <when value="fastq"> + <conditional name="ref_fasta"> + <param name="fasta_source" type="select" label="Reference genome FASTA"> + <option value="cached" selected="true">Use a built-in genome</option> + <option value="history">Use a FASTA from history</option> + </param> + <when value="cached"> + <param name="ref_fa_builtin" type="select" label="Select a built-in FASTA" help="If your genome of interest is not listed, contact your Galaxy administrator"> + <options from_data_table="all_fasta"> + <filter type="sort_by" column="2" /> + <validator type="no_options" message="No FASTA is available for the selected input dataset" /> + </options> + </param> + </when> + <when value="history"> + <param name="ref_fa_hist" type="data" format="fasta" label="Select a history FASTA" /> + </when> + </conditional> + <conditional name="paired_format"> + <param name="paired_format_selector" type="select" label="Paired reads or Paired collection"> + <option value="paired">Paired</option> + <option value="paired_collection">Paired Collection</option> + </param> + <when value="paired"> + <param name="in1" type="data" format="fastq.gz,fastq" label="Input Read 1" help="Read 1 should contain the transcripts in fastq.gz format"/> + <param name="in2" type="data" format="fastq.gz,fastq" label="Input Read 2" help="Read 2 should contain UMI and barcodes in fastq.gz format"/> + </when> + <when value="paired_collection"> + <param name="paired_input" type="data_collection" collection_type="paired" format="fastq.gz,fastq" label="Select paired collection(s)"/> + </when> + </conditional> + <param name="barcodes" type="data" format="tabular,tsv" optional="True" label="Cell barcodes file" help="Optional file of cell barcodes. If not provied the barcodes will be detected from the reads. Should contain at least two columns, where the first column has the cell id and the second column contains the barcode sequence."/> </when> </conditional> - <param name="exons" type="data" format="gff3" label="Exon annotation GFF3 file" help="Current supported sources: ENSEMBL, GENCODE and RefSeq"/> - - <conditional name="paired_format"> - <param name="paired_format_selector" type="select" label="Paired reads or Paired collection"> - <option value="paired">Paired</option> - <option value="paired_collection">Paired Collection</option> - </param> - <when value="paired"> - <param name="in1" type="data" format="fastq.gz,fastq" label="Input Read 1" help="Read 1 should contain the transcripts in fastq.gz format"/> - <param name="in2" type="data" format="fastq.gz,fastq" label="Input Read 2" help="Read 2 should contain UMI and barcodes in fastq.gz format"/> - </when> - <when value="paired_collection"> - <param name="paired_input" type="data_collection" collection_type="paired" format="fastq.gz,fastq" label="Select paired collection(s)"/> - </when> - </conditional> - <param name="barcodes" type="data" format="tabular,tsv" optional="True" label="Cell barcodes file" help="Optional file of cell barcodes. Should contain at least two columns, where the first column has the cell id and the second column contains the barcode sequence."/> + <param name="exons" type="data" format="gff3" label="Exon annotation GFF3 file" help="Current supported source is ENSEMBL"/> + <param name="organism" type="text" label="Species gene id" help="This must be in biomaRt ENSEMBL listDatasets() format e.g. hsapiens_gene_ensembl. See the biomaRt user guide here: https://www.bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/biomaRt.html"> + <validator type="empty_field" /> + <validator type="regex" message="Only letters and underscores are allowed">^[\(\w\)]+$</validator> + </param> <param argument="--bs1" type="integer" min="-1" value="-1" label="Barcode start Read 1" help="Barcode start position in Read 1. Positions are 0-indexed so the first base is considered base 0, -1 indicates no barcode. Default: -1" /> <param argument="--bl1" type="integer" min="0" value="0" label="Barcode length Read 1" help="Barcode length in Read 1, 0 if no barcode present. Default: 0" /> <param argument="--bs2" type="integer" min="-1" value="6" label="Barcode start Read 2" help="Barcode start position in Read 2. Positions are 0-indexed so the first base is considered base 0, -1 indicates no barcode. Default: 6" /> <param argument="--bl2" type="integer" min="0" value="8" label="Barcode length Read 2" help="Barcode length in Read 2, 0 if no barcode present. Default: 8" /> <param argument="--us" type="integer" min="-1" value="0" label="UMI start Read 2" help="UMI start position in Read 2. Positions are 0-indexed so the first base is considered base 0, -1 indicates no UMI. Default: 0" /> <param argument="--ul" type="integer" min="0" value="6" label="UMI length Read 2" help="UMI length in Read 2, 0 if no UMI present. Default: 6" /> - <param name="report" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="True" label="Output HTML Report?" help="If this option is set to Yes, a HTML report containing QC metrics will be output. Default: Yes" /> - <param name="rscript" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used to annotate the IDs will be provided as a text file in the output. Default: No" /> - <param name="rdata" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output RData file?" - help="Output all the data used by R to construct the tables and plots, can be loaded into R. Default: No"> - </param> + <param name="keep_outliers" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Keep outliers?" help="If this option is set to Yes, outlier cells will not be removed from the gene count matrix. Default: No" /> + <section name="out" title="Output Options"> + <param name="plots" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="True" label="Output PDF with plots?" help="If this option is set to Yes, a PDF containing QC plots will be output. Default: Yes" /> + <param name="metrics_matrix" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output QC metrics matrix?" help="If this option is set a matrix of QC metrics will be output. Default: No" /> + <param name="report" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output HTML Report?" help="Only valid if FASTQs are input. If this option is set to Yes, a HTML report containing QC metrics will be output. Default: No" /> + <param name="rscript" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used to annotate the IDs will be provided as a text file in the output. Default: No" /> + <param name="rdata" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output RData file?" + help="Output all the data used by R to construct the tables and plots, can be loaded into R. Default: No"> + </param> + </section> <section name="adv" title="Advanced Options"> - <param argument="--rmlow" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with N in barcode or UMI" help="Default: Yes" /> - <param argument="--rmN" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with low quality" help="Default: Yes" /> - <param argument="--minq" type="integer" min="0" value="20" label="Minimum read quality" help="Default: 20" /> - <param argument="--numbq" type="integer" min="0" value="2" label="Maximum number of bases below minq" help="Default: 2" /> - <param argument="--stnd" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Perform strand-specific mapping" help="Default: Yes" /> - <param argument="--max_mis" type="integer" min="0" value="1" label="Maximum mismatch allowed in barcode" help="Default: 1" /> - <param argument="--UMI_cor" type="integer" min="0" value="1" label="Correct UMI sequence error" help="0 means no correction, 1 means simple correction and merge UMI with distance 1. Default: 1" /> - <param argument="--gene_fl" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Remove low abundant genes" help="Low abundant is defined as only one copy of one UMI for this gene. Default: No" /> - <param argument="--max_reads" type="integer" min="0" value="1000000" label="Maximum reads processed" help="Maximum reads processed if detecting barcodes. Default: 1,000,000" /> - <param argument="--min_count" type="integer" min="0" value="10" label="Minimum count to keep" help="Minimum count to keep if detecting barcodes. Barcode will be discarded if it has lower count. This should be set according to --max_reads. Default: 10" /> + <section name="f" title="FASTQ input only"> + <param argument="--rmlow" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with N in barcode or UMI" help="Default: Yes" /> + <param argument="--rmN" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with low quality" help="Default: Yes" /> + <param argument="--minq" type="integer" min="0" value="20" label="Minimum read quality" help="Default: 20" /> + <param argument="--numbq" type="integer" min="0" value="2" label="Maximum number of bases below minq" help="Default: 2" /> + <param argument="--max_mis" type="integer" min="0" value="1" label="Maximum mismatch allowed in barcode" help="Default: 1" /> + + <param argument="--max_reads" type="integer" min="0" value="1000000" label="Maximum reads processed" help="Maximum reads processed if detecting barcodes. Default: 1,000,000" /> + <param argument="--min_count" type="integer" min="0" value="10" label="Minimum count to keep" help="Minimum count to keep if detecting barcodes. Barcode will be discarded if it has lower count. This should be set according to --max_reads. Default: 10" /> + </section> + <param argument="--UMI_cor" type="integer" min="0" value="1" label="Correct UMI sequence error" help="0 means no correction, 1 means simple correction and merge UMI with distance 1. Default: 1" /> + <param argument="--stnd" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Perform strand-specific mapping" help="Default: Yes" /> + <param argument="--gene_fl" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Remove low abundant genes" help="Low abundant is defined as only one copy of one UMI for this gene. Default: No" /> </section> </inputs> <outputs> <data name="out_matrix" format="tabular" from_work_dir="gene_count.tsv" label="${tool.name} on ${on_string}: Count Matrix" /> + <data name="out_plots" format="pdf" from_work_dir="plots.pdf" label="${tool.name} on ${on_string}: Plots"> + <filter>plots</filter> + </data> + <data name="out_metrics_matrix" format="tabular" from_work_dir="metrics_matrix.tsv" label="${tool.name} on ${on_string}: QC metrics matrix"> + <filter>metrics_matrix</filter> + </data> <data name="out_report" format="html" from_work_dir="report.nb.html" label="${tool.name} on ${on_string}: HTML Report" > <filter>report</filter> </data> @@ -185,9 +242,11 @@ <tests> <!-- Ensure outputs work --> <test> + <param name="format_select" value="fastq" /> <param name="fasta_source" value="history"/> <param name="ref_fa_hist" ftype="fasta" value="mm10_MT19.fa.gz"/> <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/> + <param name="organism" value="mmusculus_gene_ensembl"/> <param name="paired_format_selector" value="paired" /> <param name="in1" ftype="fastqsanger.gz" value="CB51_MT19_R1.gz"/> <param name="in2" ftype="fastqsanger.gz" value="CB51_MT19_R2.gz"/> @@ -208,38 +267,81 @@ </test> <!-- Ensure built-in fasta works --> <test> + <param name="format_select" value="fastq" /> <param name="fasta_source" value="cached"/> <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/> + <param name="organism" value="mmusculus_gene_ensembl"/> <param name="paired_format_selector" value="paired" /> <param name="in1" ftype="fastqsanger.gz" dbkey="mm10" value="CB51_MT19_R1.gz"/> <param name="in2" ftype="fastqsanger.gz" dbkey="mm10" value="CB51_MT19_R2.gz"/> <param name="us" value="-1"/> <param name="max_reads" value="5000000"/> <param name="min_count" value="100"/> - <param name="report" value="False" /> + <output name="out_matrix" > + <assert_contents> + <has_text text="ENSMUSG00000064351" /> + </assert_contents> + </output> + </test> + <!-- Ensure BAM input works --> + <test> + <param name="format_select" value="bam" /> + <param name="bam" ftype="bam" value="aligned.mapped.bam"/> + <param name="barcodes" ftype="tabular" value="barcode_anno.tsv"/> + <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/> + <param name="organism" value="mmusculus_gene_ensembl"/> + <param name="us" value="-1"/> <output name="out_matrix" > <assert_contents> <has_text text="ENSMUSG00000064351" /> </assert_contents> </output> </test> + <!-- Ensure BAM input with QC outputs works --> + <test> + <param name="format_select" value="bam" /> + <param name="bam" ftype="bam" value="aligned.mapped.bam"/> + <param name="barcodes" ftype="tabular" value="barcode_anno.tsv"/> + <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/> + <param name="organism" value="mmusculus_gene_ensembl"/> + <param name="us" value="-1"/> + <param name="plots" value="True"/> + <param name="metrics_matrix" value="True"/> + <output name="out_matrix" > + <assert_contents> + <has_text text="ENSMUSG00000064351" /> + </assert_contents> + </output> + <output name="out_metrics_matrix" > + <assert_contents> + <has_line_matching expression="cell_id.*unaligned.*aligned_unmapped.*mapped_to_exon.*mapped_to_intron.*ambiguous_mapping.*mapped_to_ERCC.*mapped_to_MT.*number_of_genes.*total_count_per_cell.*non_mt_percent\toutliers" /> + </assert_contents> + </output> + <output name="out_plots" ftype="pdf" value="plots.pdf" compare="sim_size" /> + </test> + </tests> <help><![CDATA[ .. class:: infomark **What it does** -scPipe_ is an `R/Bioconductor package`_ that integrates barcode demultiplexing, read alignment, UMI-aware gene-level quantification and quality control of raw sequencing data generated by multiple protocols that include CEL-seq, MARS-seq, Chromium 10X, Drop-seq and Smart-seq. scPipe produces a count matrix that is essential for downstream analysis along with an HTML report that summarises data quality. These results can be used as input for downstream analyses including normalization, visualization and statistical testing. -Examples of the report output can be found here_. +scPipe_ is an `R/Bioconductor package`_ that integrates barcode demultiplexing, read alignment, UMI-aware gene-level quantification and quality control of raw sequencing data generated by multiple protocols that include CEL-seq, MARS-seq, Chromium 10X, Drop-seq and Smart-seq. scPipe produces a count matrix that is essential for downstream analysis along with QC metrics and a HTML report that summarises data quality. These results can be used as input for downstream analyses including normalization, visualization and statistical testing. +The scPipe workflow is described in this vignette_ and examples of the report output can be found here_. Note that outlier cells are detected and removed by default but they can be kept if "Keep outliers?" is selected. ----- **Inputs** +Either * Reference genome in FASTA format - * Exon annotation in GFF3 format * Paired-end FASTQ.GZ reads * Cell barcodes TAB-separated file (Optional) +OR + * BAM file + * Cell barcodes TAB-separated file +AND + * Exon annotation in ENSEMBL GFF3 format *Read Structure* @@ -266,12 +368,15 @@ Optionally you can choose to output - * HTML report (default is Yes) + * PDF of QC Plots (default is Yes) + * QC metrics matrix + * HTML report (if FASTQs are input) * Rscript * RData .. _scPipe: http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006361 .. _R/Bioconductor package: https://bioconductor.org/packages/release/bioc/html/scPipe.html +.. _vignette: https://bioconductor.org/packages/release/bioc/vignettes/scPipe/inst/doc/scPipe_tutorial.html .. _here: http://bioinf.wehi.edu.au/scPipe/ ]]></help>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/barcode_anno.tsv Mon Jan 14 08:06:47 2019 -0500 @@ -0,0 +1,263 @@ +CELL_0000 CGACCATA 146 +CELL_0001 TACACGTG 193 +CELL_0002 ATGCAGTC 144 +CELL_0003 TAGGCTGT 280 +CELL_0004 CACAAGTC 1087 +CELL_0005 TGACTCAG 266 +CELL_0006 TGACCATG 156 +CELL_0007 GTTGCTGA 150 +CELL_0008 CAGTACCA 999 +CELL_0009 GCTTGATG 206 +CELL_0010 ACGTGTGT 151 +CELL_0011 GCAAGCTT 255 +CELL_0012 GACCTACA 183 +CELL_0013 TACGCTGA 208 +CELL_0014 GGTACACT 202 +CELL_0015 GCGTTAGT 141 +CELL_0016 CAACATGC 201 +CELL_0017 CATTGTGG 941 +CELL_0018 AAGGAGGA 178 +CELL_0019 CGTAGTGT 150 +CELL_0020 GGTTAGCT 234 +CELL_0021 AGGTTCCA 466 +CELL_0022 TACCGATG 434 +CELL_0023 AGGTTCAC 351 +CELL_0024 CAACTGAC 299 +CELL_0025 GCAAGTTC 220 +CELL_0026 GGTGATTC 138 +CELL_0027 CTGGTATG 347 +CELL_0028 TGCCAGAT 282 +CELL_0029 CGTGATGT 122 +CELL_0030 GACCATCA 103 +CELL_0031 TACCGTGA 186 +CELL_0032 TAGTGCGT 105 +CELL_0033 GCATTAGC 219 +CELL_0034 GCGATATC 345 +CELL_0035 CAAGTGTC 215 +CELL_0036 TGCAGACT 536 +CELL_0037 ATCCGCAA 283 +CELL_0038 GGTTATGC 415 +CELL_0039 CGCAACAT 186 +CELL_0040 AGGAAGGA 109 +CELL_0041 AGTCACGT 291 +CELL_0042 GCACGTAT 113 +CELL_0043 ATTGGCGT 101 +CELL_0044 GGACTTAC 316 +CELL_0045 ACTGTGAC 693 +CELL_0046 TACTGTGG 122 +CELL_0047 CATCAAGC 184 +CELL_0048 TGACTACG 155 +CELL_0049 GTAGTGTC 203 +CELL_0050 TATCCGAC 171 +CELL_0051 GTCATGCA 259 +CELL_0052 CACAATCG 118 +CELL_0053 GCAATGTC 140 +CELL_0054 TGTATGGC 290 +CELL_0055 GCTAGTGT 226 +CELL_0056 ATACCACG 141 +CELL_0057 GGTATCGT 162 +CELL_0058 ATGCTGAC 129 +CELL_0059 GCAGTCAT 148 +CELL_0060 CAATCCAG 142 +CELL_0061 TAGGTTGC 454 +CELL_0062 AACCGTAC 510 +CELL_0063 CGTCGATA 454 +CELL_0064 ATCCGTAG 379 +CELL_0065 CCACTAAG 267 +CELL_0066 GTCAGTAC 426 +CELL_0067 CCTAGTAG 138 +CELL_0068 TGCGACTA 201 +CELL_0069 ACCGACAT 164 +CELL_0070 TCGCTGAA 168 +CELL_0071 AGCGTATC 1291 +CELL_0072 TCACGGTA 146 +CELL_0073 TGCAATGC 237 +CELL_0074 GGCCAATT 289 +CELL_0075 TCTCGGAA 336 +CELL_0076 GAGACCTT 291 +CELL_0077 CGCTAGTA 511 +CELL_0078 CCTGTAAG 215 +CELL_0079 GCGTGATT 338 +CELL_0080 CTCAAGAC 182 +CELL_0081 TAGCGATC 250 +CELL_0082 GGACTTGT 711 +CELL_0083 CTCACAGA 259 +CELL_0084 GTAGACCT 495 +CELL_0085 AGTGTGCT 477 +CELL_0086 ACTGTTGG 208 +CELL_0087 ACGCCATA 159 +CELL_0088 GGTCCAAT 281 +CELL_0089 TATCCGGA 159 +CELL_0090 GTCGCTAA 245 +CELL_0091 ACCAAGCT 135 +CELL_0092 TGTCGATG 112 +CELL_0093 GCCAATTG 161 +CELL_0094 ATGCCAGT 1068 +CELL_0095 ACGATGTC 211 +CELL_0096 CATCCAGA 156 +CELL_0097 CAACGCTA 229 +CELL_0098 GAAGGAAG 110 +CELL_0099 TGGCGATT 414 +CELL_0100 AGTCATGC 248 +CELL_0101 AGCGTTAC 123 +CELL_0102 ACTCAAGC 148 +CELL_0103 CGATAGCT 170 +CELL_0104 CTACGCAA 254 +CELL_0105 AAAAAAAA 261 +CELL_0106 TAACCGTG 136 +CELL_0107 TAGCGTCA 385 +CELL_0108 ACCACTGA 163 +CELL_0109 CGCATGTA 109 +CELL_0110 ACCGTGAT 137 +CELL_0111 CTCCAAGA 179 +CELL_0112 CACTACAG 144 +CELL_0113 AGCCTAAC 104 +CELL_0114 CTGTGAGT 112 +CELL_0115 CTGCAACA 320 +CELL_0116 CTCACGAA 1181 +CELL_0117 ATTCGCAG 858 +CELL_0118 CGCAATAC 127 +CELL_0119 TCAGTGCA 151 +CELL_0120 GTGAGTTC 123 +CELL_0121 CCTATGAG 136 +CELL_0122 CAGCTTAG 141 +CELL_0123 TCTCAAGG 151 +CELL_0124 GTGGTCTA 242 +CELL_0125 ATTGTGGC 160 +CELL_0126 ACTACGGT 206 +CELL_0127 TCATGGTG 134 +CELL_0128 AGCAGCTT 164 +CELL_0129 GTACTGTG 341 +CELL_0130 CACGAGTT 103 +CELL_0131 TCAGGTGT 103 +CELL_0132 ACCAACGT 203 +CELL_0133 CGACTACA 226 +CELL_0134 AACGTCAC 375 +CELL_0135 GACATGCT 372 +CELL_0136 TCCATGAG 182 +CELL_0137 GGATGCTT 114 +CELL_0138 GGCACTTA 176 +CELL_0139 AGTGTCTG 205 +CELL_0140 TGCGTAGT 319 +CELL_0141 TAAGCTGC 340 +CELL_0142 GTACATCG 159 +CELL_0143 TGGCAACT 1094 +CELL_0144 GGTGCATT 143 +CELL_0145 TGCATGGT 136 +CELL_0146 GGTCAATC 248 +CELL_0147 CGTGAATC 233 +CELL_0148 CAATCGGT 377 +CELL_0149 ACTACACG 165 +CELL_0150 TGCACTGA 116 +CELL_0151 GCGCAATT 132 +CELL_0152 ACAACGTC 987 +CELL_0153 AGCATGTC 260 +CELL_0154 TCACACGA 106 +CELL_0155 GAATCCAC 282 +CELL_0156 TAGGCCAT 335 +CELL_0157 ACACTCGA 374 +CELL_0158 GAACTTCG 155 +CELL_0159 CACTGCAA 119 +CELL_0160 TACGGATC 159 +CELL_0161 GCCAACTA 349 +CELL_0162 TACCACGA 151 +CELL_0163 GATACTGC 127 +CELL_0164 TCAGGCAT 125 +CELL_0165 TGCGGTTA 136 +CELL_0166 TCAGCGTA 245 +CELL_0167 GCTAAGCT 476 +CELL_0168 GTCCAACA 117 +CELL_0169 CCATCAAG 123 +CELL_0170 CACGGTAT 240 +CELL_0171 GTCATGTG 130 +CELL_0172 CGTAACTG 113 +CELL_0173 GTTACCGA 126 +CELL_0174 ATAGGCTC 1103 +CELL_0175 CCATCGAA 162 +CELL_0176 AGGAGGAA 105 +CELL_0177 ATTGCGTG 317 +CELL_0178 ACGTATCG 142 +CELL_0179 GCCATCAA 153 +CELL_0180 AGCTTGCA 169 +CELL_0181 TCACAGAC 275 +CELL_0182 CGACACAT 148 +CELL_0183 AGCACTTG 717 +CELL_0184 GTGCGTTA 163 +CELL_0185 GAGTTCCA 122 +CELL_0186 ATGGTCCA 171 +CELL_0187 GTTGAGTC 196 +CELL_0188 GCGTGTAT 577 +CELL_0189 TCGAGTTG 176 +CELL_0190 TACGAGTC 169 +CELL_0191 CAATTGGC 738 +CELL_0192 GGGGGGGG 183 +CELL_0193 GCCTTAAG 102 +CELL_0194 GGTCTATG 233 +CELL_0195 ACACACGT 1061 +CELL_0196 TGGTCTGA 150 +CELL_0197 GAGAGAAG 101 +CELL_0198 GCATGTGT 101 +CELL_0199 ACGCACTA 173 +CELL_0200 TCGCACAA 203 +CELL_0201 GATTCGGT 102 +CELL_0202 GCTGCATA 282 +CELL_0203 TATGGCTG 167 +CELL_0204 GTTCCAAG 377 +CELL_0205 CTAGGATC 200 +CELL_0206 TCTTCCTC 181 +CELL_0207 GTGCTCAA 602 +CELL_0208 TGAGTTGC 200 +CELL_0209 AACACCAC 163 +CELL_0210 TGCTAGGT 361 +CELL_0211 TGCACGTA 325 +CELL_0212 ATGTCCAG 687 +CELL_0213 ACCAAGTC 172 +CELL_0214 CACACTAG 247 +CELL_0215 AGTATCCG 126 +CELL_0216 ACCATGTG 155 +CELL_0217 CAAGCCAT 231 +CELL_0218 AGCTCATG 169 +CELL_0219 TCGTTGGA 192 +CELL_0220 GACTGTAC 250 +CELL_0221 ACTCTGAG 157 +CELL_0222 ACATGCCA 399 +CELL_0223 GACTACTG 735 +CELL_0224 ACACGCAT 401 +CELL_0225 CGTAGCAT 199 +CELL_0226 TAGCCGAT 319 +CELL_0227 ATCACGAC 129 +CELL_0228 ATGGTGCT 2031 +CELL_0229 GGTAACTC 155 +CELL_0230 AGCTCTGA 162 +CELL_0231 TCGAGACT 279 +CELL_0232 TGTCGCAA 121 +CELL_0233 CCAATAGC 145 +CELL_0234 CTATCGGA 812 +CELL_0235 CAACTAGC 188 +CELL_0236 TGCGGATT 132 +CELL_0237 CCGTTAAG 123 +CELL_0238 TCGACTAG 455 +CELL_0239 TATTGCGG 325 +CELL_0240 GCGCTTAA 379 +CELL_0241 CCGATGTA 488 +CELL_0242 TGGTCGAT 163 +CELL_0243 CTACGAAC 346 +CELL_0244 CAGTCCAA 105 +CELL_0245 AGCTTACG 221 +CELL_0246 CGACATAC 124 +CELL_0247 ATTGCCGA 1198 +CELL_0248 TCCTCTTC 160 +CELL_0249 AGCTGTTG 335 +CELL_0250 GCATAGTC 319 +CELL_0251 CGATTGTG 386 +CELL_0252 TGGTTGCA 339 +CELL_0253 CGAATCCA 158 +CELL_0254 TGGTGATC 318 +CELL_0255 AGAAGAGG 153 +CELL_0256 GTCACGTA 105 +CELL_0257 GTACTTGG 688 +CELL_0258 GTCCGAAT 265 +CELL_0259 TCTCCTCT 501 +CELL_0260 GATTGGTC 372 +CELL_0261 TACTGGCA 117 +CELL_0262 GTATGGTC 621