Mercurial > repos > iuc > scpipe

--- a/scpipe.R	Fri Aug 17 08:22:49 2018 -0400
+++ b/scpipe.R	Mon Jan 14 08:06:47 2019 -0500
@@ -18,8 +18,10 @@
 })

 option_list <- list(
+    make_option(c("-bam","--bam"), type="character", help="BAM file"),
     make_option(c("-fasta","--fasta"), type="character", help="Genome fasta file"),
     make_option(c("-exons","--exons"), type="character", help="Exon annotation gff3 file"),
+    make_option(c("-organism","--organism"), type="character", help="Organism e.g. hsapiens_gene_ensembl"),
     make_option(c("-barcodes","--barcodes"), type="character", help="Cell barcodes csv file"),
     make_option(c("-read1","--read1"), type="character", help="Read 1 fastq.gz"),
     make_option(c("-read2","--read2"), type="character", help="Read 2 fastq.gz"),
@@ -40,6 +42,8 @@
     make_option(c("-gene_fl","--gene_fl"), type="logical", help="Remove low abundant genes"),
     make_option(c("-max_reads","--max_reads"), type="integer", help="Maximum reads processed"),
     make_option(c("-min_count","--min_count"), type="integer", help="Minimum count to keep"),
+    make_option(c("-metrics_matrix","--metrics_matrix"), type="logical", help="QC metrics matrix"),
+    make_option(c("-keep_outliers","--keep_outliers"), type="logical", help="Keep outlier cells"),
     make_option(c("-report","--report"), type="logical", help="HTML report of plots"),
     make_option(c("-rdata","--rdata"), type="logical", help="Output RData file"),
     make_option(c("-nthreads","--nthreads"), type="integer", help="Number of threads")
@@ -48,6 +52,7 @@
 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 args = parse_args(parser)

+bam = args$bam
 fa_fn = args$fasta
 anno_fn = args$exons
 fq_R1 = args$read1
@@ -71,42 +76,48 @@

 # Outputs
 out_dir = "."
-fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index"))
-combined_fastq = file.path(out_dir, "combined.fastq")
-aligned_bam = file.path(out_dir, "aligned.bam")
 mapped_bam = file.path(out_dir, "aligned.mapped.bam")

+# if input is fastqs
+if (!is.null(fa_fn)) {
+  fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index"))
+  combined_fastq = file.path(out_dir, "combined.fastq")
+  aligned_bam = file.path(out_dir, "aligned.bam")

-print("Trimming barcodes")
-sc_trim_barcode(combined_fastq,
-                fq_R1,
-                fq_R2,
-                read_structure=read_structure,
-                filter_settings=filter_settings)
+  print("Trimming barcodes")
+  sc_trim_barcode(combined_fastq,
+                  fq_R1,
+                  fq_R2,
+                  read_structure=read_structure,
+                  filter_settings=filter_settings)

-print("Building genome index")
-Rsubread::buildindex(basename=fasta_index, reference=fa_fn)
+  print("Building genome index")
+  Rsubread::buildindex(basename=fasta_index, reference=fa_fn)

-print("Aligning reads to genome")
-Rsubread::align(index=fasta_index,
-    readfile1=combined_fastq,
-    output_file=aligned_bam,
-    nthreads=args$nthreads)
+  print("Aligning reads to genome")
+  Rsubread::align(index=fasta_index,
+      readfile1=combined_fastq,
+      output_file=aligned_bam,
+      nthreads=args$nthreads)

-if (!is.null(args$barcodes)) {
-  barcode_anno=args$barcodes
+  if (!is.null(args$barcodes)) {
+    barcode_anno=args$barcodes
+  } else {
+    print("Detecting barcodes")
+    # detect 10X barcodes and generate sample_index.csv file
+    barcode_anno = "sample_index.csv"
+    sc_detect_bc(
+        infq=combined_fastq,
+        outcsv=barcode_anno,
+        bc_len=read_structure$bl2,
+        max_reads=args$max_reads,
+        min_count=args$min_count,
+        max_mismatch=args$max_mis
+    )
+  }
 } else {
-  print("Detecting barcodes")
-  # detect 10X barcodes and generate sample_index.csv file
-  barcode_anno = "sample_index.csv"
-  sc_detect_bc(
-      infq=combined_fastq,
-      outcsv=barcode_anno,
-      bc_len=read_structure$bl2,
-      max_reads=args$max_reads,
-      min_count=args$min_count,
-      max_mismatch=args$max_mis
-  )
+   aligned_bam = file.path(out_dir, bam)
+   barcode_anno=args$barcodes
 }

 print("Assigning reads to exons")
@@ -118,32 +129,59 @@
 print("Counting genes")
 sc_gene_counting(out_dir, barcode_anno, UMI_cor=args$UMI_cor, gene_fl=args$gene_fl)

-print("Creating SingleCellExperiment object")
+print("Performing QC")
 sce <- create_sce_by_dir(out_dir)
+pdf("plots.pdf")
+plot_demultiplex(sce)
+if (has_umi) {
+  p = plot_UMI_dup(sce)
+  print(p)
+}
+sce = calculate_QC_metrics(sce)
+sce = detect_outlier(sce)
+p = plot_mapping(sce, percentage=TRUE, dataname=args$samplename)
+print(p)
+p = plot_QC_pairs(sce)
+print(p)
+dev.off()
+
+print("Removing outliers")
+if (is.null(args$keep_outliers)) {
+  sce = remove_outliers(sce)
+  gene_counts <- counts(sce)
+  write.table(data.frame("gene_id"=rownames(gene_counts), gene_counts), file="gene_count.tsv", sep="\t", quote=FALSE, row.names=FALSE)
+}

-if (!is.null(args$report)) {
-print("Creating report")
-create_report(sample_name=args$samplename,
-   outdir=out_dir,
-   r1=fq_R1,
-   r2=fq_R2,
-   outfq=combined_fastq,
-   read_structure=read_structure,
-   filter_settings=filter_settings,
-   align_bam=aligned_bam,
-   genome_index=fasta_index,
-   map_bam=mapped_bam,
-   exon_anno=anno_fn,
-   stnd=args$stnd,
-   fix_chr=FALSE,
-   barcode_anno=barcode_anno,
-   max_mis=args$max_mis,
-   UMI_cor=args$UMI_cor,
-   gene_fl=args$gene_fl)
+if (!is.null(args$metrics_matrix)) {
+  metrics <- colData(sce, internal=TRUE)
+  write.table(data.frame("cell_id"=rownames(metrics), metrics), file="metrics_matrix.tsv", sep="\t", quote=FALSE, row.names=FALSE)
+}
+
+if (!is.null(args$report) & (!is.null(fa_fn))) {
+  print("Creating report")
+  create_report(sample_name=args$samplename,
+     outdir=out_dir,
+     r1=fq_R1,
+     r2=fq_R2,
+     outfq=combined_fastq,
+     read_structure=read_structure,
+     filter_settings=filter_settings,
+     align_bam=aligned_bam,
+     genome_index=fasta_index,
+     map_bam=mapped_bam,
+     exon_anno=anno_fn,
+     stnd=args$stnd,
+     fix_chr=FALSE,
+     barcode_anno=barcode_anno,
+     max_mis=args$max_mis,
+     UMI_cor=args$UMI_cor,
+     gene_fl=args$gene_fl,
+     organism=args$organism,
+     gene_id_type="ensembl_gene_id")
 }

 if (!is.null(args$rdata) ) {
-    save(sce, file = file.path(out_dir,"scPipe_analysis.RData"))
+  save(sce, file = file.path(out_dir,"scPipe_analysis.RData"))
 }

 sessionInfo()
--- a/scpipe.xml	Fri Aug 17 08:22:49 2018 -0400
+++ b/scpipe.xml	Mon Jan 14 08:06:47 2019 -0500
@@ -1,8 +1,10 @@
-<tool id="scpipe" name="scPipe" version="1.0.0">
+<tool id="scpipe" name="scPipe" version="1.0.0+galaxy1">
     <description>- preprocessing pipeline for single cell RNA-seq</description>
     <requirements>
         <requirement type="package" version="1.0.0">bioconductor-scpipe</requirement>
         <requirement type="package" version="1.28.1">bioconductor-rsubread</requirement>
+        <!-- rhtslib can be removed with a newer scpipe package -->
+        <requirement type="package" version="1.10.0">bioconductor-rhtslib</requirement>
         <requirement type="package" version="1.20">r-knitr</requirement>
         <requirement type="package" version="1.10">r-rmarkdown</requirement>
         <requirement type="package" version="1.1.1">r-readr</requirement>
@@ -11,9 +13,9 @@
         <requirement type="package" version="1.6.0">bioconductor-scater</requirement>
         <requirement type="package" version="1.6.2">bioconductor-scran</requirement>
         <requirement type="package" version="0.13">r-rtsne</requirement>
-        <!-- Using older version of ggplot2 as getting error like this with 3.0.0:
+        <!-- Using older version of ggplot2 as getting error like this with 3.0.0:
         https://github.com/ggobi/ggally/issues/263  -->
-        <requirement type="package" version="2.2.1">r-ggplot2</requirement>
+        <requirement type="package" version="2.2.1">r-ggplot2</requirement>
         <requirement type="package" version="1.6.0">r-optparse</requirement>
     </requirements>
     <version_command><![CDATA[
@@ -24,56 +26,78 @@

 ## Link input files

-#if $ref_fasta.fasta_source == "history":
-    #set $fasta_name = re.sub('[^\w\-\s]', '_', str($ref_fasta.ref_fa_hist.element_identifier))
-    ln -s '$ref_fasta.ref_fa_hist' '$fasta_name' &&
+#if $samples.format_select == "bam":
+    #set $bam_name = re.sub('[^\w\-\s]', '_', str($samples.bam.element_identifier))
+    ln -s '$samples.bam' '$bam_name' &&
+    ln -s '$samples.bam.metadata.bam_index' '${bam_name}.bai' &&
 #else:
-    #set $fasta_name = os.path.basename(str($ref_fasta.ref_fa_builtin.fields.path))
-    ln -s '$ref_fasta.ref_fa_builtin.fields.path' '$fasta_name' &&
+
+    ## FASTA ##
+
+    #if $samples.ref_fasta.fasta_source == "history":
+        #set $fasta_name = re.sub('[^\w\-\s]', '_', str($samples.ref_fasta.ref_fa_hist.element_identifier))
+        ln -s '$samples.ref_fasta.ref_fa_hist' '$fasta_name' &&
+    #else:
+        #set $fasta_name = os.path.basename(str($samples.ref_fasta.ref_fa_builtin.fields.path))
+        ln -s '$samples.ref_fasta.ref_fa_builtin.fields.path' '$fasta_name' &&
+    #end if
+
+    ## Reads ##
+
+    #if $samples.paired_format.paired_format_selector == 'paired_collection':
+        #set $in1 = $samples.paired_format.paired_input.forward
+        #set $in2 = $samples.paired_format.paired_input.reverse
+        #set $in1_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.paired_input.name))
+        #set $in2_name = re.sub('[^\w\-\s]', '_', str("%s_%s" % ($samples.paired_format.paired_input.name, "R2")))
+        ln -s '$in1' '$in1_name' &&
+        ln -s '$in2' '$in2_name' &&
+    #elif $samples.paired_format.paired_format_selector == 'paired':
+        #set $in1 = $samples.paired_format.in1
+        #set $in2 = $samples.paired_format.in2
+        #set $in1_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.in1.element_identifier))
+        #set $in2_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.in2.element_identifier))
+        ln -s '$in1' '$in1_name' &&
+        ln -s '$in2' '$in2_name' &&
+    #end if
 #end if

+## GFF3 ##
+
 #set $anno_name = re.sub('[^\w\-\s]', '_', str($exons.element_identifier))
 #set $anno_name = $anno_name + ".gff3"
 ln -s '${exons}' '$anno_name' &&

-#if $paired_format.paired_format_selector == 'paired_collection':
-    #set $in1 = $paired_format.paired_input.forward
-    #set $in2 = $paired_format.paired_input.reverse
-    #set $in1_name = re.sub('[^\w\-\s]', '_', str($paired_format.paired_input.name))
-    #set $in2_name = re.sub('[^\w\-\s]', '_', str("%s_%s" % ($paired_format.paired_input.name, "R2")))
-    #set out1 = $output_paired_coll.forward
-    #set out2 = $output_paired_coll.reverse
-    ln -s '$in1' '$in1_name' &&
-    ln -s '$in2' '$in2_name' &&
-#else
-    #set $in1_name = re.sub('[^\w\-\s]', '_', str($in1.element_identifier))
-    ln -s '$in1' '$in1_name' &&
-
-    #if str($paired_format.paired_format_selector) == 'paired':
-        #set $in2_name = re.sub('[^\w\-\s]', '_', str($in2.element_identifier))
-        ln -s '$in2' '$in2_name' &&
-    #end if
-#end if
-
-#if $rscript:
+#if $out.rscript:
     cp '$__tool_directory__/scpipe.R' '$out_rscript' &&
 #end if

 TAB=\$(printf '\t') &&

-#if $barcodes:
-    sed -i.bak -e "s/\${TAB}/,/g" '$barcodes' &&
+#if $samples.barcodes:
+    sed -i.bak -e "s/\${TAB}/,/g" '$samples.barcodes' &&
 #end if

 ## Run scPipe

 Rscript '$__tool_directory__/scpipe.R'

---fasta '$fasta_name'
+#if $samples.format_select == "bam":
+    --bam '$bam_name'
+    --samplename '$bam_name'
+    --barcodes '$samples.barcodes'
+#else:
+    --fasta '$fasta_name'
+    --read1 '$in1_name'
+    --read2 '$in2_name'
+    --samplename '$in1_name'
+    #if $barcodes:
+        --barcodes '$samples.barcodes'
+    #end if
+#end if
+
 --exons '$anno_name'
---samplename '$in1_name'
---read1 '$in1_name'
---read2 '$in2_name'
+--organism '$organism'
+
 --bs1 $bs1
 --bl1 $bl1
 --bs2 $bs2
@@ -81,96 +105,129 @@
 --us $us
 --ul $ul

-#if $barcodes:
-    --barcodes '$barcodes'
+#if $out.metrics_matrix:
+    --metrics_matrix '$out.metrics_matrix'
 #end if

-#if $report:
-    --report '$report'
+#if $out.report:
+    --report '$out.report'
 #end if

-#if $rdata:
-    --rdata '$rdata'
+#if $out.rdata:
+    --rdata '$out.rdata'
 #end if

---rmlow $adv.rmlow
---rmN $adv.rmN
---minq $adv.minq
---numbq $adv.numbq
+--rmlow $adv.f.rmlow
+--rmN $adv.f.rmN
+--minq $adv.f.minq
+--numbq $adv.f.numbq
+--max_mis $adv.f.max_mis
+--max_reads $adv.f.max_reads
+--min_count $adv.f.min_count
+
+--UMI_cor $adv.UMI_cor
 --stnd $adv.stnd
---max_mis $adv.max_mis
---UMI_cor $adv.UMI_cor
 --gene_fl $adv.gene_fl
---max_reads $adv.max_reads
---min_count $adv.min_count
+
 --nthreads \${GALAXY_SLOTS:-2}

-&&
-sed -e "s/,/\${TAB}/g" gene_count.csv > gene_count.tsv
+#if $keep_outliers:
+    --keep_outliers '$keep_outliers'
+    && sed -e "s/,/\${TAB}/g" gene_count.csv > gene_count.tsv
+#end if

 ]]></command>

     <inputs>
-        <conditional name="ref_fasta">
-            <param name="fasta_source" type="select" label="Reference genome FASTA">
-                <option value="cached" selected="true">Use a built-in genome</option>
-                <option value="history">Use a FASTA from history</option>
+        <conditional name="samples">
+            <param name="format_select" type="select" label="FASTQs or BAM" help="Select the format of the input sample">
+                <option value="fastq" selected="True">FASTQ</option>
+                <option value="bam">BAM</option>
             </param>
-            <when value="cached">
-                <param name="ref_fa_builtin" type="select" label="Select a built-in FASTA" help="If your genome of interest is not listed, contact your Galaxy administrator">
-                    <options from_data_table="all_fasta">
-                        <filter type="sort_by" column="2" />
-                        <validator type="no_options" message="No FASTA is available for the selected input dataset" />
-                    </options>
-                </param>
+            <when value="bam">
+                <param name="bam" type="data" format="bam" label="BAM files"/>
+                <param name="barcodes" type="data" format="tabular,tsv" label="Cell barcodes file" help="File of cell barcodes. Should contain at least two columns, where the first column has the cell id and the second column contains the barcode sequence."/>
             </when>
-            <when value="history">
-                <param name="ref_fa_hist" type="data" format="fasta" label="Select a history FASTA" />
+            <when value="fastq">
+                <conditional name="ref_fasta">
+                    <param name="fasta_source" type="select" label="Reference genome FASTA">
+                        <option value="cached" selected="true">Use a built-in genome</option>
+                        <option value="history">Use a FASTA from history</option>
+                    </param>
+                    <when value="cached">
+                        <param name="ref_fa_builtin" type="select" label="Select a built-in FASTA" help="If your genome of interest is not listed, contact your Galaxy administrator">
+                            <options from_data_table="all_fasta">
+                                <filter type="sort_by" column="2" />
+                                <validator type="no_options" message="No FASTA is available for the selected input dataset" />
+                            </options>
+                        </param>
+                    </when>
+                    <when value="history">
+                        <param name="ref_fa_hist" type="data" format="fasta" label="Select a history FASTA" />
+                    </when>
+                </conditional>
+                <conditional name="paired_format">
+                    <param name="paired_format_selector" type="select" label="Paired reads or Paired collection">
+                        <option value="paired">Paired</option>
+                        <option value="paired_collection">Paired Collection</option>
+                    </param>
+                    <when value="paired">
+                        <param name="in1" type="data" format="fastq.gz,fastq" label="Input Read 1" help="Read 1 should contain the transcripts in fastq.gz format"/>
+                        <param name="in2" type="data" format="fastq.gz,fastq" label="Input Read 2" help="Read 2 should contain UMI and barcodes in fastq.gz format"/>
+                    </when>
+                    <when value="paired_collection">
+                        <param name="paired_input" type="data_collection" collection_type="paired" format="fastq.gz,fastq" label="Select paired collection(s)"/>
+                    </when>
+                </conditional>
+                <param name="barcodes" type="data" format="tabular,tsv" optional="True" label="Cell barcodes file" help="Optional file of cell barcodes. If not provied the barcodes will be detected from the reads. Should contain at least two columns, where the first column has the cell id and the second column contains the barcode sequence."/>
             </when>
         </conditional>
-        <param name="exons" type="data" format="gff3" label="Exon annotation GFF3 file" help="Current supported sources: ENSEMBL, GENCODE and RefSeq"/>
-
-        <conditional name="paired_format">
-            <param name="paired_format_selector" type="select" label="Paired reads or Paired collection">
-                <option value="paired">Paired</option>
-                <option value="paired_collection">Paired Collection</option>
-            </param>
-            <when value="paired">
-                <param name="in1" type="data" format="fastq.gz,fastq" label="Input Read 1" help="Read 1 should contain the transcripts in fastq.gz format"/>
-                <param name="in2" type="data" format="fastq.gz,fastq" label="Input Read 2" help="Read 2 should contain UMI and barcodes in fastq.gz format"/>
-            </when>
-            <when value="paired_collection">
-                <param name="paired_input" type="data_collection" collection_type="paired" format="fastq.gz,fastq" label="Select paired collection(s)"/>
-            </when>
-        </conditional>
-        <param name="barcodes" type="data" format="tabular,tsv" optional="True" label="Cell barcodes file" help="Optional file of cell barcodes. Should contain at least two columns, where the first column has the cell id and the second column contains the barcode sequence."/>
+        <param name="exons" type="data" format="gff3" label="Exon annotation GFF3 file" help="Current supported source is ENSEMBL"/>
+        <param name="organism" type="text" label="Species gene id" help="This must be in biomaRt ENSEMBL listDatasets() format e.g. hsapiens_gene_ensembl. See the biomaRt user guide here: https://www.bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/biomaRt.html">
+            <validator type="empty_field" />
+            <validator type="regex" message="Only letters and underscores are allowed">^[\(\w\)]+$</validator>
+        </param>
         <param argument="--bs1" type="integer" min="-1" value="-1" label="Barcode start Read 1" help="Barcode start position in Read 1. Positions are 0-indexed so the first base is considered base 0, -1 indicates no barcode. Default: -1" />
         <param argument="--bl1" type="integer" min="0" value="0" label="Barcode length Read 1" help="Barcode length in Read 1, 0 if no barcode present. Default: 0" />
         <param argument="--bs2" type="integer" min="-1" value="6" label="Barcode start Read 2" help="Barcode start position in Read 2. Positions are 0-indexed so the first base is considered base 0, -1 indicates no barcode.  Default: 6" />
         <param argument="--bl2" type="integer" min="0" value="8" label="Barcode length Read 2" help="Barcode length in Read 2, 0 if no barcode present. Default: 8" />
         <param argument="--us" type="integer" min="-1" value="0" label="UMI start Read 2" help="UMI start position in Read 2. Positions are 0-indexed so the first base is considered base 0, -1 indicates no UMI.  Default: 0" />
         <param argument="--ul" type="integer" min="0" value="6" label="UMI length Read 2" help="UMI length in Read 2, 0 if no UMI present. Default: 6" />
-        <param name="report" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="True" label="Output HTML Report?" help="If this option is set to Yes, a HTML report containing QC metrics will be output. Default: Yes" />
-        <param name="rscript" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used to annotate the IDs will be provided as a text file in the output. Default: No" />
-        <param name="rdata" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output RData file?"
-            help="Output all the data used by R to construct the tables and plots, can be loaded into R. Default: No">
-        </param>
+        <param name="keep_outliers" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Keep outliers?" help="If this option is set to Yes, outlier cells will not be removed from the gene count matrix. Default: No" />
+        <section name="out" title="Output Options">
+            <param name="plots" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="True" label="Output PDF with plots?" help="If this option is set to Yes, a PDF containing QC plots will be output. Default: Yes" />
+            <param name="metrics_matrix" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output QC metrics matrix?" help="If this option is set a matrix of QC metrics will be output. Default: No" />
+            <param name="report" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output HTML Report?" help="Only valid if FASTQs are input. If this option is set to Yes, a HTML report containing QC metrics will be output. Default: No" />
+            <param name="rscript" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used to annotate the IDs will be provided as a text file in the output. Default: No" />
+            <param name="rdata" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output RData file?"
+                help="Output all the data used by R to construct the tables and plots, can be loaded into R. Default: No">
+            </param>
+        </section>
         <section name="adv" title="Advanced Options">
-            <param argument="--rmlow" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with N in barcode or UMI" help="Default: Yes" />
-            <param argument="--rmN" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with low quality" help="Default: Yes" />
-            <param argument="--minq" type="integer" min="0" value="20" label="Minimum read quality" help="Default: 20" />
-            <param argument="--numbq" type="integer" min="0" value="2" label="Maximum number of bases below minq" help="Default: 2" />
-            <param argument="--stnd" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Perform strand-specific mapping" help="Default: Yes" />
-            <param argument="--max_mis" type="integer" min="0" value="1" label="Maximum mismatch allowed in barcode" help="Default: 1" />
-            <param argument="--UMI_cor" type="integer" min="0" value="1" label="Correct UMI sequence error" help="0 means no correction, 1 means simple correction and merge UMI with distance 1. Default: 1" />
-            <param argument="--gene_fl" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Remove low abundant genes" help="Low abundant is defined as only one copy of one UMI for this gene. Default: No" />
-            <param argument="--max_reads" type="integer" min="0" value="1000000" label="Maximum reads processed" help="Maximum reads processed if detecting barcodes. Default: 1,000,000" />
-            <param argument="--min_count" type="integer" min="0" value="10" label="Minimum count to keep" help="Minimum count to keep if detecting barcodes. Barcode will be discarded if it has lower count. This should be set according to --max_reads. Default: 10" />
+            <section name="f" title="FASTQ input only">
+                <param argument="--rmlow" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with N in barcode or UMI" help="Default: Yes" />
+                <param argument="--rmN" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with low quality" help="Default: Yes" />
+                <param argument="--minq" type="integer" min="0" value="20" label="Minimum read quality" help="Default: 20" />
+                <param argument="--numbq" type="integer" min="0" value="2" label="Maximum number of bases below minq" help="Default: 2" />
+                <param argument="--max_mis" type="integer" min="0" value="1" label="Maximum mismatch allowed in barcode" help="Default: 1" />
+
+                <param argument="--max_reads" type="integer" min="0" value="1000000" label="Maximum reads processed" help="Maximum reads processed if detecting barcodes. Default: 1,000,000" />
+                <param argument="--min_count" type="integer" min="0" value="10" label="Minimum count to keep" help="Minimum count to keep if detecting barcodes. Barcode will be discarded if it has lower count. This should be set according to --max_reads. Default: 10" />
+            </section>
+                <param argument="--UMI_cor" type="integer" min="0" value="1" label="Correct UMI sequence error" help="0 means no correction, 1 means simple correction and merge UMI with distance 1. Default: 1" />
+                <param argument="--stnd" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Perform strand-specific mapping" help="Default: Yes" />
+                <param argument="--gene_fl" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Remove low abundant genes" help="Low abundant is defined as only one copy of one UMI for this gene. Default: No" />
         </section>
     </inputs>

     <outputs>
         <data name="out_matrix" format="tabular" from_work_dir="gene_count.tsv" label="${tool.name} on ${on_string}: Count Matrix" />
+        <data name="out_plots" format="pdf" from_work_dir="plots.pdf" label="${tool.name} on ${on_string}: Plots">
+            <filter>plots</filter>
+        </data>
+        <data name="out_metrics_matrix" format="tabular" from_work_dir="metrics_matrix.tsv" label="${tool.name} on ${on_string}: QC metrics matrix">
+            <filter>metrics_matrix</filter>
+        </data>
         <data name="out_report" format="html" from_work_dir="report.nb.html" label="${tool.name} on ${on_string}: HTML Report" >
             <filter>report</filter>
         </data>
@@ -185,9 +242,11 @@
     <tests>
         <!-- Ensure outputs work -->
         <test>
+            <param name="format_select" value="fastq" />
             <param name="fasta_source" value="history"/>
             <param name="ref_fa_hist" ftype="fasta" value="mm10_MT19.fa.gz"/>
             <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>
+            <param name="organism" value="mmusculus_gene_ensembl"/>
             <param name="paired_format_selector" value="paired" />
             <param name="in1" ftype="fastqsanger.gz" value="CB51_MT19_R1.gz"/>
             <param name="in2" ftype="fastqsanger.gz" value="CB51_MT19_R2.gz"/>
@@ -208,38 +267,81 @@
         </test>
         <!-- Ensure built-in fasta works -->
         <test>
+            <param name="format_select" value="fastq" />
             <param name="fasta_source" value="cached"/>
             <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>
+            <param name="organism" value="mmusculus_gene_ensembl"/>
             <param name="paired_format_selector" value="paired" />
             <param name="in1" ftype="fastqsanger.gz" dbkey="mm10" value="CB51_MT19_R1.gz"/>
             <param name="in2" ftype="fastqsanger.gz" dbkey="mm10" value="CB51_MT19_R2.gz"/>
             <param name="us" value="-1"/>
             <param name="max_reads" value="5000000"/>
             <param name="min_count" value="100"/>
-            <param name="report" value="False" />
+            <output name="out_matrix" >
+                <assert_contents>
+                    <has_text text="ENSMUSG00000064351" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Ensure BAM input works -->
+        <test>
+            <param name="format_select" value="bam" />
+            <param name="bam" ftype="bam" value="aligned.mapped.bam"/>
+            <param name="barcodes" ftype="tabular" value="barcode_anno.tsv"/>
+            <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>
+            <param name="organism" value="mmusculus_gene_ensembl"/>
+            <param name="us" value="-1"/>
             <output name="out_matrix" >
                 <assert_contents>
                     <has_text text="ENSMUSG00000064351" />
                 </assert_contents>
             </output>
         </test>
+        <!-- Ensure BAM input with QC outputs works -->
+        <test>
+            <param name="format_select" value="bam" />
+            <param name="bam" ftype="bam" value="aligned.mapped.bam"/>
+            <param name="barcodes" ftype="tabular" value="barcode_anno.tsv"/>
+            <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>
+            <param name="organism" value="mmusculus_gene_ensembl"/>
+            <param name="us" value="-1"/>
+            <param name="plots" value="True"/>
+            <param name="metrics_matrix" value="True"/>
+            <output name="out_matrix" >
+                <assert_contents>
+                    <has_text text="ENSMUSG00000064351" />
+                </assert_contents>
+            </output>
+            <output name="out_metrics_matrix" >
+                <assert_contents>
+                    <has_line_matching expression="cell_id.*unaligned.*aligned_unmapped.*mapped_to_exon.*mapped_to_intron.*ambiguous_mapping.*mapped_to_ERCC.*mapped_to_MT.*number_of_genes.*total_count_per_cell.*non_mt_percent\toutliers" />
+                </assert_contents>
+            </output>
+            <output name="out_plots" ftype="pdf" value="plots.pdf" compare="sim_size" />
+        </test>
+
     </tests>
     <help><![CDATA[
 .. class:: infomark

 **What it does**

-scPipe_ is an `R/Bioconductor package`_ that integrates barcode demultiplexing, read alignment, UMI-aware gene-level quantification and quality control of raw sequencing data generated by multiple protocols that include CEL-seq, MARS-seq, Chromium 10X, Drop-seq and Smart-seq. scPipe produces a count matrix that is essential for downstream analysis along with an HTML report that summarises data quality. These results can be used as input for downstream analyses including normalization, visualization and statistical testing.
-Examples of the report output can be found here_.
+scPipe_ is an `R/Bioconductor package`_ that integrates barcode demultiplexing, read alignment, UMI-aware gene-level quantification and quality control of raw sequencing data generated by multiple protocols that include CEL-seq, MARS-seq, Chromium 10X, Drop-seq and Smart-seq. scPipe produces a count matrix that is essential for downstream analysis along with QC metrics and a HTML report that summarises data quality. These results can be used as input for downstream analyses including normalization, visualization and statistical testing.
+The scPipe workflow is described in this vignette_ and examples of the report output can be found here_. Note that outlier cells are detected and removed by default but they can be kept if "Keep outliers?" is selected.

 -----

 **Inputs**

+Either
     * Reference genome in FASTA format
-    * Exon annotation in GFF3 format
     * Paired-end FASTQ.GZ reads
     * Cell barcodes TAB-separated file (Optional)
+OR
+    * BAM file
+    * Cell barcodes TAB-separated file
+AND
+    * Exon annotation in ENSEMBL GFF3 format

 *Read Structure*

@@ -266,12 +368,15 @@

 Optionally you can choose to output

-    * HTML report (default is Yes)
+    * PDF of QC Plots (default is Yes)
+    * QC metrics matrix
+    * HTML report (if FASTQs are input)
     * Rscript
     * RData

 .. _scPipe: http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006361
 .. _R/Bioconductor package: https://bioconductor.org/packages/release/bioc/html/scPipe.html
+.. _vignette: https://bioconductor.org/packages/release/bioc/vignettes/scPipe/inst/doc/scPipe_tutorial.html
 .. _here: http://bioinf.wehi.edu.au/scPipe/

 ]]></help>
Binary file test-data/aligned.mapped.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/barcode_anno.tsv	Mon Jan 14 08:06:47 2019 -0500
@@ -0,0 +1,263 @@
+CELL_0000	CGACCATA	146
+CELL_0001	TACACGTG	193
+CELL_0002	ATGCAGTC	144
+CELL_0003	TAGGCTGT	280
+CELL_0004	CACAAGTC	1087
+CELL_0005	TGACTCAG	266
+CELL_0006	TGACCATG	156
+CELL_0007	GTTGCTGA	150
+CELL_0008	CAGTACCA	999
+CELL_0009	GCTTGATG	206
+CELL_0010	ACGTGTGT	151
+CELL_0011	GCAAGCTT	255
+CELL_0012	GACCTACA	183
+CELL_0013	TACGCTGA	208
+CELL_0014	GGTACACT	202
+CELL_0015	GCGTTAGT	141
+CELL_0016	CAACATGC	201
+CELL_0017	CATTGTGG	941
+CELL_0018	AAGGAGGA	178
+CELL_0019	CGTAGTGT	150
+CELL_0020	GGTTAGCT	234
+CELL_0021	AGGTTCCA	466
+CELL_0022	TACCGATG	434
+CELL_0023	AGGTTCAC	351
+CELL_0024	CAACTGAC	299
+CELL_0025	GCAAGTTC	220
+CELL_0026	GGTGATTC	138
+CELL_0027	CTGGTATG	347
+CELL_0028	TGCCAGAT	282
+CELL_0029	CGTGATGT	122
+CELL_0030	GACCATCA	103
+CELL_0031	TACCGTGA	186
+CELL_0032	TAGTGCGT	105
+CELL_0033	GCATTAGC	219
+CELL_0034	GCGATATC	345
+CELL_0035	CAAGTGTC	215
+CELL_0036	TGCAGACT	536
+CELL_0037	ATCCGCAA	283
+CELL_0038	GGTTATGC	415
+CELL_0039	CGCAACAT	186
+CELL_0040	AGGAAGGA	109
+CELL_0041	AGTCACGT	291
+CELL_0042	GCACGTAT	113
+CELL_0043	ATTGGCGT	101
+CELL_0044	GGACTTAC	316
+CELL_0045	ACTGTGAC	693
+CELL_0046	TACTGTGG	122
+CELL_0047	CATCAAGC	184
+CELL_0048	TGACTACG	155
+CELL_0049	GTAGTGTC	203
+CELL_0050	TATCCGAC	171
+CELL_0051	GTCATGCA	259
+CELL_0052	CACAATCG	118
+CELL_0053	GCAATGTC	140
+CELL_0054	TGTATGGC	290
+CELL_0055	GCTAGTGT	226
+CELL_0056	ATACCACG	141
+CELL_0057	GGTATCGT	162
+CELL_0058	ATGCTGAC	129
+CELL_0059	GCAGTCAT	148
+CELL_0060	CAATCCAG	142
+CELL_0061	TAGGTTGC	454
+CELL_0062	AACCGTAC	510
+CELL_0063	CGTCGATA	454
+CELL_0064	ATCCGTAG	379
+CELL_0065	CCACTAAG	267
+CELL_0066	GTCAGTAC	426
+CELL_0067	CCTAGTAG	138
+CELL_0068	TGCGACTA	201
+CELL_0069	ACCGACAT	164
+CELL_0070	TCGCTGAA	168
+CELL_0071	AGCGTATC	1291
+CELL_0072	TCACGGTA	146
+CELL_0073	TGCAATGC	237
+CELL_0074	GGCCAATT	289
+CELL_0075	TCTCGGAA	336
+CELL_0076	GAGACCTT	291
+CELL_0077	CGCTAGTA	511
+CELL_0078	CCTGTAAG	215
+CELL_0079	GCGTGATT	338
+CELL_0080	CTCAAGAC	182
+CELL_0081	TAGCGATC	250
+CELL_0082	GGACTTGT	711
+CELL_0083	CTCACAGA	259
+CELL_0084	GTAGACCT	495
+CELL_0085	AGTGTGCT	477
+CELL_0086	ACTGTTGG	208
+CELL_0087	ACGCCATA	159
+CELL_0088	GGTCCAAT	281
+CELL_0089	TATCCGGA	159
+CELL_0090	GTCGCTAA	245
+CELL_0091	ACCAAGCT	135
+CELL_0092	TGTCGATG	112
+CELL_0093	GCCAATTG	161
+CELL_0094	ATGCCAGT	1068
+CELL_0095	ACGATGTC	211
+CELL_0096	CATCCAGA	156
+CELL_0097	CAACGCTA	229
+CELL_0098	GAAGGAAG	110
+CELL_0099	TGGCGATT	414
+CELL_0100	AGTCATGC	248
+CELL_0101	AGCGTTAC	123
+CELL_0102	ACTCAAGC	148
+CELL_0103	CGATAGCT	170
+CELL_0104	CTACGCAA	254
+CELL_0105	AAAAAAAA	261
+CELL_0106	TAACCGTG	136
+CELL_0107	TAGCGTCA	385
+CELL_0108	ACCACTGA	163
+CELL_0109	CGCATGTA	109
+CELL_0110	ACCGTGAT	137
+CELL_0111	CTCCAAGA	179
+CELL_0112	CACTACAG	144
+CELL_0113	AGCCTAAC	104
+CELL_0114	CTGTGAGT	112
+CELL_0115	CTGCAACA	320
+CELL_0116	CTCACGAA	1181
+CELL_0117	ATTCGCAG	858
+CELL_0118	CGCAATAC	127
+CELL_0119	TCAGTGCA	151
+CELL_0120	GTGAGTTC	123
+CELL_0121	CCTATGAG	136
+CELL_0122	CAGCTTAG	141
+CELL_0123	TCTCAAGG	151
+CELL_0124	GTGGTCTA	242
+CELL_0125	ATTGTGGC	160
+CELL_0126	ACTACGGT	206
+CELL_0127	TCATGGTG	134
+CELL_0128	AGCAGCTT	164
+CELL_0129	GTACTGTG	341
+CELL_0130	CACGAGTT	103
+CELL_0131	TCAGGTGT	103
+CELL_0132	ACCAACGT	203
+CELL_0133	CGACTACA	226
+CELL_0134	AACGTCAC	375
+CELL_0135	GACATGCT	372
+CELL_0136	TCCATGAG	182
+CELL_0137	GGATGCTT	114
+CELL_0138	GGCACTTA	176
+CELL_0139	AGTGTCTG	205
+CELL_0140	TGCGTAGT	319
+CELL_0141	TAAGCTGC	340
+CELL_0142	GTACATCG	159
+CELL_0143	TGGCAACT	1094
+CELL_0144	GGTGCATT	143
+CELL_0145	TGCATGGT	136
+CELL_0146	GGTCAATC	248
+CELL_0147	CGTGAATC	233
+CELL_0148	CAATCGGT	377
+CELL_0149	ACTACACG	165
+CELL_0150	TGCACTGA	116
+CELL_0151	GCGCAATT	132
+CELL_0152	ACAACGTC	987
+CELL_0153	AGCATGTC	260
+CELL_0154	TCACACGA	106
+CELL_0155	GAATCCAC	282
+CELL_0156	TAGGCCAT	335
+CELL_0157	ACACTCGA	374
+CELL_0158	GAACTTCG	155
+CELL_0159	CACTGCAA	119
+CELL_0160	TACGGATC	159
+CELL_0161	GCCAACTA	349
+CELL_0162	TACCACGA	151
+CELL_0163	GATACTGC	127
+CELL_0164	TCAGGCAT	125
+CELL_0165	TGCGGTTA	136
+CELL_0166	TCAGCGTA	245
+CELL_0167	GCTAAGCT	476
+CELL_0168	GTCCAACA	117
+CELL_0169	CCATCAAG	123
+CELL_0170	CACGGTAT	240
+CELL_0171	GTCATGTG	130
+CELL_0172	CGTAACTG	113
+CELL_0173	GTTACCGA	126
+CELL_0174	ATAGGCTC	1103
+CELL_0175	CCATCGAA	162
+CELL_0176	AGGAGGAA	105
+CELL_0177	ATTGCGTG	317
+CELL_0178	ACGTATCG	142
+CELL_0179	GCCATCAA	153
+CELL_0180	AGCTTGCA	169
+CELL_0181	TCACAGAC	275
+CELL_0182	CGACACAT	148
+CELL_0183	AGCACTTG	717
+CELL_0184	GTGCGTTA	163
+CELL_0185	GAGTTCCA	122
+CELL_0186	ATGGTCCA	171
+CELL_0187	GTTGAGTC	196
+CELL_0188	GCGTGTAT	577
+CELL_0189	TCGAGTTG	176
+CELL_0190	TACGAGTC	169
+CELL_0191	CAATTGGC	738
+CELL_0192	GGGGGGGG	183
+CELL_0193	GCCTTAAG	102
+CELL_0194	GGTCTATG	233
+CELL_0195	ACACACGT	1061
+CELL_0196	TGGTCTGA	150
+CELL_0197	GAGAGAAG	101
+CELL_0198	GCATGTGT	101
+CELL_0199	ACGCACTA	173
+CELL_0200	TCGCACAA	203
+CELL_0201	GATTCGGT	102
+CELL_0202	GCTGCATA	282
+CELL_0203	TATGGCTG	167
+CELL_0204	GTTCCAAG	377
+CELL_0205	CTAGGATC	200
+CELL_0206	TCTTCCTC	181
+CELL_0207	GTGCTCAA	602
+CELL_0208	TGAGTTGC	200
+CELL_0209	AACACCAC	163
+CELL_0210	TGCTAGGT	361
+CELL_0211	TGCACGTA	325
+CELL_0212	ATGTCCAG	687
+CELL_0213	ACCAAGTC	172
+CELL_0214	CACACTAG	247
+CELL_0215	AGTATCCG	126
+CELL_0216	ACCATGTG	155
+CELL_0217	CAAGCCAT	231
+CELL_0218	AGCTCATG	169
+CELL_0219	TCGTTGGA	192
+CELL_0220	GACTGTAC	250
+CELL_0221	ACTCTGAG	157
+CELL_0222	ACATGCCA	399
+CELL_0223	GACTACTG	735
+CELL_0224	ACACGCAT	401
+CELL_0225	CGTAGCAT	199
+CELL_0226	TAGCCGAT	319
+CELL_0227	ATCACGAC	129
+CELL_0228	ATGGTGCT	2031
+CELL_0229	GGTAACTC	155
+CELL_0230	AGCTCTGA	162
+CELL_0231	TCGAGACT	279
+CELL_0232	TGTCGCAA	121
+CELL_0233	CCAATAGC	145
+CELL_0234	CTATCGGA	812
+CELL_0235	CAACTAGC	188
+CELL_0236	TGCGGATT	132
+CELL_0237	CCGTTAAG	123
+CELL_0238	TCGACTAG	455
+CELL_0239	TATTGCGG	325
+CELL_0240	GCGCTTAA	379
+CELL_0241	CCGATGTA	488
+CELL_0242	TGGTCGAT	163
+CELL_0243	CTACGAAC	346
+CELL_0244	CAGTCCAA	105
+CELL_0245	AGCTTACG	221
+CELL_0246	CGACATAC	124
+CELL_0247	ATTGCCGA	1198
+CELL_0248	TCCTCTTC	160
+CELL_0249	AGCTGTTG	335
+CELL_0250	GCATAGTC	319
+CELL_0251	CGATTGTG	386
+CELL_0252	TGGTTGCA	339
+CELL_0253	CGAATCCA	158
+CELL_0254	TGGTGATC	318
+CELL_0255	AGAAGAGG	153
+CELL_0256	GTCACGTA	105
+CELL_0257	GTACTTGG	688
+CELL_0258	GTCCGAAT	265
+CELL_0259	TCTCCTCT	501
+CELL_0260	GATTGGTC	372
+CELL_0261	TACTGGCA	117
+CELL_0262	GTATGGTC	621
Binary file test-data/plots.pdf has changed