Galaxy |

Changeset 0:32e1bfc6b7b2 (2018-08-15)

Next changeset 1:4ec6717872b1 (2018-08-17)

Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scpipe commit 8908da9cdd112ae0943dbf1eccb221e84cd99ca7

added:
scpipe.R
scpipe.xml
test-data/CB51_MT19_R1.gz
test-data/CB51_MT19_R2.gz
test-data/mm10_MT19.fa.gz
test-data/mm10_MT19.gff3.gz
tool-data/all_fasta.loc.sample
tool_data_table_conf.xml.sample
tool_data_table_conf.xml.test

diff -r 000000000000 -r 32e1bfc6b7b2 scpipe.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scpipe.R Wed Aug 15 13:54:40 2018 -0400

[

@@ -0,0 +1,149 @@
+options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+suppressPackageStartupMessages({
+    library(scPipe)
+    library(SingleCellExperiment)
+    library(optparse)
+    library(readr)
+    library(ggplot2)
+    library(plotly)
+    library(DT)
+    library(scater)
+    library(scran)
+    library(scales)
+    library(Rtsne)
+})
+
+option_list <- list(
+    make_option(c("-fasta","--fasta"), type="character", help="Genome fasta file"),
+    make_option(c("-exons","--exons"), type="character", help="Exon annotation gff3 file"),
+    make_option(c("-barcodes","--barcodes"), type="character", help="Cell barcodes csv file"),
+    make_option(c("-read1","--read1"), type="character", help="Read 1 fastq.gz"),
+    make_option(c("-read2","--read2"), type="character", help="Read 2 fastq.gz"),
+    make_option(c("-samplename","--samplename"), type="character", help="Name to use for sample"),
+    make_option(c("-bs1","--bs1"), type="integer", help="Barcode start in Read 1"),
+    make_option(c("-bl1","--bl1"), type="integer", help="Barcode length in Read 1"),
+    make_option(c("-bs2","--bs2"), type="integer", help="Barcode start in Read 2"),
+    make_option(c("-bl2","--bl2"), type="integer", help="Barcode length in Read 2"),
+    make_option(c("-us","--us"), type="integer", help="UMI start in Read 2"),
+    make_option(c("-ul","--ul"), type="integer", help="UMI length in Read 2"),
+    make_option(c("-rmlow","--rmlow"), type="logical", help="Remove reads with N in barcode or UMI"),
+    make_option(c("-rmN","--rmN"), type="logical", help="Remove reads with low quality"),
+    make_option(c("-minq","--minq"), type="integer", help="Minimum read quality"),
+    make_option(c("-numbq","--numbq"), type="integer", help="Maximum number of bases below minq"),
+    make_option(c("-stnd","--stnd"), type="logical", help="Perform strand-specific mapping"),
+    make_option(c("-max_mis","--max_mis"), type="integer", help="Maximum mismatch allowed in barcode"),
+    make_option(c("-UMI_cor","--UMI_cor"), type="integer", help="Correct UMI sequence error"),
+    make_option(c("-gene_fl","--gene_fl"), type="logical", help="Remove low abundant genes"),
+    make_option(c("-max_reads","--max_reads"), type="integer", help="Maximum reads processed"),
+    make_option(c("-min_count","--min_count"), type="integer", help="Minimum count to keep"),
+    make_option(c("-report","--report"), type="logical", help="HTML report of plots"),
+    make_option(c("-rdata","--rdata"), type="logical", help="Output RData file"),
+    make_option(c("-nthreads","--nthreads"), type="integer", help="Number of threads")
+  )
+
+parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
+args = parse_args(parser)
+
+fa_fn = args$fasta
+anno_fn = args$exons
+fq_R1 = args$read1
+fq_R2 = args$read2
+read_structure = list(
+    bs1 = args$bs1,   # barcode start position in fq_R1, -1 indicates no barcode
+    bl1 = args$bl1,    # barcode length in fq_R1, 0 since no barcode present
+    bs2 = args$bs2,    # barcode start position in fq_R2
+    bl2 = args$bl2,   # barcode length in fq_R2
+    us = args$us,    # UMI start position in fq_R2
+    ul = args$ul     # UMI length
+)
+
+if (args$us == -1) {
+  has_umi = FALSE
+} else {
+  has_umi = TRUE
+}
+
+filter_settings=list(rmlow=args$rmlow, rmN=args$rmN, minq=args$minq, numbq=args$numbq)
+
+# Outputs
+out_dir = "."
+fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index"))
+combined_fastq = file.path(out_dir, "combined.fastq")
+aligned_bam = file.path(out_dir, "aligned.bam")
+mapped_bam = file.path(out_dir, "aligned.mapped.bam")
+
+
+print("Trimming barcodes")
+sc_trim_barcode(combined_fastq,
+                fq_R1,
+                fq_R2,
+                read_structure=read_structure,
+                filter_settings=filter_settings)
+
+print("Building genome index")
+Rsubread::buildindex(basename=fasta_index, reference=fa_fn)
+
+print("Aligning reads to genome")
+Rsubread::align(index=fasta_index,
+    readfile1=combined_fastq,
+    output_file=aligned_bam,
+    nthreads=args$nthreads)
+
+if (!is.null(args$barcodes)) {
+  barcode_anno=args$barcodes
+} else {
+  print("Detecting barcodes")
+  # detect 10X barcodes and generate sample_index.csv file
+  barcode_anno = "sample_index.csv"
+  sc_detect_bc(
+      infq=combined_fastq,
+      outcsv=barcode_anno,
+      bc_len=read_structure$bl2,
+      max_reads=args$max_reads,
+      min_count=args$min_count,
+      max_mismatch=args$max_mis
+  )
+}
+
+print("Assigning reads to exons")
+sc_exon_mapping(aligned_bam, mapped_bam, anno_fn, bc_len=read_structure$bl2, UMI_len=read_structure$ul, stnd=args$stnd)
+
+print("De-multiplexing data")
+sc_demultiplex(mapped_bam, out_dir, barcode_anno, has_UMI=has_umi)
+
+print("Counting genes")
+sc_gene_counting(out_dir, barcode_anno, UMI_cor=args$UMI_cor, gene_fl=args$gene_fl)
+
+print("Creating SingleCellExperiment object")
+sce <- create_sce_by_dir(out_dir)
+
+if (!is.null(args$report)) {
+print("Creating report")
+create_report(sample_name=args$samplename,
+   outdir=out_dir,
+   r1=fq_R1,
+   r2=fq_R2,
+   outfq=combined_fastq,
+   read_structure=read_structure,
+   filter_settings=filter_settings,
+   align_bam=aligned_bam,
+   genome_index=fasta_index,
+   map_bam=mapped_bam,
+   exon_anno=anno_fn,
+   stnd=args$stnd,
+   fix_chr=FALSE,
+   barcode_anno=barcode_anno,
+   max_mis=args$max_mis,
+   UMI_cor=args$UMI_cor,
+   gene_fl=args$gene_fl)
+}
+
+if (!is.null(args$rdata) ) {
+    save(sce, file = file.path(out_dir,"scPipe_analysis.RData"))
+}
+
+sessionInfo()

diff -r 000000000000 -r 32e1bfc6b7b2 scpipe.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scpipe.xml Wed Aug 15 13:54:40 2018 -0400

[

b'@@ -0,0 +1,264 @@\n+<tool id="scpipe" name="scPipe" version="1.0.0">\n+ <description>- preprocessing pipeline for single cell RNA-seq</description>\n+ <requirements>\n+ <requirement type="package" version="1.0.0">bioconductor-scpipe</requirement>\n+ <requirement type="package" version="1.28.1">bioconductor-rsubread</requirement>\n+ <requirement type="package" version="1.20">r-knitr</requirement>\n+ <requirement type="package" version="1.10">r-rmarkdown</requirement>\n+ <requirement type="package" version="1.1.1">r-readr</requirement>\n+ <requirement type="package" version="4.7.1">r-plotly</requirement>\n+ <requirement type="package" version="0.4">r-dt</requirement>\n+ <requirement type="package" version="1.6.0">bioconductor-scater</requirement>\n+ <requirement type="package" version="1.6.2">bioconductor-scran</requirement>\n+ <requirement type="package" version="0.13">r-rtsne</requirement>\n+ \n+ <requirement type="package" version="2.2.1">r-ggplot2</requirement> \n+ <requirement type="package" version="1.6.0">r-optparse</requirement>\n+ </requirements>\n+ <version_command><![CDATA[\n+echo $(R --version | grep version | grep -v GNU)", scPipe version" $(R --vanilla --slave -e "library(scPipe); cat(sessionInfo()\\$otherPkgs\\$scPipe\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", Rsubread version" $(R --vanilla --slave -e "library(Rsubread); cat(sessionInfo()\\$otherPkgs\\$Rsubread\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", knitr version" $(R --vanilla --slave -e "library(knitr); cat(sessionInfo()\\$otherPkgs\\$knitr\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rmarkdown version" $(R --vanilla --slave -e "library(rmarkdown); cat(sessionInfo()\\$otherPkgs\\$rmarkdown\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", readr version" $(R --vanilla --slave -e "library(readr); cat(sessionInfo()\\$otherPkgs\\$readr\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", plotly version" $(R --vanilla --slave -e "library(plotly); cat(sessionInfo()\\$otherPkgs\\$plotly\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", DT version" $(R --vanilla --slave -e "library(DT); cat(sessionInfo()\\$otherPkgs\\$DT\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", scater version" $(R --vanilla --slave -e "library(scater); cat(sessionInfo()\\$otherPkgs\\$scater\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", scran version" $(R --vanilla --slave -e "library(scran); cat(sessionInfo()\\$otherPkgs\\$scran\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rtsne version" $(R --vanilla --slave -e "library(Rtsne); cat(sessionInfo()\\$otherPkgs\\$Rtsne\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", ggplot2 version" $(R --vanilla --slave -e "library(ggplot2); cat(sessionInfo()\\$otherPkgs\\$ggplot2\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", optparse version" $(R --vanilla --slave -e "library(optparse); cat(sessionInfo()\\$otherPkgs\\$optparse\\$Version)" 2> /dev/null | grep -v -i "WARNING: ")\n+ ]]></version_command>\n+ <command detect_errors="exit_code"><![CDATA[\n+#import re\n+\n+## Link input files\n+\n+#if $ref_fasta.fasta_source == "history":\n+ #set $fasta_name = re.sub(\'[^\\w\\-\\s]\', \'_\', str($ref_fasta.ref_fa_hist.element_identifier))\n+ ln -s \'$ref_fasta.ref_fa_hist\' \'$fasta_name\' &&\n+#else:\n+ #set $fasta_name = re.sub(\'[^\\w\\-\\s]\', \'_\', str($ref_fasta.ref_fa_builtin.element_identifier))\n+ ln -s \'$ref_fasta.ref_fa_builtin.fields.path\' \'$fasta_name\' &&\n+#end if\n+\n+#set $anno_name = re.sub(\'[^\\w\\-\\s]\', \'_\', str($exons.element_identifier))\n+#set $anno_name = $anno_name + ".gff3"\n+ln -s \'${exons}\' \'$anno_name\' &&\n+\n+#if $paired_format.paired_format_selector == \'paired_collection\':\n+ #set $in1 = $paired_format.paired_input.forward\n+ #set $in2 = $paired_format.paired_input.reverse\n+ #set $in1_name = re.sub(\'[^'..b'_work_dir="gene_count.tsv" label="${tool.name} on ${on_string}: Count Matrix" />\n+ <data name="out_report" format="html" from_work_dir="report.nb.html" label="${tool.name} on ${on_string}: HTML Report" >\n+ <filter>report</filter>\n+ </data>\n+ <data name="out_rscript" format="txt" from_work_dir="out_rscript.txt" label="${tool.name} on ${on_string}: Rscript">\n+ <filter>rscript</filter>\n+ </data>\n+ <data name="out_rdata" format="rdata" from_work_dir="scPipe_analysis.RData" label="${tool.name} on ${on_string}: RData file">\n+ <filter>rdata</filter>\n+ </data>\n+ </outputs>\n+\n+ <tests>\n+ \n+ <test>\n+ <param name="fasta_source" value="history"/>\n+ <param name="ref_fa_hist" ftype="fasta" value="mm10_MT19.fa.gz"/>\n+ <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>\n+ <param name="paired_format_selector" value="paired" />\n+ <param name="in1" ftype="fastqsanger.gz" value="CB51_MT19_R1.gz"/>\n+ <param name="in2" ftype="fastqsanger.gz" value="CB51_MT19_R2.gz"/>\n+ <param name="us" value="-1"/>\n+ <param name="max_reads" value="5000000"/>\n+ <param name="min_count" value="100"/>\n+ <param name="report" value="True" />\n+ <output name="out_matrix" >\n+ <assert_contents>\n+ <has_text text="ENSMUSG00000024940" />\n+ </assert_contents>\n+ </output>\n+ <output name="out_report" >\n+ <assert_contents>\n+ <has_text text="scPipe report for sample" />\n+ </assert_contents>\n+ </output>\n+ </test>\n+ </tests>\n+ <help><![CDATA[\n+.. class:: infomark\n+\n+**What it does**\n+\n+scPipe_ is an `R/Bioconductor package`_ that integrates barcode demultiplexing, read alignment, UMI-aware gene-level quantification and quality control of raw sequencing data generated by multiple protocols that include CEL-seq, MARS-seq, Chromium 10X, Drop-seq and Smart-seq. scPipe produces a count matrix that is essential for downstream analysis along with an HTML report that summarises data quality. These results can be used as input for downstream analyses including normalization, visualization and statistical testing. \n+Examples of the report output can be found here_.\n+\n+-----\n+\n+**Inputs**\n+\n+ * Reference genome in FASTA format\n+ * Exon annotation in GFF3 format\n+ * Paired-end FASTQ.GZ reads\n+ * Cell barcodes TAB-separated file (Optional)\n+\n+*Read Structure*\n+\n+The default read structure represents CEL-seq\n+paired-ended reads, with one cell barcode in Read 2 Start from\n+6bp and UMI sequence in Read 2 Start from the first bp. So the\n+read structure will be : `bs1=-1, bl1=0, bs2=6, bl2=8, us=0,\n+ul=6`. `bs1=-1, bl1=0` means we don\'t have index in Read 1 so we\n+set a negative value to start position and zero to the length.\n+`bs2=6, bl2=8` means we have index in Read 2 which starts at 6bp\n+with 8bp length. `us=0, ul=6` means we have UMI from the\n+start of Read 2 and the length is 6bp. NOTE: the zero\n+based index is used so the index of the sequence starts from zero. For a\n+typical Drop-seq experiment the setting will be `bs1=-1,\n+bl1=0, bs2=0, bl2=12, us=12, ul=8`, which means Read 1 only\n+contains transcript and the first 12bp in Read 2 are index,\n+followed by 8bp UMIs.\n+\n+-----\n+\n+**Outputs**\n+\n+ * Count matrix of genes in Tabular format\n+\n+Optionally you can choose to output\n+\n+ * HTML report (default is Yes)\n+ * Rscript\n+ * RData\n+\n+.. _scPipe: http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006361\n+.. _R/Bioconductor package: https://bioconductor.org/packages/release/bioc/html/scPipe.html\n+.. _here: http://bioinf.wehi.edu.au/scPipe/\n+\n+]]></help>\n+ <citations>\n+ <citation type="doi">10.1371/journal.pcbi.1006361</citation>\n+ </citations>\n+</tool>\n'

diff -r 000000000000 -r 32e1bfc6b7b2 test-data/CB51_MT19_R1.gz

Binary file test-data/CB51_MT19_R1.gz has changed

diff -r 000000000000 -r 32e1bfc6b7b2 test-data/CB51_MT19_R2.gz

Binary file test-data/CB51_MT19_R2.gz has changed

diff -r 000000000000 -r 32e1bfc6b7b2 test-data/mm10_MT19.fa.gz

Binary file test-data/mm10_MT19.fa.gz has changed

diff -r 000000000000 -r 32e1bfc6b7b2 test-data/mm10_MT19.gff3.gz

Binary file test-data/mm10_MT19.gff3.gz has changed

diff -r 000000000000 -r 32e1bfc6b7b2 tool-data/all_fasta.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/all_fasta.loc.sample Wed Aug 15 13:54:40 2018 -0400

@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the genome and transcriptome fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id> <dbkey> <display_name> <file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
+#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
+#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
+#hg19full.87 hg19 Human (Homo sapiens): hg19 Full Trans v87 /path/to/genome/hg19/hg19full.cdna.all.fa
+
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#

diff -r 000000000000 -r 32e1bfc6b7b2 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Wed Aug 15 13:54:40 2018 -0400

@@ -0,0 +1,7 @@
+<tables>
+    
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>

diff -r 000000000000 -r 32e1bfc6b7b2 tool_data_table_conf.xml.test
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test Wed Aug 15 13:54:40 2018 -0400

@@ -0,0 +1,8 @@
+<tables>
+    
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>
+