# HG changeset patch # User iuc # Date 1547471207 18000 # Node ID 5c4bca9dd4a24df431aa4de5440c0081d18ddf26 # Parent 4ec6717872b1b415a6223f51a6f4d42d91409d57 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scpipe commit 60e2a9e9129a22924c55b11b218b39d913c7e686 diff -r 4ec6717872b1 -r 5c4bca9dd4a2 scpipe.R --- a/scpipe.R Fri Aug 17 08:22:49 2018 -0400 +++ b/scpipe.R Mon Jan 14 08:06:47 2019 -0500 @@ -18,8 +18,10 @@ }) option_list <- list( + make_option(c("-bam","--bam"), type="character", help="BAM file"), make_option(c("-fasta","--fasta"), type="character", help="Genome fasta file"), make_option(c("-exons","--exons"), type="character", help="Exon annotation gff3 file"), + make_option(c("-organism","--organism"), type="character", help="Organism e.g. hsapiens_gene_ensembl"), make_option(c("-barcodes","--barcodes"), type="character", help="Cell barcodes csv file"), make_option(c("-read1","--read1"), type="character", help="Read 1 fastq.gz"), make_option(c("-read2","--read2"), type="character", help="Read 2 fastq.gz"), @@ -40,6 +42,8 @@ make_option(c("-gene_fl","--gene_fl"), type="logical", help="Remove low abundant genes"), make_option(c("-max_reads","--max_reads"), type="integer", help="Maximum reads processed"), make_option(c("-min_count","--min_count"), type="integer", help="Minimum count to keep"), + make_option(c("-metrics_matrix","--metrics_matrix"), type="logical", help="QC metrics matrix"), + make_option(c("-keep_outliers","--keep_outliers"), type="logical", help="Keep outlier cells"), make_option(c("-report","--report"), type="logical", help="HTML report of plots"), make_option(c("-rdata","--rdata"), type="logical", help="Output RData file"), make_option(c("-nthreads","--nthreads"), type="integer", help="Number of threads") @@ -48,6 +52,7 @@ parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) args = parse_args(parser) +bam = args$bam fa_fn = args$fasta anno_fn = args$exons fq_R1 = args$read1 @@ -71,42 +76,48 @@ # Outputs out_dir = "." -fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index")) -combined_fastq = file.path(out_dir, "combined.fastq") -aligned_bam = file.path(out_dir, "aligned.bam") mapped_bam = file.path(out_dir, "aligned.mapped.bam") +# if input is fastqs +if (!is.null(fa_fn)) { + fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index")) + combined_fastq = file.path(out_dir, "combined.fastq") + aligned_bam = file.path(out_dir, "aligned.bam") -print("Trimming barcodes") -sc_trim_barcode(combined_fastq, - fq_R1, - fq_R2, - read_structure=read_structure, - filter_settings=filter_settings) + print("Trimming barcodes") + sc_trim_barcode(combined_fastq, + fq_R1, + fq_R2, + read_structure=read_structure, + filter_settings=filter_settings) -print("Building genome index") -Rsubread::buildindex(basename=fasta_index, reference=fa_fn) + print("Building genome index") + Rsubread::buildindex(basename=fasta_index, reference=fa_fn) -print("Aligning reads to genome") -Rsubread::align(index=fasta_index, - readfile1=combined_fastq, - output_file=aligned_bam, - nthreads=args$nthreads) + print("Aligning reads to genome") + Rsubread::align(index=fasta_index, + readfile1=combined_fastq, + output_file=aligned_bam, + nthreads=args$nthreads) -if (!is.null(args$barcodes)) { - barcode_anno=args$barcodes + if (!is.null(args$barcodes)) { + barcode_anno=args$barcodes + } else { + print("Detecting barcodes") + # detect 10X barcodes and generate sample_index.csv file + barcode_anno = "sample_index.csv" + sc_detect_bc( + infq=combined_fastq, + outcsv=barcode_anno, + bc_len=read_structure$bl2, + max_reads=args$max_reads, + min_count=args$min_count, + max_mismatch=args$max_mis + ) + } } else { - print("Detecting barcodes") - # detect 10X barcodes and generate sample_index.csv file - barcode_anno = "sample_index.csv" - sc_detect_bc( - infq=combined_fastq, - outcsv=barcode_anno, - bc_len=read_structure$bl2, - max_reads=args$max_reads, - min_count=args$min_count, - max_mismatch=args$max_mis - ) + aligned_bam = file.path(out_dir, bam) + barcode_anno=args$barcodes } print("Assigning reads to exons") @@ -118,32 +129,59 @@ print("Counting genes") sc_gene_counting(out_dir, barcode_anno, UMI_cor=args$UMI_cor, gene_fl=args$gene_fl) -print("Creating SingleCellExperiment object") +print("Performing QC") sce <- create_sce_by_dir(out_dir) +pdf("plots.pdf") +plot_demultiplex(sce) +if (has_umi) { + p = plot_UMI_dup(sce) + print(p) +} +sce = calculate_QC_metrics(sce) +sce = detect_outlier(sce) +p = plot_mapping(sce, percentage=TRUE, dataname=args$samplename) +print(p) +p = plot_QC_pairs(sce) +print(p) +dev.off() + +print("Removing outliers") +if (is.null(args$keep_outliers)) { + sce = remove_outliers(sce) + gene_counts <- counts(sce) + write.table(data.frame("gene_id"=rownames(gene_counts), gene_counts), file="gene_count.tsv", sep="\t", quote=FALSE, row.names=FALSE) +} -if (!is.null(args$report)) { -print("Creating report") -create_report(sample_name=args$samplename, - outdir=out_dir, - r1=fq_R1, - r2=fq_R2, - outfq=combined_fastq, - read_structure=read_structure, - filter_settings=filter_settings, - align_bam=aligned_bam, - genome_index=fasta_index, - map_bam=mapped_bam, - exon_anno=anno_fn, - stnd=args$stnd, - fix_chr=FALSE, - barcode_anno=barcode_anno, - max_mis=args$max_mis, - UMI_cor=args$UMI_cor, - gene_fl=args$gene_fl) +if (!is.null(args$metrics_matrix)) { + metrics <- colData(sce, internal=TRUE) + write.table(data.frame("cell_id"=rownames(metrics), metrics), file="metrics_matrix.tsv", sep="\t", quote=FALSE, row.names=FALSE) +} + +if (!is.null(args$report) & (!is.null(fa_fn))) { + print("Creating report") + create_report(sample_name=args$samplename, + outdir=out_dir, + r1=fq_R1, + r2=fq_R2, + outfq=combined_fastq, + read_structure=read_structure, + filter_settings=filter_settings, + align_bam=aligned_bam, + genome_index=fasta_index, + map_bam=mapped_bam, + exon_anno=anno_fn, + stnd=args$stnd, + fix_chr=FALSE, + barcode_anno=barcode_anno, + max_mis=args$max_mis, + UMI_cor=args$UMI_cor, + gene_fl=args$gene_fl, + organism=args$organism, + gene_id_type="ensembl_gene_id") } if (!is.null(args$rdata) ) { - save(sce, file = file.path(out_dir,"scPipe_analysis.RData")) + save(sce, file = file.path(out_dir,"scPipe_analysis.RData")) } sessionInfo() diff -r 4ec6717872b1 -r 5c4bca9dd4a2 scpipe.xml --- a/scpipe.xml Fri Aug 17 08:22:49 2018 -0400 +++ b/scpipe.xml Mon Jan 14 08:06:47 2019 -0500 @@ -1,8 +1,10 @@ - + - preprocessing pipeline for single cell RNA-seq bioconductor-scpipe bioconductor-rsubread + + bioconductor-rhtslib r-knitr r-rmarkdown r-readr @@ -11,9 +13,9 @@ bioconductor-scater bioconductor-scran r-rtsne - - r-ggplot2 + r-ggplot2 r-optparse gene_count.tsv +#if $keep_outliers: + --keep_outliers '$keep_outliers' + && sed -e "s/,/\${TAB}/g" gene_count.csv > gene_count.tsv +#end if ]]> - - - - + + + + - - - - - - - + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - + + + + ^[\(\w\)]+$ + - - - - + +
+ + + + + + +
- - - - - - - - - - +
+ + + + + + + + +
+ + +
+ + plots + + + metrics_matrix + report @@ -185,9 +242,11 @@ + + @@ -208,38 +267,81 @@ + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 4ec6717872b1 -r 5c4bca9dd4a2 test-data/aligned.mapped.bam Binary file test-data/aligned.mapped.bam has changed diff -r 4ec6717872b1 -r 5c4bca9dd4a2 test-data/barcode_anno.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/barcode_anno.tsv Mon Jan 14 08:06:47 2019 -0500 @@ -0,0 +1,263 @@ +CELL_0000 CGACCATA 146 +CELL_0001 TACACGTG 193 +CELL_0002 ATGCAGTC 144 +CELL_0003 TAGGCTGT 280 +CELL_0004 CACAAGTC 1087 +CELL_0005 TGACTCAG 266 +CELL_0006 TGACCATG 156 +CELL_0007 GTTGCTGA 150 +CELL_0008 CAGTACCA 999 +CELL_0009 GCTTGATG 206 +CELL_0010 ACGTGTGT 151 +CELL_0011 GCAAGCTT 255 +CELL_0012 GACCTACA 183 +CELL_0013 TACGCTGA 208 +CELL_0014 GGTACACT 202 +CELL_0015 GCGTTAGT 141 +CELL_0016 CAACATGC 201 +CELL_0017 CATTGTGG 941 +CELL_0018 AAGGAGGA 178 +CELL_0019 CGTAGTGT 150 +CELL_0020 GGTTAGCT 234 +CELL_0021 AGGTTCCA 466 +CELL_0022 TACCGATG 434 +CELL_0023 AGGTTCAC 351 +CELL_0024 CAACTGAC 299 +CELL_0025 GCAAGTTC 220 +CELL_0026 GGTGATTC 138 +CELL_0027 CTGGTATG 347 +CELL_0028 TGCCAGAT 282 +CELL_0029 CGTGATGT 122 +CELL_0030 GACCATCA 103 +CELL_0031 TACCGTGA 186 +CELL_0032 TAGTGCGT 105 +CELL_0033 GCATTAGC 219 +CELL_0034 GCGATATC 345 +CELL_0035 CAAGTGTC 215 +CELL_0036 TGCAGACT 536 +CELL_0037 ATCCGCAA 283 +CELL_0038 GGTTATGC 415 +CELL_0039 CGCAACAT 186 +CELL_0040 AGGAAGGA 109 +CELL_0041 AGTCACGT 291 +CELL_0042 GCACGTAT 113 +CELL_0043 ATTGGCGT 101 +CELL_0044 GGACTTAC 316 +CELL_0045 ACTGTGAC 693 +CELL_0046 TACTGTGG 122 +CELL_0047 CATCAAGC 184 +CELL_0048 TGACTACG 155 +CELL_0049 GTAGTGTC 203 +CELL_0050 TATCCGAC 171 +CELL_0051 GTCATGCA 259 +CELL_0052 CACAATCG 118 +CELL_0053 GCAATGTC 140 +CELL_0054 TGTATGGC 290 +CELL_0055 GCTAGTGT 226 +CELL_0056 ATACCACG 141 +CELL_0057 GGTATCGT 162 +CELL_0058 ATGCTGAC 129 +CELL_0059 GCAGTCAT 148 +CELL_0060 CAATCCAG 142 +CELL_0061 TAGGTTGC 454 +CELL_0062 AACCGTAC 510 +CELL_0063 CGTCGATA 454 +CELL_0064 ATCCGTAG 379 +CELL_0065 CCACTAAG 267 +CELL_0066 GTCAGTAC 426 +CELL_0067 CCTAGTAG 138 +CELL_0068 TGCGACTA 201 +CELL_0069 ACCGACAT 164 +CELL_0070 TCGCTGAA 168 +CELL_0071 AGCGTATC 1291 +CELL_0072 TCACGGTA 146 +CELL_0073 TGCAATGC 237 +CELL_0074 GGCCAATT 289 +CELL_0075 TCTCGGAA 336 +CELL_0076 GAGACCTT 291 +CELL_0077 CGCTAGTA 511 +CELL_0078 CCTGTAAG 215 +CELL_0079 GCGTGATT 338 +CELL_0080 CTCAAGAC 182 +CELL_0081 TAGCGATC 250 +CELL_0082 GGACTTGT 711 +CELL_0083 CTCACAGA 259 +CELL_0084 GTAGACCT 495 +CELL_0085 AGTGTGCT 477 +CELL_0086 ACTGTTGG 208 +CELL_0087 ACGCCATA 159 +CELL_0088 GGTCCAAT 281 +CELL_0089 TATCCGGA 159 +CELL_0090 GTCGCTAA 245 +CELL_0091 ACCAAGCT 135 +CELL_0092 TGTCGATG 112 +CELL_0093 GCCAATTG 161 +CELL_0094 ATGCCAGT 1068 +CELL_0095 ACGATGTC 211 +CELL_0096 CATCCAGA 156 +CELL_0097 CAACGCTA 229 +CELL_0098 GAAGGAAG 110 +CELL_0099 TGGCGATT 414 +CELL_0100 AGTCATGC 248 +CELL_0101 AGCGTTAC 123 +CELL_0102 ACTCAAGC 148 +CELL_0103 CGATAGCT 170 +CELL_0104 CTACGCAA 254 +CELL_0105 AAAAAAAA 261 +CELL_0106 TAACCGTG 136 +CELL_0107 TAGCGTCA 385 +CELL_0108 ACCACTGA 163 +CELL_0109 CGCATGTA 109 +CELL_0110 ACCGTGAT 137 +CELL_0111 CTCCAAGA 179 +CELL_0112 CACTACAG 144 +CELL_0113 AGCCTAAC 104 +CELL_0114 CTGTGAGT 112 +CELL_0115 CTGCAACA 320 +CELL_0116 CTCACGAA 1181 +CELL_0117 ATTCGCAG 858 +CELL_0118 CGCAATAC 127 +CELL_0119 TCAGTGCA 151 +CELL_0120 GTGAGTTC 123 +CELL_0121 CCTATGAG 136 +CELL_0122 CAGCTTAG 141 +CELL_0123 TCTCAAGG 151 +CELL_0124 GTGGTCTA 242 +CELL_0125 ATTGTGGC 160 +CELL_0126 ACTACGGT 206 +CELL_0127 TCATGGTG 134 +CELL_0128 AGCAGCTT 164 +CELL_0129 GTACTGTG 341 +CELL_0130 CACGAGTT 103 +CELL_0131 TCAGGTGT 103 +CELL_0132 ACCAACGT 203 +CELL_0133 CGACTACA 226 +CELL_0134 AACGTCAC 375 +CELL_0135 GACATGCT 372 +CELL_0136 TCCATGAG 182 +CELL_0137 GGATGCTT 114 +CELL_0138 GGCACTTA 176 +CELL_0139 AGTGTCTG 205 +CELL_0140 TGCGTAGT 319 +CELL_0141 TAAGCTGC 340 +CELL_0142 GTACATCG 159 +CELL_0143 TGGCAACT 1094 +CELL_0144 GGTGCATT 143 +CELL_0145 TGCATGGT 136 +CELL_0146 GGTCAATC 248 +CELL_0147 CGTGAATC 233 +CELL_0148 CAATCGGT 377 +CELL_0149 ACTACACG 165 +CELL_0150 TGCACTGA 116 +CELL_0151 GCGCAATT 132 +CELL_0152 ACAACGTC 987 +CELL_0153 AGCATGTC 260 +CELL_0154 TCACACGA 106 +CELL_0155 GAATCCAC 282 +CELL_0156 TAGGCCAT 335 +CELL_0157 ACACTCGA 374 +CELL_0158 GAACTTCG 155 +CELL_0159 CACTGCAA 119 +CELL_0160 TACGGATC 159 +CELL_0161 GCCAACTA 349 +CELL_0162 TACCACGA 151 +CELL_0163 GATACTGC 127 +CELL_0164 TCAGGCAT 125 +CELL_0165 TGCGGTTA 136 +CELL_0166 TCAGCGTA 245 +CELL_0167 GCTAAGCT 476 +CELL_0168 GTCCAACA 117 +CELL_0169 CCATCAAG 123 +CELL_0170 CACGGTAT 240 +CELL_0171 GTCATGTG 130 +CELL_0172 CGTAACTG 113 +CELL_0173 GTTACCGA 126 +CELL_0174 ATAGGCTC 1103 +CELL_0175 CCATCGAA 162 +CELL_0176 AGGAGGAA 105 +CELL_0177 ATTGCGTG 317 +CELL_0178 ACGTATCG 142 +CELL_0179 GCCATCAA 153 +CELL_0180 AGCTTGCA 169 +CELL_0181 TCACAGAC 275 +CELL_0182 CGACACAT 148 +CELL_0183 AGCACTTG 717 +CELL_0184 GTGCGTTA 163 +CELL_0185 GAGTTCCA 122 +CELL_0186 ATGGTCCA 171 +CELL_0187 GTTGAGTC 196 +CELL_0188 GCGTGTAT 577 +CELL_0189 TCGAGTTG 176 +CELL_0190 TACGAGTC 169 +CELL_0191 CAATTGGC 738 +CELL_0192 GGGGGGGG 183 +CELL_0193 GCCTTAAG 102 +CELL_0194 GGTCTATG 233 +CELL_0195 ACACACGT 1061 +CELL_0196 TGGTCTGA 150 +CELL_0197 GAGAGAAG 101 +CELL_0198 GCATGTGT 101 +CELL_0199 ACGCACTA 173 +CELL_0200 TCGCACAA 203 +CELL_0201 GATTCGGT 102 +CELL_0202 GCTGCATA 282 +CELL_0203 TATGGCTG 167 +CELL_0204 GTTCCAAG 377 +CELL_0205 CTAGGATC 200 +CELL_0206 TCTTCCTC 181 +CELL_0207 GTGCTCAA 602 +CELL_0208 TGAGTTGC 200 +CELL_0209 AACACCAC 163 +CELL_0210 TGCTAGGT 361 +CELL_0211 TGCACGTA 325 +CELL_0212 ATGTCCAG 687 +CELL_0213 ACCAAGTC 172 +CELL_0214 CACACTAG 247 +CELL_0215 AGTATCCG 126 +CELL_0216 ACCATGTG 155 +CELL_0217 CAAGCCAT 231 +CELL_0218 AGCTCATG 169 +CELL_0219 TCGTTGGA 192 +CELL_0220 GACTGTAC 250 +CELL_0221 ACTCTGAG 157 +CELL_0222 ACATGCCA 399 +CELL_0223 GACTACTG 735 +CELL_0224 ACACGCAT 401 +CELL_0225 CGTAGCAT 199 +CELL_0226 TAGCCGAT 319 +CELL_0227 ATCACGAC 129 +CELL_0228 ATGGTGCT 2031 +CELL_0229 GGTAACTC 155 +CELL_0230 AGCTCTGA 162 +CELL_0231 TCGAGACT 279 +CELL_0232 TGTCGCAA 121 +CELL_0233 CCAATAGC 145 +CELL_0234 CTATCGGA 812 +CELL_0235 CAACTAGC 188 +CELL_0236 TGCGGATT 132 +CELL_0237 CCGTTAAG 123 +CELL_0238 TCGACTAG 455 +CELL_0239 TATTGCGG 325 +CELL_0240 GCGCTTAA 379 +CELL_0241 CCGATGTA 488 +CELL_0242 TGGTCGAT 163 +CELL_0243 CTACGAAC 346 +CELL_0244 CAGTCCAA 105 +CELL_0245 AGCTTACG 221 +CELL_0246 CGACATAC 124 +CELL_0247 ATTGCCGA 1198 +CELL_0248 TCCTCTTC 160 +CELL_0249 AGCTGTTG 335 +CELL_0250 GCATAGTC 319 +CELL_0251 CGATTGTG 386 +CELL_0252 TGGTTGCA 339 +CELL_0253 CGAATCCA 158 +CELL_0254 TGGTGATC 318 +CELL_0255 AGAAGAGG 153 +CELL_0256 GTCACGTA 105 +CELL_0257 GTACTTGG 688 +CELL_0258 GTCCGAAT 265 +CELL_0259 TCTCCTCT 501 +CELL_0260 GATTGGTC 372 +CELL_0261 TACTGGCA 117 +CELL_0262 GTATGGTC 621 diff -r 4ec6717872b1 -r 5c4bca9dd4a2 test-data/plots.pdf Binary file test-data/plots.pdf has changed