Mercurial > repos > ebi-gxa > droplet_barcode_plot
diff dropletBarcodePlot.R @ 0:04f32429dcf2 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/droplet-rank-plot/.shed.yml commit a785b79f2b5689aba87c0f7072897bb23f6bda76
author | ebi-gxa |
---|---|
date | Fri, 08 Nov 2019 09:08:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dropletBarcodePlot.R Fri Nov 08 09:08:14 2019 -0500 @@ -0,0 +1,170 @@ +#!/usr/bin/env Rscript + +# This script parses the GTF file to create a feature-wise annotation file with +# mitochondrial features flagged, to assist in annotation and QC of single-cell +# expression data analysis. + +suppressPackageStartupMessages(require(optparse)) +suppressPackageStartupMessages(require(ggplot2)) +suppressPackageStartupMessages(require(gridExtra)) +suppressPackageStartupMessages(require(DropletUtils)) +suppressPackageStartupMessages(require(Matrix)) + +die <- function(message){ + write(message, stderr()) + q(status = 1) +} + +option_list = list( + make_option( + c("-b", "--barcode-frequencies"), + action = "store", + default = NA, + type = 'character', + help = "Path to a two-column tab-delimited file, with barcodes in the first column and frequencies in the second (ignored if --mtx-matrix supplied)" + ), + make_option( + c("-m", "--mtx-matrix"), + action = "store", + default = NA, + type = 'character', + help = 'Matrix-market format matrix file, with cells by column (overrides --barcode-frequencies if supplied)' + ), + make_option( + c("-r", "--cells-by-row"), + action = "store_true", + default = FALSE, + type = 'logical', + help = 'For use with --mtx-matrix: force interpretation of matrix to assume cells are by row, rather than by column (default)' + ), + make_option( + c("-l", "--label"), + action = "store", + default = '', + type = 'character', + help = 'Label to use in plot' + ), + make_option( + c("-d", "--density-bins"), + action = "store", + default = 50, + type = 'numeric', + help = "Number of bins used to calculate density plot" + ), + make_option( + c("-y", "--roryk-multiplier"), + action = "store", + default = 1.5, + type = 'numeric', + help = "Above-baseline multiplier to calculate roryk threshold" + ), + make_option( + c("-o", "--output-plot"), + action = "store", + default = 'barcode_plot.png', + type = 'character', + help = "File path for output plot" + ), + make_option( + c("-t", "--output-thresholds"), + action = "store", + default = 'barcode_thresholds.txt', + type = 'character', + help = "File path for output file containing calculted thresholds" + ) +) + +opt <- parse_args(OptionParser(option_list = option_list), convert_hyphens_to_underscores = TRUE) + +# Process inputs dependent on what has been provided + +if (is.na(opt$mtx_matrix)){ + if (is.na(opt$barcode_frequencies)){ + die('ERROR: must supply --mtx-matrix or --barcode-frequencies') + }else if (! file.exists(opt$barcode_frequencies)){ + die(paste('ERROR: barcode frequencies file', opt$barcode_frequencies, 'does not exist')) + }else{ + barcode_counts <- read.delim(opt$barcode_frequencies, header = FALSE) + } +}else if (! file.exists(opt$mtx_matrix)){ + die(paste('ERROR: MTX matrix file', opt$mtx_matrix, 'does not exist')) +}else{ + result_matrix <- Matrix::readMM(opt$mtx_matrix) + if (opt$cells_by_row){ + barcode_counts <- data.frame(V1 = 1:nrow(result_matrix), V2=Matrix::rowSums(result_matrix)) + }else{ + barcode_counts <- data.frame(V1 = 1:ncol(result_matrix), V2=Matrix::colSums(result_matrix)) + } +} + +# Pick a cutoff on count as per https://github.com/COMBINE-lab/salmon/issues/362#issuecomment-490160480 + +pick_roryk_cutoff = function(bcs, above_baseline_multiplier = 1.5){ + bcs_hist = hist(log10(bcs), plot=FALSE, n=opt$density_bins) + mids = bcs_hist$mids + vals = bcs_hist$count + wdensity = vals * (10^mids) / sum(vals * (10^mids)) + baseline <- median(wdensity) + + # Find highest density in upper half of barcode distribution + + peak <- which(wdensity == max(wdensity[((length(wdensity)+1)/2):length(wdensity)])) + + # Cutoff is the point before the peak at which density falls below the multiplier of baseline + + 10^mids[max(which(wdensity[1:peak] < (above_baseline_multiplier*baseline)))] +} + +# Plot densities + +barcode_density_plot = function(bcs, roryk_cutoff, knee, inflection, name = ' ') { + bcs_hist = hist(log10(bcs), plot=FALSE, n=opt$density_bins) + counts = bcs_hist$count + mids = bcs_hist$mids + y = counts * (10^mids) / sum(counts * (10^mids)) + qplot(y, 10^mids) + geom_point() + theme_bw() + ggtitle(name) + ylab('Count') + xlab ('Density') + + geom_hline(aes(yintercept = roryk_cutoff, color = paste('roryk_cutoff =', length(which(bcs > roryk_cutoff)), 'cells'))) + + geom_hline(aes(yintercept = inflection, color = paste('dropletutils_inflection =', length(which(bcs > inflection)), 'cells'))) + + geom_hline(aes(yintercept = knee, color = paste('dropletutils_knee =', length(which(bcs > knee)), 'cells'))) + + scale_y_continuous(trans='log10') + theme(axis.title.y=element_blank()) + labs(color='Thresholds') +} + +# Plot a more standard barcode rank plot + +barcode_rank_plot <- function(br.out, roryk_total_cutoff, knee, inflection, name='no name'){ + ggplot(data.frame(br.out), aes(x=rank, y=total)) + geom_line() + scale_x_continuous(trans='log10') + scale_y_continuous(trans='log10') + theme_bw() + + geom_hline(aes(yintercept = knee, color = 'dropletutils_knee')) + + geom_hline(aes(yintercept = inflection, color = 'dropletutils_inflection')) + + geom_hline(aes(yintercept = roryk_total_cutoff, color = 'roryk_cutoff')) + + ggtitle(name) + ylab('Count') + xlab('Rank') + theme(legend.position = "none") +} + +# Sort barcodes by descending frequency + +barcode_counts <- barcode_counts[order(barcode_counts$V2, decreasing = TRUE), ] + +roryk_count_cutoff <- pick_roryk_cutoff(barcode_counts$V2, opt$roryk_multiplier) + +# Run dropletUtils' barcodeRanks to get knee etc +br.out <- barcodeRanks(t(barcode_counts[,2,drop=FALSE])) + +dropletutils_knee <- metadata(br.out)$knee +dropletutils_inflection <- metadata(br.out)$inflection + +plot_label <- paste(format(nrow(barcode_counts), big.mark = ','), 'cell barcodes') +if ((! is.na(opt$label)) && opt$label != ''){ + plot_label <- paste0(opt$label, ': ', plot_label) +} + +plots <- list( + dropletutils = barcode_rank_plot(br.out, roryk_count_cutoff, dropletutils_knee, dropletutils_inflection, name = plot_label), + roryk = barcode_density_plot(barcode_counts$V2, roryk_count_cutoff, dropletutils_knee, dropletutils_inflection, name = ' ') +) + +# Create output plot +png(width = 1000, height = 600, file=opt$output_plot) +grid.arrange(plots$dropletutils, plots$roryk, nrow=1) +dev.off() + +# Return calculated thresholds +write.table(data.frame(dropletutils_knee = dropletutils_knee, dropletutils_inflection = dropletutils_inflection, roryk=roryk_count_cutoff), file = opt$output_thresholds, row.names = FALSE, quote = FALSE)