Mercurial > repos > ebi-gxa > droplet_barcode_plot
view dropletBarcodePlot.R @ 1:0e985680e67d draft
"planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/droplet-rank-plot/.shed.yml commit bf9021c853da640771202920cde38f77cdeaf368"
author | ebi-gxa |
---|---|
date | Wed, 04 Mar 2020 06:44:10 -0500 |
parents | 04f32429dcf2 |
children |
line wrap: on
line source
#!/usr/bin/env Rscript # This script parses the GTF file to create a feature-wise annotation file with # mitochondrial features flagged, to assist in annotation and QC of single-cell # expression data analysis. suppressPackageStartupMessages(require(optparse)) suppressPackageStartupMessages(require(ggplot2)) suppressPackageStartupMessages(require(gridExtra)) suppressPackageStartupMessages(require(DropletUtils)) suppressPackageStartupMessages(require(Matrix)) die <- function(message){ write(message, stderr()) q(status = 1) } option_list = list( make_option( c("-b", "--barcode-frequencies"), action = "store", default = NA, type = 'character', help = "Path to a two-column tab-delimited file, with barcodes in the first column and frequencies in the second (ignored if --mtx-matrix supplied)" ), make_option( c("-m", "--mtx-matrix"), action = "store", default = NA, type = 'character', help = 'Matrix-market format matrix file, with cells by column (overrides --barcode-frequencies if supplied)' ), make_option( c("-r", "--cells-by-row"), action = "store_true", default = FALSE, type = 'logical', help = 'For use with --mtx-matrix: force interpretation of matrix to assume cells are by row, rather than by column (default)' ), make_option( c("-l", "--label"), action = "store", default = '', type = 'character', help = 'Label to use in plot' ), make_option( c("-d", "--density-bins"), action = "store", default = 50, type = 'numeric', help = "Number of bins used to calculate density plot" ), make_option( c("-y", "--roryk-multiplier"), action = "store", default = 1.5, type = 'numeric', help = "Above-baseline multiplier to calculate roryk threshold" ), make_option( c("-o", "--output-plot"), action = "store", default = 'barcode_plot.png', type = 'character', help = "File path for output plot" ), make_option( c("-t", "--output-thresholds"), action = "store", default = 'barcode_thresholds.txt', type = 'character', help = "File path for output file containing calculted thresholds" ) ) opt <- parse_args(OptionParser(option_list = option_list), convert_hyphens_to_underscores = TRUE) # Process inputs dependent on what has been provided if (is.na(opt$mtx_matrix)){ if (is.na(opt$barcode_frequencies)){ die('ERROR: must supply --mtx-matrix or --barcode-frequencies') }else if (! file.exists(opt$barcode_frequencies)){ die(paste('ERROR: barcode frequencies file', opt$barcode_frequencies, 'does not exist')) }else{ barcode_counts <- read.delim(opt$barcode_frequencies, header = FALSE) } }else if (! file.exists(opt$mtx_matrix)){ die(paste('ERROR: MTX matrix file', opt$mtx_matrix, 'does not exist')) }else{ result_matrix <- Matrix::readMM(opt$mtx_matrix) if (opt$cells_by_row){ barcode_counts <- data.frame(V1 = 1:nrow(result_matrix), V2=Matrix::rowSums(result_matrix)) }else{ barcode_counts <- data.frame(V1 = 1:ncol(result_matrix), V2=Matrix::colSums(result_matrix)) } } # Pick a cutoff on count as per https://github.com/COMBINE-lab/salmon/issues/362#issuecomment-490160480 pick_roryk_cutoff = function(bcs, above_baseline_multiplier = 1.5){ bcs_hist = hist(log10(bcs), plot=FALSE, n=opt$density_bins) mids = bcs_hist$mids vals = bcs_hist$count wdensity = vals * (10^mids) / sum(vals * (10^mids)) baseline <- median(wdensity) # Find highest density in upper half of barcode distribution peak <- which(wdensity == max(wdensity[((length(wdensity)+1)/2):length(wdensity)])) # Cutoff is the point before the peak at which density falls below the multiplier of baseline 10^mids[max(which(wdensity[1:peak] < (above_baseline_multiplier*baseline)))] } # Plot densities barcode_density_plot = function(bcs, roryk_cutoff, knee, inflection, name = ' ') { bcs_hist = hist(log10(bcs), plot=FALSE, n=opt$density_bins) counts = bcs_hist$count mids = bcs_hist$mids y = counts * (10^mids) / sum(counts * (10^mids)) qplot(y, 10^mids) + geom_point() + theme_bw() + ggtitle(name) + ylab('Count') + xlab ('Density') + geom_hline(aes(yintercept = roryk_cutoff, color = paste('roryk_cutoff =', length(which(bcs > roryk_cutoff)), 'cells'))) + geom_hline(aes(yintercept = inflection, color = paste('dropletutils_inflection =', length(which(bcs > inflection)), 'cells'))) + geom_hline(aes(yintercept = knee, color = paste('dropletutils_knee =', length(which(bcs > knee)), 'cells'))) + scale_y_continuous(trans='log10') + theme(axis.title.y=element_blank()) + labs(color='Thresholds') } # Plot a more standard barcode rank plot barcode_rank_plot <- function(br.out, roryk_total_cutoff, knee, inflection, name='no name'){ ggplot(data.frame(br.out), aes(x=rank, y=total)) + geom_line() + scale_x_continuous(trans='log10') + scale_y_continuous(trans='log10') + theme_bw() + geom_hline(aes(yintercept = knee, color = 'dropletutils_knee')) + geom_hline(aes(yintercept = inflection, color = 'dropletutils_inflection')) + geom_hline(aes(yintercept = roryk_total_cutoff, color = 'roryk_cutoff')) + ggtitle(name) + ylab('Count') + xlab('Rank') + theme(legend.position = "none") } # Sort barcodes by descending frequency barcode_counts <- barcode_counts[order(barcode_counts$V2, decreasing = TRUE), ] roryk_count_cutoff <- pick_roryk_cutoff(barcode_counts$V2, opt$roryk_multiplier) # Run dropletUtils' barcodeRanks to get knee etc br.out <- barcodeRanks(t(barcode_counts[,2,drop=FALSE])) dropletutils_knee <- metadata(br.out)$knee dropletutils_inflection <- metadata(br.out)$inflection plot_label <- paste(format(nrow(barcode_counts), big.mark = ','), 'cell barcodes') if ((! is.na(opt$label)) && opt$label != ''){ plot_label <- paste0(opt$label, ': ', plot_label) } plots <- list( dropletutils = barcode_rank_plot(br.out, roryk_count_cutoff, dropletutils_knee, dropletutils_inflection, name = plot_label), roryk = barcode_density_plot(barcode_counts$V2, roryk_count_cutoff, dropletutils_knee, dropletutils_inflection, name = ' ') ) # Create output plot png(width = 1000, height = 600, file=opt$output_plot) grid.arrange(plots$dropletutils, plots$roryk, nrow=1) dev.off() # Return calculated thresholds write.table(data.frame(dropletutils_knee = dropletutils_knee, dropletutils_inflection = dropletutils_inflection, roryk=roryk_count_cutoff), file = opt$output_thresholds, row.names = FALSE, quote = FALSE)