Mercurial > repos > ebi-gxa > droplet_barcode_plot
comparison dropletBarcodePlot.R @ 0:04f32429dcf2 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/droplet-rank-plot/.shed.yml commit a785b79f2b5689aba87c0f7072897bb23f6bda76
author | ebi-gxa |
---|---|
date | Fri, 08 Nov 2019 09:08:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:04f32429dcf2 |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 # This script parses the GTF file to create a feature-wise annotation file with | |
4 # mitochondrial features flagged, to assist in annotation and QC of single-cell | |
5 # expression data analysis. | |
6 | |
7 suppressPackageStartupMessages(require(optparse)) | |
8 suppressPackageStartupMessages(require(ggplot2)) | |
9 suppressPackageStartupMessages(require(gridExtra)) | |
10 suppressPackageStartupMessages(require(DropletUtils)) | |
11 suppressPackageStartupMessages(require(Matrix)) | |
12 | |
13 die <- function(message){ | |
14 write(message, stderr()) | |
15 q(status = 1) | |
16 } | |
17 | |
18 option_list = list( | |
19 make_option( | |
20 c("-b", "--barcode-frequencies"), | |
21 action = "store", | |
22 default = NA, | |
23 type = 'character', | |
24 help = "Path to a two-column tab-delimited file, with barcodes in the first column and frequencies in the second (ignored if --mtx-matrix supplied)" | |
25 ), | |
26 make_option( | |
27 c("-m", "--mtx-matrix"), | |
28 action = "store", | |
29 default = NA, | |
30 type = 'character', | |
31 help = 'Matrix-market format matrix file, with cells by column (overrides --barcode-frequencies if supplied)' | |
32 ), | |
33 make_option( | |
34 c("-r", "--cells-by-row"), | |
35 action = "store_true", | |
36 default = FALSE, | |
37 type = 'logical', | |
38 help = 'For use with --mtx-matrix: force interpretation of matrix to assume cells are by row, rather than by column (default)' | |
39 ), | |
40 make_option( | |
41 c("-l", "--label"), | |
42 action = "store", | |
43 default = '', | |
44 type = 'character', | |
45 help = 'Label to use in plot' | |
46 ), | |
47 make_option( | |
48 c("-d", "--density-bins"), | |
49 action = "store", | |
50 default = 50, | |
51 type = 'numeric', | |
52 help = "Number of bins used to calculate density plot" | |
53 ), | |
54 make_option( | |
55 c("-y", "--roryk-multiplier"), | |
56 action = "store", | |
57 default = 1.5, | |
58 type = 'numeric', | |
59 help = "Above-baseline multiplier to calculate roryk threshold" | |
60 ), | |
61 make_option( | |
62 c("-o", "--output-plot"), | |
63 action = "store", | |
64 default = 'barcode_plot.png', | |
65 type = 'character', | |
66 help = "File path for output plot" | |
67 ), | |
68 make_option( | |
69 c("-t", "--output-thresholds"), | |
70 action = "store", | |
71 default = 'barcode_thresholds.txt', | |
72 type = 'character', | |
73 help = "File path for output file containing calculted thresholds" | |
74 ) | |
75 ) | |
76 | |
77 opt <- parse_args(OptionParser(option_list = option_list), convert_hyphens_to_underscores = TRUE) | |
78 | |
79 # Process inputs dependent on what has been provided | |
80 | |
81 if (is.na(opt$mtx_matrix)){ | |
82 if (is.na(opt$barcode_frequencies)){ | |
83 die('ERROR: must supply --mtx-matrix or --barcode-frequencies') | |
84 }else if (! file.exists(opt$barcode_frequencies)){ | |
85 die(paste('ERROR: barcode frequencies file', opt$barcode_frequencies, 'does not exist')) | |
86 }else{ | |
87 barcode_counts <- read.delim(opt$barcode_frequencies, header = FALSE) | |
88 } | |
89 }else if (! file.exists(opt$mtx_matrix)){ | |
90 die(paste('ERROR: MTX matrix file', opt$mtx_matrix, 'does not exist')) | |
91 }else{ | |
92 result_matrix <- Matrix::readMM(opt$mtx_matrix) | |
93 if (opt$cells_by_row){ | |
94 barcode_counts <- data.frame(V1 = 1:nrow(result_matrix), V2=Matrix::rowSums(result_matrix)) | |
95 }else{ | |
96 barcode_counts <- data.frame(V1 = 1:ncol(result_matrix), V2=Matrix::colSums(result_matrix)) | |
97 } | |
98 } | |
99 | |
100 # Pick a cutoff on count as per https://github.com/COMBINE-lab/salmon/issues/362#issuecomment-490160480 | |
101 | |
102 pick_roryk_cutoff = function(bcs, above_baseline_multiplier = 1.5){ | |
103 bcs_hist = hist(log10(bcs), plot=FALSE, n=opt$density_bins) | |
104 mids = bcs_hist$mids | |
105 vals = bcs_hist$count | |
106 wdensity = vals * (10^mids) / sum(vals * (10^mids)) | |
107 baseline <- median(wdensity) | |
108 | |
109 # Find highest density in upper half of barcode distribution | |
110 | |
111 peak <- which(wdensity == max(wdensity[((length(wdensity)+1)/2):length(wdensity)])) | |
112 | |
113 # Cutoff is the point before the peak at which density falls below the multiplier of baseline | |
114 | |
115 10^mids[max(which(wdensity[1:peak] < (above_baseline_multiplier*baseline)))] | |
116 } | |
117 | |
118 # Plot densities | |
119 | |
120 barcode_density_plot = function(bcs, roryk_cutoff, knee, inflection, name = ' ') { | |
121 bcs_hist = hist(log10(bcs), plot=FALSE, n=opt$density_bins) | |
122 counts = bcs_hist$count | |
123 mids = bcs_hist$mids | |
124 y = counts * (10^mids) / sum(counts * (10^mids)) | |
125 qplot(y, 10^mids) + geom_point() + theme_bw() + ggtitle(name) + ylab('Count') + xlab ('Density') + | |
126 geom_hline(aes(yintercept = roryk_cutoff, color = paste('roryk_cutoff =', length(which(bcs > roryk_cutoff)), 'cells'))) + | |
127 geom_hline(aes(yintercept = inflection, color = paste('dropletutils_inflection =', length(which(bcs > inflection)), 'cells'))) + | |
128 geom_hline(aes(yintercept = knee, color = paste('dropletutils_knee =', length(which(bcs > knee)), 'cells'))) + | |
129 scale_y_continuous(trans='log10') + theme(axis.title.y=element_blank()) + labs(color='Thresholds') | |
130 } | |
131 | |
132 # Plot a more standard barcode rank plot | |
133 | |
134 barcode_rank_plot <- function(br.out, roryk_total_cutoff, knee, inflection, name='no name'){ | |
135 ggplot(data.frame(br.out), aes(x=rank, y=total)) + geom_line() + scale_x_continuous(trans='log10') + scale_y_continuous(trans='log10') + theme_bw() + | |
136 geom_hline(aes(yintercept = knee, color = 'dropletutils_knee')) + | |
137 geom_hline(aes(yintercept = inflection, color = 'dropletutils_inflection')) + | |
138 geom_hline(aes(yintercept = roryk_total_cutoff, color = 'roryk_cutoff')) + | |
139 ggtitle(name) + ylab('Count') + xlab('Rank') + theme(legend.position = "none") | |
140 } | |
141 | |
142 # Sort barcodes by descending frequency | |
143 | |
144 barcode_counts <- barcode_counts[order(barcode_counts$V2, decreasing = TRUE), ] | |
145 | |
146 roryk_count_cutoff <- pick_roryk_cutoff(barcode_counts$V2, opt$roryk_multiplier) | |
147 | |
148 # Run dropletUtils' barcodeRanks to get knee etc | |
149 br.out <- barcodeRanks(t(barcode_counts[,2,drop=FALSE])) | |
150 | |
151 dropletutils_knee <- metadata(br.out)$knee | |
152 dropletutils_inflection <- metadata(br.out)$inflection | |
153 | |
154 plot_label <- paste(format(nrow(barcode_counts), big.mark = ','), 'cell barcodes') | |
155 if ((! is.na(opt$label)) && opt$label != ''){ | |
156 plot_label <- paste0(opt$label, ': ', plot_label) | |
157 } | |
158 | |
159 plots <- list( | |
160 dropletutils = barcode_rank_plot(br.out, roryk_count_cutoff, dropletutils_knee, dropletutils_inflection, name = plot_label), | |
161 roryk = barcode_density_plot(barcode_counts$V2, roryk_count_cutoff, dropletutils_knee, dropletutils_inflection, name = ' ') | |
162 ) | |
163 | |
164 # Create output plot | |
165 png(width = 1000, height = 600, file=opt$output_plot) | |
166 grid.arrange(plots$dropletutils, plots$roryk, nrow=1) | |
167 dev.off() | |
168 | |
169 # Return calculated thresholds | |
170 write.table(data.frame(dropletutils_knee = dropletutils_knee, dropletutils_inflection = dropletutils_inflection, roryk=roryk_count_cutoff), file = opt$output_thresholds, row.names = FALSE, quote = FALSE) |