comparison filter_cells.R @ 3:5407dc697e24 draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_filter_cells commit fdfb8deb1e770c824fcd1e127c2e3faa4e0cf35e
author artbio
date Mon, 16 Oct 2023 22:33:23 +0000
parents e63bd8f13679
children
comparison
equal deleted inserted replaced
2:47cf889595ec 3:5407dc697e24
1 # First step of the signature-based workflow 1 # First step of the signature-based workflow
2 # Remove low quality cells below a user-fixed cutoff of 2 # Remove low quality cells below a user-fixed cutoff of
3 # percentiles or raw values of number of genes detected or 3 # percentiles or raw values of number of genes detected or
4 # total aligned reads 4 # total aligned reads
5 5
6 # Example of command (that generates output files) : 6 options(show.error.messages = FALSE,
7 # Rscript filter_cells.R -f <input> --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot <plotfile.pdf> --output <filtered_cells.tsv> --output_metada <filtered_metadata.tsv> 7 error = function() {
8 8 cat(geterrmessage(), file = stderr())
9 # load packages that are provided in the conda env 9 q("no", 1, FALSE)
10 options( show.error.messages=F, 10 }
11 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 11 )
12
12 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 13 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
13 warnings() 14 warnings()
15
14 library(optparse) 16 library(optparse)
15 library(ggplot2) 17 library(ggplot2)
16 18
17 # Arguments 19 # Arguments
18 option_list = list( 20 option_list <- list(
19 make_option(c("-f", "--file"), default=NA, type='character', 21 make_option(c("-f", "--file"), default = NA, type = "character",
20 help="Input file that contains values to filter"), 22 help = "Input file that contains values to filter"),
21 make_option("--sep", default="\t", type='character', 23 make_option("--sep", default = "\t", type = "character",
22 help="File column separator [default : '%default' ]"), 24 help = "File column separator [default : '%default' ]"),
23 make_option("--percentile_genes", default=0, type='integer', 25 make_option("--percentile_genes", default = 0, type = "integer",
24 help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"), 26 help = "nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
25 make_option("--percentile_counts", default=0, type='integer', 27 make_option("--percentile_counts", default = 0, type = "integer",
26 help="nth Percentile of the total counts per cell distribution [default : '%default' ]"), 28 help = "nth Percentile of the total counts per cell distribution [default : '%default' ]"),
27 make_option("--absolute_genes", default=0, type='integer', 29 make_option("--absolute_genes", default = 0, type = "integer",
28 help="Remove cells that didn't express at least this number of genes [default : '%default' ]"), 30 help = "Remove cells that did not express at least this number of genes [default : '%default' ]"),
29 make_option("--absolute_counts", default=0, type='integer', 31 make_option("--absolute_counts", default = 0, type = "integer",
30 help="Number of transcript threshold for cell filtering [default : '%default' ]"), 32 help = "Number of transcript threshold for cell filtering [default : '%default' ]"),
31 make_option("--manage_cutoffs", default="intersect", type='character', 33 make_option("--manage_cutoffs", default = "intersect", type = "character",
32 help="combine or intersect cutoffs for filtering"), 34 help = "combine or intersect cutoffs for filtering"),
33 make_option("--pdfplot", type = 'character', 35 make_option("--pdfplot", type = "character",
34 help="Path to pdf file of the plots"), 36 help = "Path to pdf file of the plots"),
35 make_option("--output", type = 'character', 37 make_option("--output", type = "character",
36 help="Path to tsv file of filtered cell data"), 38 help = "Path to tsv file of filtered cell data"),
37 make_option("--output_metada", type = 'character', 39 make_option("--output_metada", type = "character",
38 help="Path to tsv file of filtered cell metadata") 40 help = "Path to tsv file of filtered cell metadata")
39 ) 41 )
40 opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE)) 42 opt <- parse_args(OptionParser(option_list = option_list),
41 if (opt$sep == "tab") {opt$sep = "\t"} 43 args = commandArgs(trailingOnly = TRUE)
42 if (opt$sep == "comma") {opt$sep = ","} 44 )
43 if (opt$sep == "space") {opt$sep = " "} 45 if (opt$sep == "tab") {
44 46 opt$sep <- "\t"
45 47 }
46 # check consistency of filtering options 48 if (opt$sep == "comma") {
47 if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) { 49 opt$sep <- ","
48 opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering 50 }
49 # if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) { 51 if (opt$sep == "space") {
50 # opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering 52 opt$sep <- " "
51 if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) { 53 }
52 opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering 54
53 # if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) { 55
54 # opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering 56 ## check consistency of filtering options
57
58 # if input parameters are not consistent (one or either method, not both), no filtering
59 if ((opt$percentile_counts > 0) && (opt$absolute_counts > 0)) {
60 opt$percentile_counts <- 0
61 }
62
63 # if input parameters are not consistent (one or either method, not both), no filtering
64 if ((opt$percentile_genes > 0) && (opt$absolute_genes > 0)) {
65 opt$percentile_genes <- 0
66 }
55 67
56 # Import datasets 68 # Import datasets
57 data.counts <- read.table( 69 data_counts <- read.delim(
58 opt$file, 70 opt$file,
59 header = TRUE, 71 header = TRUE,
60 stringsAsFactors = F, 72 stringsAsFactors = FALSE,
61 sep = opt$sep, 73 sep = opt$sep,
62 check.names = FALSE, 74 check.names = FALSE,
63 row.names = 1 75 row.names = 1
64 ) 76 )
65 77
66 QC_metrics <- 78 QC_metrics <- data.frame(cell_id = colnames(data_counts),
67 data.frame(cell_id = colnames(data.counts), 79 nGenes = colSums(data_counts != 0), # nGenes is Number of detected genes for each cell
68 nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell 80 total_counts = colSums(data_counts), # total_counts is Total counts per cell
69 total_counts = colSums(data.counts), # total_counts : Total counts per cell 81 stringsAsFactors = FALSE)
70 stringsAsFactors = F) 82
71 83
72 plot_hist <- function(mydata, variable, title, cutoff){ 84 plot_hist <- function(mydata, variable, title, cutoff) {
73 mybinwidth = round(max(mydata[, variable]) * 5 / 100) 85 mybinwidth <- round(max(mydata[, variable]) * 5 / 100)
74 mylabel = paste0("cutoff= ", cutoff) 86 mylabel <- paste0("cutoff= ", cutoff)
75 hist_plot <- qplot( 87 hist_plot <- ggplot(as.data.frame(mydata[, variable]),
76 mydata[, variable], 88 aes(x = mydata[, variable], colour = I("white"))) +
77 main = title, 89 geom_histogram(binwidth = mybinwidth) +
78 xlab = variable, 90 labs(title = title, x = variable, y = "count") +
79 geom="histogram", 91 scale_x_continuous() +
80 binwidth = mybinwidth,
81 col = I("white")) +
82 geom_vline(xintercept = cutoff) + 92 geom_vline(xintercept = cutoff) +
83 annotate(geom="text", 93 annotate(geom = "text",
84 x=cutoff + mybinwidth, y=1, 94 x = cutoff + mybinwidth, y = 1,
85 label=mylabel, color="white") 95 label = mylabel, color = "white")
86 plot(hist_plot) 96 plot(hist_plot)
87 } 97 return
88 98 }
89 # returns the highest value such as the sum of the ordered values including this highest value 99
90 # is lower (below) than the percentile threshold (n) 100 # returns the highest value such as the sum of the ordered values including this highest
91 percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){ 101 # value is lower (below) than the percentile threshold (n)
92 p = n / 100 102 percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...) {
93 percentile_threshold = quantile(qcmetrics[, variable], p)[[1]] 103 p <- n / 100
104 percentile_threshold <- quantile(qcmetrics[, variable], p)[[1]]
94 plot_hist(qcmetrics, 105 plot_hist(qcmetrics,
95 variable, 106 variable,
96 plot_title, 107 plot_title,
97 percentile_threshold) 108 percentile_threshold)
98 return(percentile_threshold) 109 return(percentile_threshold)
101 pdf(file = opt$pdfplot) 112 pdf(file = opt$pdfplot)
102 113
103 # Determine thresholds based on percentile 114 # Determine thresholds based on percentile
104 115
105 if (opt$percentile_counts > 0) { 116 if (opt$percentile_counts > 0) {
106 counts_threshold <- percentile_cutoff( 117 counts_threshold <- percentile_cutoff(opt$percentile_counts,
107 opt$percentile_counts, 118 QC_metrics,
108 QC_metrics, 119 "total_counts",
109 "total_counts", 120 "Histogram of Aligned read counts per cell")
110 "Histogram of Aligned read counts per cell" 121 } else {
111 )} else {
112 counts_threshold <- opt$absolute_counts 122 counts_threshold <- opt$absolute_counts
113 plot_hist(QC_metrics, 123 plot_hist(QC_metrics,
114 variable = "total_counts", 124 variable = "total_counts",
115 title = "Histogram of Total counts per cell", 125 title = "Histogram of Total counts per cell",
116 cutoff = counts_threshold) 126 cutoff = counts_threshold)
117 } 127 }
118 128
119 if (opt$percentile_genes > 0) { 129 if (opt$percentile_genes > 0) {
120 130 genes_threshold <- percentile_cutoff(opt$percentile_genes,
121 genes_threshold <- percentile_cutoff( 131 QC_metrics,
122 opt$percentile_genes, 132 "nGenes",
123 QC_metrics, 133 "Histogram of Number of detected genes per cell")
124 "nGenes", 134 } else {
125 "Histogram of Number of detected genes per cell"
126 )} else {
127 genes_threshold <- opt$absolute_genes 135 genes_threshold <- opt$absolute_genes
128 plot_hist(QC_metrics, 136 plot_hist(QC_metrics,
129 variable = "nGenes", 137 variable = "nGenes",
130 title = "Histogram of Number of detected genes per cell", 138 title = "Histogram of Number of detected genes per cell",
131 cutoff = genes_threshold) 139 cutoff = genes_threshold)
132 } 140 }
133 141
134 # Filter out rows below thresholds (genes and read counts) 142 # Filter out rows below thresholds (genes and read counts)
135 if (opt$manage_cutoffs == 'union'){ 143 if (opt$manage_cutoffs == "union") {
136 QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold) 144 QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
137 } else { 145 } else {
138 QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold) 146 QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
139 } 147 }
140 148
141 ## Plot the results 149 ## Plot the results
142 150
143 # Determine title from the parameter logics 151 # Determine title from the parameter logics
144 if (opt$percentile_counts > 0){ 152 if (opt$percentile_counts > 0) {
145 part_one = paste0("Cells with aligned reads counts below the ", 153 part_one <- paste0("Cells with aligned reads counts below the ",
146 opt$percentile_counts, 154 opt$percentile_counts,
147 "th percentile of aligned read counts")} else { 155 "th percentile of aligned read counts")
148 part_one = paste0("Cells with aligned read counts below ", 156 } else {
149 opt$absolute_counts) 157 part_one <- paste0("Cells with aligned read counts below ",
150 } 158 opt$absolute_counts)
151 159 }
152 if(opt$percentile_genes > 0){ 160
153 part_two = paste0("with number of detected genes below the ", 161 if (opt$percentile_genes > 0) {
154 opt$percentile_genes, 162 part_two <- paste0("with number of detected genes below the ",
155 "th percentile of detected gene counts")} else { 163 opt$percentile_genes,
156 part_two = paste0("with number of detected genes below ", 164 "th percentile of detected gene counts")
157 opt$absolute_genes) 165 } else {
158 } 166 part_two <- paste0("with number of detected genes below ",
167 opt$absolute_genes)
168 }
169
159 if (opt$manage_cutoffs == "intersect") { 170 if (opt$manage_cutoffs == "intersect") {
160 conjunction = " and\n" } else { 171 conjunction <- " and\n"
161 conjunction = " or\n" 172 } else {
173 conjunction <- " or\n"
162 } 174 }
163 175
164 # plot with ggplot2 176 # plot with ggplot2
165 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) + 177 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) +
166 geom_point() + scale_y_log10() + 178 geom_point() +
167 scale_colour_discrete(name = "", 179 scale_y_log10() +
168 breaks= c(FALSE, TRUE), 180 scale_colour_discrete(name = "",
169 labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"), 181 breaks = c(FALSE, TRUE),
170 paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) + 182 labels = c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
171 xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") + 183 paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))
172 geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) + 184 ) +
173 ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) + 185 xlab("Detected genes per cell") +
174 theme(plot.title = element_text(size = 8, face = "bold")) 186 ylab("Aligned reads per cell (log10 scale)") +
187 geom_vline(xintercept = genes_threshold) +
188 geom_hline(yintercept = counts_threshold) +
189 ggtitle(paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
190 theme(plot.title = element_text(size = 8, face = "bold"))
175 191
176 dev.off() 192 dev.off()
177 193
178 # Retrieve identifier of kept cells 194 # Retrieve identifier of kept_cells
179 kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered] 195 kept_cells <- QC_metrics$cell_id[!QC_metrics$filtered]
180 196
181 data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE) 197 data_counts <- data.frame(Genes = rownames(data_counts[, kept_cells]),
182 198 data_counts[, kept_cells],
183 # Save filtered cells 199 check.names = FALSE)
184 write.table( 200
185 data.counts, 201 # Save filtered cells
202 write.table(data_counts,
186 opt$output, 203 opt$output,
187 sep = "\t", 204 sep = "\t",
188 quote = F, 205 quote = FALSE,
189 col.names = T, 206 col.names = TRUE,
190 row.names = F 207 row.names = FALSE
191 ) 208 )
192 209
193 # Add QC metrics of filtered cells to a metadata file 210 # Add QC metrics of filtered cells to a metadata file
194 metadata <- QC_metrics 211 metadata <- QC_metrics
195 212
196 # Save the metadata (QC metrics) file 213 # Save the metadata (QC metrics) file
197 write.table( 214 write.table(metadata,
198 metadata,
199 opt$output_metada, 215 opt$output_metada,
200 sep = "\t", 216 sep = "\t",
201 quote = F, 217 quote = FALSE,
202 col.names = T, 218 col.names = TRUE,
203 row.names = F 219 row.names = FALSE
204 ) 220 )