Mercurial > repos > artbio > gsc_filter_cells
comparison filter_cells.R @ 3:5407dc697e24 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_filter_cells commit fdfb8deb1e770c824fcd1e127c2e3faa4e0cf35e
author | artbio |
---|---|
date | Mon, 16 Oct 2023 22:33:23 +0000 |
parents | e63bd8f13679 |
children |
comparison
equal
deleted
inserted
replaced
2:47cf889595ec | 3:5407dc697e24 |
---|---|
1 # First step of the signature-based workflow | 1 # First step of the signature-based workflow |
2 # Remove low quality cells below a user-fixed cutoff of | 2 # Remove low quality cells below a user-fixed cutoff of |
3 # percentiles or raw values of number of genes detected or | 3 # percentiles or raw values of number of genes detected or |
4 # total aligned reads | 4 # total aligned reads |
5 | 5 |
6 # Example of command (that generates output files) : | 6 options(show.error.messages = FALSE, |
7 # Rscript filter_cells.R -f <input> --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot <plotfile.pdf> --output <filtered_cells.tsv> --output_metada <filtered_metadata.tsv> | 7 error = function() { |
8 | 8 cat(geterrmessage(), file = stderr()) |
9 # load packages that are provided in the conda env | 9 q("no", 1, FALSE) |
10 options( show.error.messages=F, | 10 } |
11 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | 11 ) |
12 | |
12 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 13 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
13 warnings() | 14 warnings() |
15 | |
14 library(optparse) | 16 library(optparse) |
15 library(ggplot2) | 17 library(ggplot2) |
16 | 18 |
17 # Arguments | 19 # Arguments |
18 option_list = list( | 20 option_list <- list( |
19 make_option(c("-f", "--file"), default=NA, type='character', | 21 make_option(c("-f", "--file"), default = NA, type = "character", |
20 help="Input file that contains values to filter"), | 22 help = "Input file that contains values to filter"), |
21 make_option("--sep", default="\t", type='character', | 23 make_option("--sep", default = "\t", type = "character", |
22 help="File column separator [default : '%default' ]"), | 24 help = "File column separator [default : '%default' ]"), |
23 make_option("--percentile_genes", default=0, type='integer', | 25 make_option("--percentile_genes", default = 0, type = "integer", |
24 help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"), | 26 help = "nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"), |
25 make_option("--percentile_counts", default=0, type='integer', | 27 make_option("--percentile_counts", default = 0, type = "integer", |
26 help="nth Percentile of the total counts per cell distribution [default : '%default' ]"), | 28 help = "nth Percentile of the total counts per cell distribution [default : '%default' ]"), |
27 make_option("--absolute_genes", default=0, type='integer', | 29 make_option("--absolute_genes", default = 0, type = "integer", |
28 help="Remove cells that didn't express at least this number of genes [default : '%default' ]"), | 30 help = "Remove cells that did not express at least this number of genes [default : '%default' ]"), |
29 make_option("--absolute_counts", default=0, type='integer', | 31 make_option("--absolute_counts", default = 0, type = "integer", |
30 help="Number of transcript threshold for cell filtering [default : '%default' ]"), | 32 help = "Number of transcript threshold for cell filtering [default : '%default' ]"), |
31 make_option("--manage_cutoffs", default="intersect", type='character', | 33 make_option("--manage_cutoffs", default = "intersect", type = "character", |
32 help="combine or intersect cutoffs for filtering"), | 34 help = "combine or intersect cutoffs for filtering"), |
33 make_option("--pdfplot", type = 'character', | 35 make_option("--pdfplot", type = "character", |
34 help="Path to pdf file of the plots"), | 36 help = "Path to pdf file of the plots"), |
35 make_option("--output", type = 'character', | 37 make_option("--output", type = "character", |
36 help="Path to tsv file of filtered cell data"), | 38 help = "Path to tsv file of filtered cell data"), |
37 make_option("--output_metada", type = 'character', | 39 make_option("--output_metada", type = "character", |
38 help="Path to tsv file of filtered cell metadata") | 40 help = "Path to tsv file of filtered cell metadata") |
39 ) | 41 ) |
40 opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE)) | 42 opt <- parse_args(OptionParser(option_list = option_list), |
41 if (opt$sep == "tab") {opt$sep = "\t"} | 43 args = commandArgs(trailingOnly = TRUE) |
42 if (opt$sep == "comma") {opt$sep = ","} | 44 ) |
43 if (opt$sep == "space") {opt$sep = " "} | 45 if (opt$sep == "tab") { |
44 | 46 opt$sep <- "\t" |
45 | 47 } |
46 # check consistency of filtering options | 48 if (opt$sep == "comma") { |
47 if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) { | 49 opt$sep <- "," |
48 opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering | 50 } |
49 # if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) { | 51 if (opt$sep == "space") { |
50 # opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering | 52 opt$sep <- " " |
51 if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) { | 53 } |
52 opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering | 54 |
53 # if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) { | 55 |
54 # opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering | 56 ## check consistency of filtering options |
57 | |
58 # if input parameters are not consistent (one or either method, not both), no filtering | |
59 if ((opt$percentile_counts > 0) && (opt$absolute_counts > 0)) { | |
60 opt$percentile_counts <- 0 | |
61 } | |
62 | |
63 # if input parameters are not consistent (one or either method, not both), no filtering | |
64 if ((opt$percentile_genes > 0) && (opt$absolute_genes > 0)) { | |
65 opt$percentile_genes <- 0 | |
66 } | |
55 | 67 |
56 # Import datasets | 68 # Import datasets |
57 data.counts <- read.table( | 69 data_counts <- read.delim( |
58 opt$file, | 70 opt$file, |
59 header = TRUE, | 71 header = TRUE, |
60 stringsAsFactors = F, | 72 stringsAsFactors = FALSE, |
61 sep = opt$sep, | 73 sep = opt$sep, |
62 check.names = FALSE, | 74 check.names = FALSE, |
63 row.names = 1 | 75 row.names = 1 |
64 ) | 76 ) |
65 | 77 |
66 QC_metrics <- | 78 QC_metrics <- data.frame(cell_id = colnames(data_counts), |
67 data.frame(cell_id = colnames(data.counts), | 79 nGenes = colSums(data_counts != 0), # nGenes is Number of detected genes for each cell |
68 nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell | 80 total_counts = colSums(data_counts), # total_counts is Total counts per cell |
69 total_counts = colSums(data.counts), # total_counts : Total counts per cell | 81 stringsAsFactors = FALSE) |
70 stringsAsFactors = F) | 82 |
71 | 83 |
72 plot_hist <- function(mydata, variable, title, cutoff){ | 84 plot_hist <- function(mydata, variable, title, cutoff) { |
73 mybinwidth = round(max(mydata[, variable]) * 5 / 100) | 85 mybinwidth <- round(max(mydata[, variable]) * 5 / 100) |
74 mylabel = paste0("cutoff= ", cutoff) | 86 mylabel <- paste0("cutoff= ", cutoff) |
75 hist_plot <- qplot( | 87 hist_plot <- ggplot(as.data.frame(mydata[, variable]), |
76 mydata[, variable], | 88 aes(x = mydata[, variable], colour = I("white"))) + |
77 main = title, | 89 geom_histogram(binwidth = mybinwidth) + |
78 xlab = variable, | 90 labs(title = title, x = variable, y = "count") + |
79 geom="histogram", | 91 scale_x_continuous() + |
80 binwidth = mybinwidth, | |
81 col = I("white")) + | |
82 geom_vline(xintercept = cutoff) + | 92 geom_vline(xintercept = cutoff) + |
83 annotate(geom="text", | 93 annotate(geom = "text", |
84 x=cutoff + mybinwidth, y=1, | 94 x = cutoff + mybinwidth, y = 1, |
85 label=mylabel, color="white") | 95 label = mylabel, color = "white") |
86 plot(hist_plot) | 96 plot(hist_plot) |
87 } | 97 return |
88 | 98 } |
89 # returns the highest value such as the sum of the ordered values including this highest value | 99 |
90 # is lower (below) than the percentile threshold (n) | 100 # returns the highest value such as the sum of the ordered values including this highest |
91 percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){ | 101 # value is lower (below) than the percentile threshold (n) |
92 p = n / 100 | 102 percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...) { |
93 percentile_threshold = quantile(qcmetrics[, variable], p)[[1]] | 103 p <- n / 100 |
104 percentile_threshold <- quantile(qcmetrics[, variable], p)[[1]] | |
94 plot_hist(qcmetrics, | 105 plot_hist(qcmetrics, |
95 variable, | 106 variable, |
96 plot_title, | 107 plot_title, |
97 percentile_threshold) | 108 percentile_threshold) |
98 return(percentile_threshold) | 109 return(percentile_threshold) |
101 pdf(file = opt$pdfplot) | 112 pdf(file = opt$pdfplot) |
102 | 113 |
103 # Determine thresholds based on percentile | 114 # Determine thresholds based on percentile |
104 | 115 |
105 if (opt$percentile_counts > 0) { | 116 if (opt$percentile_counts > 0) { |
106 counts_threshold <- percentile_cutoff( | 117 counts_threshold <- percentile_cutoff(opt$percentile_counts, |
107 opt$percentile_counts, | 118 QC_metrics, |
108 QC_metrics, | 119 "total_counts", |
109 "total_counts", | 120 "Histogram of Aligned read counts per cell") |
110 "Histogram of Aligned read counts per cell" | 121 } else { |
111 )} else { | |
112 counts_threshold <- opt$absolute_counts | 122 counts_threshold <- opt$absolute_counts |
113 plot_hist(QC_metrics, | 123 plot_hist(QC_metrics, |
114 variable = "total_counts", | 124 variable = "total_counts", |
115 title = "Histogram of Total counts per cell", | 125 title = "Histogram of Total counts per cell", |
116 cutoff = counts_threshold) | 126 cutoff = counts_threshold) |
117 } | 127 } |
118 | 128 |
119 if (opt$percentile_genes > 0) { | 129 if (opt$percentile_genes > 0) { |
120 | 130 genes_threshold <- percentile_cutoff(opt$percentile_genes, |
121 genes_threshold <- percentile_cutoff( | 131 QC_metrics, |
122 opt$percentile_genes, | 132 "nGenes", |
123 QC_metrics, | 133 "Histogram of Number of detected genes per cell") |
124 "nGenes", | 134 } else { |
125 "Histogram of Number of detected genes per cell" | |
126 )} else { | |
127 genes_threshold <- opt$absolute_genes | 135 genes_threshold <- opt$absolute_genes |
128 plot_hist(QC_metrics, | 136 plot_hist(QC_metrics, |
129 variable = "nGenes", | 137 variable = "nGenes", |
130 title = "Histogram of Number of detected genes per cell", | 138 title = "Histogram of Number of detected genes per cell", |
131 cutoff = genes_threshold) | 139 cutoff = genes_threshold) |
132 } | 140 } |
133 | 141 |
134 # Filter out rows below thresholds (genes and read counts) | 142 # Filter out rows below thresholds (genes and read counts) |
135 if (opt$manage_cutoffs == 'union'){ | 143 if (opt$manage_cutoffs == "union") { |
136 QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold) | 144 QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold) |
137 } else { | 145 } else { |
138 QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold) | 146 QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold) |
139 } | 147 } |
140 | 148 |
141 ## Plot the results | 149 ## Plot the results |
142 | 150 |
143 # Determine title from the parameter logics | 151 # Determine title from the parameter logics |
144 if (opt$percentile_counts > 0){ | 152 if (opt$percentile_counts > 0) { |
145 part_one = paste0("Cells with aligned reads counts below the ", | 153 part_one <- paste0("Cells with aligned reads counts below the ", |
146 opt$percentile_counts, | 154 opt$percentile_counts, |
147 "th percentile of aligned read counts")} else { | 155 "th percentile of aligned read counts") |
148 part_one = paste0("Cells with aligned read counts below ", | 156 } else { |
149 opt$absolute_counts) | 157 part_one <- paste0("Cells with aligned read counts below ", |
150 } | 158 opt$absolute_counts) |
151 | 159 } |
152 if(opt$percentile_genes > 0){ | 160 |
153 part_two = paste0("with number of detected genes below the ", | 161 if (opt$percentile_genes > 0) { |
154 opt$percentile_genes, | 162 part_two <- paste0("with number of detected genes below the ", |
155 "th percentile of detected gene counts")} else { | 163 opt$percentile_genes, |
156 part_two = paste0("with number of detected genes below ", | 164 "th percentile of detected gene counts") |
157 opt$absolute_genes) | 165 } else { |
158 } | 166 part_two <- paste0("with number of detected genes below ", |
167 opt$absolute_genes) | |
168 } | |
169 | |
159 if (opt$manage_cutoffs == "intersect") { | 170 if (opt$manage_cutoffs == "intersect") { |
160 conjunction = " and\n" } else { | 171 conjunction <- " and\n" |
161 conjunction = " or\n" | 172 } else { |
173 conjunction <- " or\n" | |
162 } | 174 } |
163 | 175 |
164 # plot with ggplot2 | 176 # plot with ggplot2 |
165 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) + | 177 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) + |
166 geom_point() + scale_y_log10() + | 178 geom_point() + |
167 scale_colour_discrete(name = "", | 179 scale_y_log10() + |
168 breaks= c(FALSE, TRUE), | 180 scale_colour_discrete(name = "", |
169 labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"), | 181 breaks = c(FALSE, TRUE), |
170 paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) + | 182 labels = c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"), |
171 xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") + | 183 paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)")) |
172 geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) + | 184 ) + |
173 ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) + | 185 xlab("Detected genes per cell") + |
174 theme(plot.title = element_text(size = 8, face = "bold")) | 186 ylab("Aligned reads per cell (log10 scale)") + |
187 geom_vline(xintercept = genes_threshold) + | |
188 geom_hline(yintercept = counts_threshold) + | |
189 ggtitle(paste0(part_one, conjunction, part_two, "\nwere filtered out")) + | |
190 theme(plot.title = element_text(size = 8, face = "bold")) | |
175 | 191 |
176 dev.off() | 192 dev.off() |
177 | 193 |
178 # Retrieve identifier of kept cells | 194 # Retrieve identifier of kept_cells |
179 kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered] | 195 kept_cells <- QC_metrics$cell_id[!QC_metrics$filtered] |
180 | 196 |
181 data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE) | 197 data_counts <- data.frame(Genes = rownames(data_counts[, kept_cells]), |
182 | 198 data_counts[, kept_cells], |
183 # Save filtered cells | 199 check.names = FALSE) |
184 write.table( | 200 |
185 data.counts, | 201 # Save filtered cells |
202 write.table(data_counts, | |
186 opt$output, | 203 opt$output, |
187 sep = "\t", | 204 sep = "\t", |
188 quote = F, | 205 quote = FALSE, |
189 col.names = T, | 206 col.names = TRUE, |
190 row.names = F | 207 row.names = FALSE |
191 ) | 208 ) |
192 | 209 |
193 # Add QC metrics of filtered cells to a metadata file | 210 # Add QC metrics of filtered cells to a metadata file |
194 metadata <- QC_metrics | 211 metadata <- QC_metrics |
195 | 212 |
196 # Save the metadata (QC metrics) file | 213 # Save the metadata (QC metrics) file |
197 write.table( | 214 write.table(metadata, |
198 metadata, | |
199 opt$output_metada, | 215 opt$output_metada, |
200 sep = "\t", | 216 sep = "\t", |
201 quote = F, | 217 quote = FALSE, |
202 col.names = T, | 218 col.names = TRUE, |
203 row.names = F | 219 row.names = FALSE |
204 ) | 220 ) |