Mercurial > repos > artbio > gsc_filter_genes
comparison filter_genes.R @ 2:afe949d332b3 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_filter_genes commit b184054ad6d4230ab0a714c13f9ef32449faa327
author | artbio |
---|---|
date | Mon, 16 Oct 2023 23:26:20 +0000 |
parents | 5d2304b09f58 |
children |
comparison
equal
deleted
inserted
replaced
1:5d2304b09f58 | 2:afe949d332b3 |
---|---|
1 # ######################## | |
2 # filter genes # | |
3 # ######################## | |
4 | |
5 # Filter out low expressed genes | 1 # Filter out low expressed genes |
6 | 2 |
7 # Example of command (used for generate output file) : | 3 # Example of command (used for generate output file) : |
8 # Rscript filter_genes.R -f <input file> -o <output file> | 4 # Rscript filter_genes.R -f <input file> -o <output file> |
9 | 5 |
10 # load packages that are provided in the conda env | 6 # load packages that are provided in the conda env |
11 options( show.error.messages=F, | 7 options(show.error.messages = FALSE, |
12 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | 8 error = function() { |
| 9 cat(geterrmessage(), file = stderr()) |
| 10 q("no", 1, FALSE) |
| 11 } |
| 12 ) |
13 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 13 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
14 library(optparse) | 14 library(optparse) |
15 | 15 |
16 # Arguments | 16 # Arguments |
17 option_list = list( | 17 option_list <- list( |
18 make_option( | 18 make_option( |
19 c("-f", "--input"), | 19 c("-f", "--input"), |
20 default = NA, | 20 default = NA, |
21 type = 'character', | 21 type = "character", |
22 help = "Input file that contains count values to filter" | 22 help = "Input file that contains count values to filter" |
23 ), | 23 ), |
24 make_option( | 24 make_option( |
25 c("-s", "--sep"), | 25 c("-s", "--sep"), |
26 default = '\t', | 26 default = "\t", |
27 type = 'character', | 27 type = "character", |
28 help = "File separator [default : '%default' ]" | 28 help = "File separator [default : '%default' ]" |
29 ), | 29 ), |
30 make_option( | 30 make_option( |
31 c("-c", "--colnames"), | 31 c("-c", "--colnames"), |
32 default = TRUE, | 32 default = TRUE, |
33 type = 'logical', | 33 type = "logical", |
34 help = "first line is a header [default : '%default' ]" | 34 help = "first line is a header [default : '%default' ]" |
35 ), | 35 ), |
36 make_option( | 36 make_option( |
37 "--percentile_detection", | 37 "--percentile_detection", |
38 default = 0, | 38 default = 0, |
39 type = 'numeric', | 39 type = "numeric", |
40 help = "Include genes with detected expression in at least \ | 40 help = "Include genes with detected expression in at least \ |
41 this fraction of cells [default : '%default' ]" | 41 this fraction of cells [default : '%default' ]" |
42 ), | 42 ), |
43 make_option( | 43 make_option( |
44 "--absolute_detection", | 44 "--absolute_detection", |
45 default = 0, | 45 default = 0, |
46 type = 'numeric', | 46 type = "numeric", |
47 help = "Include genes with detected expression in at least \ | 47 help = "Include genes with detected expression in at least \ |
48 this number of cells [default : '%default' ]" | 48 this number of cells [default : '%default' ]" |
49 ), | 49 ), |
50 make_option( | 50 make_option( |
51 c("-o", "--output"), | 51 c("-o", "--output"), |
52 default = NA, | 52 default = NA, |
53 type = 'character', | 53 type = "character", |
54 help = "Output name [default : '%default' ]" | 54 help = "Output name [default : '%default' ]" |
55 ) | 55 ) |
56 ) | 56 ) |
57 | 57 |
58 opt = parse_args(OptionParser(option_list = option_list), | 58 opt <- parse_args(OptionParser(option_list = option_list), |
59 args = commandArgs(trailingOnly = TRUE)) | 59 args = commandArgs(trailingOnly = TRUE)) |
60 if (opt$sep == "tab") {opt$sep = "\t"} | 60 if (opt$sep == "tab") { |
61 if (opt$sep == "comma") {opt$sep = ","} | 61 opt$sep <- "\t" |
| 62 } |
| 63 if (opt$sep == "comma") { |
| 64 opt$sep <- "," |
| 65 } |
62 | 66 |
63 # Open files | 67 # Open files |
64 data.counts <- read.table( | 68 data.counts <- read.delim( |
65 opt$input, | 69 opt$input, |
66 h = opt$colnames, | 70 h = opt$colnames, |
67 row.names = 1, | 71 row.names = 1, |
68 sep = opt$sep, | 72 sep = opt$sep, |
69 check.names = F | 73 check.names = FALSE |
70 ) | 74 ) |
71 | 75 |
72 # note the [if else] below, to handle percentile_detection=absolute_detection=0 | 76 # note the [if else] below, to handle percentile_detection=absolute_detection=0 |
73 # Search for genes that are expressed in a certain percent of cells | 77 # Search for genes that are expressed in a certain percent of cells |
74 if (opt$percentile_detection > 0) { | 78 if (opt$percentile_detection > 0) { |
75 kept_genes <- rowSums(data.counts != 0) >= (opt$percentile_detection * ncol(data.counts)) | 79 kept_genes <- rowSums(data.counts != 0) >= (opt$percentile_detection * ncol(data.counts)) |
76 } else { | 80 } else { |
77 | 81 |
78 # Search for genes that are expressed in more than an absolute number of cells | 82 # Search for genes that are expressed in more than an absolute number of cells |
79 kept_genes <- rowSums(data.counts != 0) >= (opt$absolute_detection) | 83 kept_genes <- rowSums(data.counts != 0) >= (opt$absolute_detection) |
80 } | 84 } |
81 | 85 |
82 # Filter matrix | 86 # Filter matrix |
83 data.counts <- data.counts[kept_genes,] | 87 data.counts <- data.counts[kept_genes, ] |
84 data.counts <- cbind(Genes=rownames(data.counts), data.counts) | 88 data.counts <- cbind(Genes = rownames(data.counts), data.counts) |
85 | 89 |
86 # Save filtered matrix | 90 # Save filtered matrix |
87 write.table( | 91 write.table( |
88 data.counts, | 92 data.counts, |
89 opt$output, | 93 opt$output, |
90 sep = "\t", | 94 sep = "\t", |
91 quote = F, | 95 quote = FALSE, |
92 col.names = T, | 96 col.names = TRUE, |
93 row.names = F | 97 row.names = FALSE |
94 ) | 98 ) |