0
|
##
#
# Removes duplicates from an RI database
#
# Usage:
#   Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
#
##
|
|
9
|
|
# Commandline arguments
# Two trailing arguments are required: the path of the input RI database
# and the path of the output file. Any further arguments are ignored.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  # Fail early with a usage hint instead of passing NA paths on to
  # read.csv() / write.table().
  stop(
    "Usage: Rscript filter-RIDB.R /path/to/retention_db.txt ",
    "output_RIDB_file.txt",
    call. = FALSE
  )
}
ridb <- args[1]
out_file <- args[2]
|
|
14
|
|
# Function to check duplicates
#
# For every row of `dat`, reports whether it repeats another row.
#
# Args:
#   dat: a data frame, matrix or plain vector; each row (or element) is
#        one record to compare.
#
# Returns:
#   An integer vector with one entry per record of `dat`. The entry is NA
#   for the representative of each group of identical records, and the
#   index of that representative for every other member of the group.
#   The representative is the group's first record in sort order;
#   order() leaves ties in their original order, so that is the earliest
#   record in `dat`.
duplicates <- function(dat) {
  dat <- as.data.frame(dat)
  # unname() so column names such as "method" or "decreasing" are used as
  # sort keys and never matched to order()'s own named arguments.
  s <- do.call(order, unname(as.list(dat)))
  # In sorted order identical records are adjacent; flag the first of each
  # run. drop = FALSE keeps single-column input as a data frame.
  non.dup <- !duplicated(dat[s, , drop = FALSE])
  # Original indices of the representatives, in sorted order.
  orig.ind <- s[non.dup]
  # cumsum(non.dup) numbers the runs, mapping each sorted record to its
  # representative.
  first.occ <- orig.ind[cumsum(non.dup)]
  first.occ[non.dup] <- NA
  # order(s) is the inverse permutation: back to the original record order.
  first.occ[order(s)]
}
|
|
24
|
|
# Load CSV file
# On entry `ridb` holds the input path; it is replaced by the parsed,
# tab-separated table.
ridb <- read.csv(ridb, header = TRUE, sep = "\t")
## Filters on: CAS FORMULA Column type Column phase type Column name
filter_cols <- c(1, 3, 5, 6, 7)
if (ncol(ridb) < max(filter_cols) || !("RI" %in% names(ridb))) {
  stop(
    "RIDB file must have at least ", max(filter_cols),
    " columns and a column named 'RI'",
    call. = FALSE
  )
}
cat("RIDB dimensions: ")
print(dim(ridb))
cat("Checking for duplicates...")
# dups[k] is NA when row k is kept, otherwise the index of the kept row
# that row k duplicates on the filter columns.
dups <- duplicates(ridb[, filter_cols])
cat("\t[DONE]\nRemoving duplicates...")
newridb <- ridb
newridb["min"] <- NA
newridb["max"] <- NA
newridb["orig.columns"] <- NA
is_dup <- !is.na(dups)
# Group the duplicate rows by the row they duplicate. Each group name is
# the index of the kept row.
dup_groups <- split(which(is_dup), dups[is_dup])
for (key in names(dup_groups)) {
  keep <- as.integer(key)
  rows <- dup_groups[[key]]
  ri_values <- ridb[["RI"]][c(keep, rows)]
  # Replace duplicate rows with one row containing the median value
  newridb$RI[keep] <- median(ri_values)
  newridb$min[keep] <- min(ri_values)
  newridb$max[keep] <- max(ri_values)
  # Row indices of the duplicates that were merged into the kept row.
  newridb$orig.columns[keep] <- paste(rows, collapse = ",")
}
deleted <- which(is_dup)
cat("\t\t[DONE]\nCreating new dataset...")
# A logical mask keeps every row when there are no duplicates; indexing
# with -deleted on an empty `deleted` would select zero rows.
out_ridb <- newridb[!is_dup, ]
cat("\t\t[DONE]\nWriting new dataset...")
write.table(out_ridb, na = '', file = out_file, quote = TRUE, sep = "\t",
            row.names = FALSE)
cat("\t\t[DONE]\n")
|