annotate Rscripts/filter-RIDB.R @ 61:d685210eef3e

fix in pdftotabular tool
author pieter.lukasse@wur.nl
date Fri, 19 Dec 2014 15:30:13 +0100
parents 9d5f4f5f764b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
1 ##
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
2 #
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
3 # Removes duplicates from a RI-database
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
4 #
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
5 # Usage:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
6 # Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
7 #
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
8 ##
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
9
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
10 # Commandline arguments
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
11 args <- commandArgs(TRUE)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
12 ridb <- args[1]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
13 out_file <- args[2]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
14
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
15 # Function to check duplicates
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
16 duplicates <- function(dat) {
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
17 s <- do.call("order", as.data.frame(dat))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
18 non.dup <- !duplicated(dat[s, ])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
19 orig.ind <- s[non.dup]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
20 first.occ <- orig.ind[cumsum(non.dup)]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
21 first.occ[non.dup] <- NA
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
22 first.occ[order(s)]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
23 }
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
24
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
25 # Load CSV file
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
26 ridb <- read.csv(ridb,header=TRUE, sep="\t")
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
27 ## Filters on: CAS FORMULA Column type Column phase type Column name
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
28 filter_cols <- c(1, 3, 5, 6, 7)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
29 cat("RIDB dimensions: ")
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
30 print(dim(ridb))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
31 deleted <- NULL
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
32 cat("Checking for duplicates...")
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
33 dups <- duplicates(ridb[,filter_cols])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
34 cat("\t[DONE]\nRemoving duplicates...")
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
35 newridb <- ridb
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
36 newridb["min"] <- NA
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
37 newridb["max"] <- NA
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
38 newridb["orig.columns"] <- NA
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
39 for (i in unique(dups)) {
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
40 if (!is.na(i)) {
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
41 rows <- which(dups == i)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
42 duprows <- ridb[c(i, rows),]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
43 # Replace duplicate rows with one row containing the median value
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
44 new_RI <- median(duprows$RI)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
45 newridb$RI[i] <- median(duprows$RI)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
46 newridb$min[i] <- min(duprows$RI)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
47 newridb$max[i] <- max(duprows$RI)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
48 newridb$orig.columns[i] <- paste(rows, collapse=",")
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
49 deleted <- c(deleted, rows)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
50 }
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
51 }
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
52 cat("\t\t[DONE]\nCreating new dataset...")
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
53 out_ridb <- newridb[-deleted,]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
54 cat("\t\t[DONE]\nWriting new dataset...")
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
55 write.table(out_ridb, na='', file=out_file, quote=T, sep="\t", row.names=F)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
56 cat("\t\t[DONE]\n")