##
#
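# Removes duplicates from an RI database: rows sharing the same key columns
# are merged into a single row holding the median RI of the group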
#
# Usage:
#       Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
#
##

# Command-line arguments: <input RIDB file> <output file>
args <- commandArgs(trailingOnly = TRUE)
# Fail early with a usage message; otherwise a missing argument becomes an
# NA path and only surfaces later as an unrelated file error.
if (length(args) < 2) {
    stop("Usage: Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt",
         call. = FALSE)
}
ridb <- args[1]
out_file <- args[2]

# Function to check duplicates
#
# For every row of `dat`, reports which earlier row it duplicates.
#
# Args:
#   dat: a data frame, matrix or plain vector; each row (or element) is
#        compared as a whole.
#
# Returns:
#   An integer vector with one entry per row of `dat`. The entry is NA when
#   the row is the first occurrence of its value combination, otherwise it
#   is the index (in the original row order) of that first occurrence.
duplicates <- function(dat) {
    # Coerce once so vectors and matrices are handled like data frames.
    dat <- as.data.frame(dat)
    # Sort rows so identical rows become adjacent. The column list is
    # unnamed on purpose: a column called e.g. "decreasing" or "method"
    # would otherwise be matched to an argument of order() instead of
    # being used as a sort key.
    sorted_idx <- do.call(order, unname(as.list(dat)))
    # drop = FALSE keeps a one-column input as a data frame.
    is_first <- !duplicated(dat[sorted_idx, , drop = FALSE])
    # Original row numbers of the first occurrences, in sorted order.
    first_rows <- sorted_idx[is_first]
    # cumsum() numbers the runs of identical rows, so this maps every
    # sorted position to the first row of its run.
    first_occ <- first_rows[cumsum(is_first)]
    # First occurrences themselves are not duplicates.
    first_occ[is_first] <- NA
    # Undo the sort to return the result in the original row order.
    first_occ[order(sorted_idx)]
}

# Load the RI database: tab-separated text with a header row.
# `ridb` holds the input path on entry and the loaded data frame afterwards.
ridb <- read.csv(ridb, header = TRUE, sep = "\t")
## Filters on: CAS FORMULA Column type Column phase type Column name
filter_cols <- c(1, 3, 5, 6, 7)
# Validate the layout up front so a malformed file gives a clear message.
if (ncol(ridb) < max(filter_cols)) {
    stop("Input file has ", ncol(ridb), " columns; at least ",
         max(filter_cols), " are required", call. = FALSE)
}
if (!("RI" %in% names(ridb))) {
    stop("Input file has no 'RI' column", call. = FALSE)
}
cat("RIDB dimensions: ")
print(dim(ridb))
cat("Checking for duplicates...")
# dups[k] is NA for a first occurrence, otherwise the row index of the
# first occurrence that row k duplicates.
dups <- duplicates(ridb[, filter_cols])
cat("\t[DONE]\nRemoving duplicates...")
newridb <- ridb
newridb["min"] <- NA
newridb["max"] <- NA
newridb["orig.columns"] <- NA
# Rows that duplicate an earlier row; they are folded into that row below.
deleted <- which(!is.na(dups))
# Group the duplicate rows by the first occurrence they belong to. Group
# names are the first-occurrence row numbers as character strings.
dup_groups <- split(deleted, dups[deleted])
for (key in names(dup_groups)) {
    i <- as.integer(key)
    rows <- dup_groups[[key]]
    # RI values of the first occurrence together with all its duplicates
    ri_values <- ridb$RI[c(i, rows)]
    # Replace duplicate rows with one row containing the median value
    newridb$RI[i] <- median(ri_values)
    newridb$min[i] <- min(ri_values)
    newridb$max[i] <- max(ri_values)
    newridb$orig.columns[i] <- paste(rows, collapse = ",")
}
cat("\t\t[DONE]\nCreating new dataset...")
# Keep only first occurrences. A logical mask is used instead of negative
# indexing because `x[-integer(0), ]` selects zero rows, which would empty
# the whole output when the input contains no duplicates.
out_ridb <- newridb[is.na(dups), ]
cat("\t\t[DONE]\nWriting new dataset...")
write.table(out_ridb, na = '', file = out_file, quote = TRUE, sep = "\t",
            row.names = FALSE)
cat("\t\t[DONE]\n")
