Galaxy |

Changeset 73:ca6af7b12073 (2016-10-28)

Previous changeset 72:366a9dbec192 (2016-10-28) Next changeset 74:dea79015dfc9 (2016-10-28)

Commit message:
Uploaded

added:
encode.R

diff -r 366a9dbec192 -r ca6af7b12073 encode.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/encode.R Fri Oct 28 08:45:13 2016 -0400

[

@@ -0,0 +1,126 @@
+########################################################
+#
+# creation date : 04/01/16
+# last modification : 17/09/16
+# author : Dr Nicolas Beaume
+# owner : IRRI
+#
+########################################################
+
+############################ helper functions #######################
+
+# encode one position in one individual
+encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
+  res <- x
+  if(!is.na(x)) {
+    if(isHeterozygous(x, sep = sep)) {
+      # heterozygous
+      res <- code[2]
+    } else {
+      # determine whether it is the minor or major allele
+      x <- unlist(strsplit(x, sep))
+      # need to check only one element as we already know it is a homozygous
+      if(length(x) > 1) {
+        x <- x[1]
+      }
+      if(x==major) {
+        res <- code[3]
+      } else {
+        res <- code[1]
+      }
+    }
+  } else {
+    # keep NA as NA
+    res <- NA
+  }
+  return(res)
+}
+
+# rewrite a marker to make an exact count of the allele for the current marker
+encodeGenotype.rewrite <- function(x, sep=""){
+  res <- x
+  if(!is.na(x)) {
+    if(length(unlist(strsplit(x,sep)))==1) {
+      # in case of homozygous, must be counted 2 times so the caracter is written 2 times
+      res <- c(x,x)
+    } else {
+      # heterozygous
+      res <- unlist(strsplit(x, split=sep))
+    }
+  } else {
+    res <- NA
+  }
+  return(res)
+}
+
+# encode one individual
+encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
+  # rewrite genotype to make sure everything is written as a double caracter value
+  newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
+  # compute the occurcence of each genotype to determine major an minor allele
+  stat <- table(as.character(newIndiv))
+  major <- names(stat)[which.max(stat)]
+  # Encode using the appropriate code
+  indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
+  return(indiv)
+}
+
+# determine if the genotype is an homozygous or heterozygous one
+# genotype must be written with two characters, even homozygous
+# (see encodeGenotype.rewrite() function )
+isHeterozygous <- function(genotype, sep=""){
+  bool <- F
+  # case of NA, can't be determined
+  if(is.na(genotype)){
+    bool <- NA
+  } else {
+    # check whether both element of the genotype are the same or not
+    x <- unlist(strsplit(genotype, sep))
+    if(length(x) > 1 & !(x[1] %in% x[2])) {
+      bool <- T
+    }
+
+  }
+  return(bool)
+}
+
+# check if encoding has been made properly. return a boolean vector
+# which has the same length that the number of columns of the input matrix
+# at marker i, check[i] is true if code[3] is larger than code[1], false otherwise
+# please note that this function is not used in the current tool and let
+# by convinience for being used outside of galaxy
+checkEncoding <- function(encoded, code=c(0,1,2)) {
+  check <- NULL
+  for(i in 1:ncol(encoded)) {
+    # find major an minor allele
+    major <- length(which(encoded[,i]==code[3]))
+    minor <- length(which(encoded[,i]==code[1]))
+    # comaprison
+    if(major >= minor) {
+      check <- c(check, T)
+    } else {
+      check <- c(check, F)
+    }
+  }
+  return(check)
+}
+
+################################## main function ###########################
+# encode all individuals
+encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
+  # encode genotype
+  encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
+  # set all NA to -1 (thus encoding schems with -1 are not allowed)
+  encoded[is.na(encoded)] <- -1
+  write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
+}
+
+############################ main #############################
+# load argument from xml file
+cmd <- commandArgs(T)
+source(cmd[1])
+# load genotype
+genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
+code <- c(0,1,2)
+encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
+cat(paste(out,".csv", "\n", sep=""))
\ No newline at end of file