| 
73
 | 
     1 ########################################################
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # creation date : 04/01/16
 | 
| 
 | 
     4 # last modification : 17/09/16
 | 
| 
 | 
     5 # author : Dr Nicolas Beaume
 | 
| 
 | 
     6 # owner : IRRI
 | 
| 
 | 
     7 #
 | 
| 
 | 
     8 ########################################################
 | 
| 
 | 
     9 
 | 
| 
 | 
    10 ############################ helper functions #######################
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 # encode one position in one individual
 | 
| 
 | 
    13 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
 | 
| 
 | 
    14   res <- x
 | 
| 
 | 
    15   if(!is.na(x)) {
 | 
| 
 | 
    16     if(isHeterozygous(x, sep = sep)) {
 | 
| 
 | 
    17       # heterozygous
 | 
| 
 | 
    18       res <- code[2]
 | 
| 
 | 
    19     } else {
 | 
| 
 | 
    20       # determine whether it is the minor or major allele
 | 
| 
 | 
    21       x <- unlist(strsplit(x, sep))
 | 
| 
 | 
    22       # need to check only one element as we already know it is a homozygous
 | 
| 
 | 
    23       if(length(x) > 1) {
 | 
| 
 | 
    24         x <- x[1]
 | 
| 
 | 
    25       }
 | 
| 
 | 
    26       if(x==major) {
 | 
| 
 | 
    27         res <- code[3]
 | 
| 
 | 
    28       } else {
 | 
| 
 | 
    29         res <- code[1]
 | 
| 
 | 
    30       }
 | 
| 
 | 
    31     }
 | 
| 
 | 
    32   } else {
 | 
| 
 | 
    33     # keep NA as NA
 | 
| 
 | 
    34     res <- NA
 | 
| 
 | 
    35   }
 | 
| 
 | 
    36   return(res)
 | 
| 
 | 
    37 }
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 # rewrite a marker to make an exact count of the allele for the current marker
 | 
| 
 | 
    40 encodeGenotype.rewrite <- function(x, sep=""){
 | 
| 
 | 
    41   res <- x
 | 
| 
 | 
    42   if(!is.na(x)) {
 | 
| 
 | 
    43     if(length(unlist(strsplit(x,sep)))==1) {
 | 
| 
 | 
    44       # in case of homozygous, must be counted 2 times so the caracter is written 2 times
 | 
| 
 | 
    45       res <- c(x,x)
 | 
| 
 | 
    46     } else {
 | 
| 
 | 
    47       # heterozygous
 | 
| 
 | 
    48       res <- unlist(strsplit(x, split=sep))
 | 
| 
 | 
    49     }
 | 
| 
 | 
    50   } else {
 | 
| 
 | 
    51     res <- NA
 | 
| 
 | 
    52   }
 | 
| 
 | 
    53   return(res)
 | 
| 
 | 
    54 }
 | 
| 
 | 
    55 
 | 
| 
 | 
    56 # encode one individual
 | 
| 
 | 
    57 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
 | 
| 
 | 
    58   # rewrite genotype to make sure everything is written as a double caracter value
 | 
| 
 | 
    59   newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
 | 
| 
 | 
    60   # compute the occurcence of each genotype to determine major an minor allele
 | 
| 
 | 
    61   stat <- table(as.character(newIndiv))
 | 
| 
 | 
    62   major <- names(stat)[which.max(stat)]
 | 
| 
 | 
    63   # Encode using the appropriate code
 | 
| 
 | 
    64   indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
 | 
| 
 | 
    65   return(indiv)
 | 
| 
 | 
    66 }
 | 
| 
 | 
    67 
 | 
| 
 | 
    68 # determine if the genotype is an homozygous or heterozygous one
 | 
| 
 | 
    69 # genotype must be written with two characters, even homozygous 
 | 
| 
 | 
    70 # (see encodeGenotype.rewrite() function )
 | 
| 
 | 
    71 isHeterozygous <- function(genotype, sep=""){
 | 
| 
 | 
    72   bool <- F
 | 
| 
 | 
    73   # case of NA, can't be determined
 | 
| 
 | 
    74   if(is.na(genotype)){
 | 
| 
 | 
    75     bool <- NA
 | 
| 
 | 
    76   } else {
 | 
| 
 | 
    77     # check whether both element of the genotype are the same or not
 | 
| 
 | 
    78     x <- unlist(strsplit(genotype, sep))
 | 
| 
 | 
    79     if(length(x) > 1 & !(x[1] %in% x[2])) {
 | 
| 
 | 
    80       bool <- T
 | 
| 
 | 
    81     }
 | 
| 
 | 
    82     
 | 
| 
 | 
    83   }
 | 
| 
 | 
    84   return(bool)
 | 
| 
 | 
    85 }
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 # check if encoding has been made properly. return a boolean vector
 | 
| 
 | 
    88 # which has the same length that the number of columns of the input matrix
 | 
| 
 | 
    89 # at marker i, check[i] is true if code[3] is larger than code[1], false otherwise
 | 
| 
 | 
    90 # please note that this function is not used in the current tool and let
 | 
| 
 | 
    91 # by convinience for being used outside of galaxy
 | 
| 
 | 
    92 checkEncoding <- function(encoded, code=c(0,1,2)) {
 | 
| 
 | 
    93   check <- NULL
 | 
| 
 | 
    94   for(i in 1:ncol(encoded)) {
 | 
| 
 | 
    95     # find major an minor allele
 | 
| 
 | 
    96     major <- length(which(encoded[,i]==code[3]))
 | 
| 
 | 
    97     minor <- length(which(encoded[,i]==code[1]))
 | 
| 
 | 
    98     # comaprison
 | 
| 
 | 
    99     if(major >= minor) {
 | 
| 
 | 
   100       check <- c(check, T)
 | 
| 
 | 
   101     } else {
 | 
| 
 | 
   102       check <- c(check, F)
 | 
| 
 | 
   103     }
 | 
| 
 | 
   104   }
 | 
| 
 | 
   105   return(check)
 | 
| 
 | 
   106 }
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 ################################## main function ###########################
 | 
| 
 | 
   109 # encode all individuals
 | 
| 
 | 
   110 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
 | 
| 
 | 
   111   # encode genotype
 | 
| 
 | 
   112   encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
 | 
| 
 | 
   113   # set all NA to -1 (thus encoding schems with -1 are not allowed)
 | 
| 
 | 
   114   encoded[is.na(encoded)] <- -1
 | 
| 
 | 
   115   write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
 | 
| 
 | 
   116 }
 | 
| 
 | 
   117 
 | 
| 
 | 
   118 ############################ main #############################
 | 
| 
 | 
   119 # load argument from xml file
 | 
| 
 | 
   120 cmd <- commandArgs(T)
 | 
| 
 | 
   121 source(cmd[1])
 | 
| 
 | 
   122 # load genotype
 | 
| 
 | 
   123 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
 | 
| 
 | 
   124 code <- c(0,1,2)
 | 
| 
 | 
   125 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
 | 
| 
 | 
   126 cat(paste(out,".csv", "\n", sep="")) |