| 73 | 1 ######################################################## | 
|  | 2 # | 
|  | 3 # creation date : 04/01/16 | 
|  | 4 # last modification : 17/09/16 | 
|  | 5 # author : Dr Nicolas Beaume | 
|  | 6 # owner : IRRI | 
|  | 7 # | 
|  | 8 ######################################################## | 
|  | 9 | 
|  | 10 ############################ helper functions ####################### | 
|  | 11 | 
|  | 12 # encode one position in one individual | 
|  | 13 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ | 
|  | 14   res <- x | 
|  | 15   if(!is.na(x)) { | 
|  | 16     if(isHeterozygous(x, sep = sep)) { | 
|  | 17       # heterozygous | 
|  | 18       res <- code[2] | 
|  | 19     } else { | 
|  | 20       # determine whether it is the minor or major allele | 
|  | 21       x <- unlist(strsplit(x, sep)) | 
|  | 22       # need to check only one element as we already know it is a homozygous | 
|  | 23       if(length(x) > 1) { | 
|  | 24         x <- x[1] | 
|  | 25       } | 
|  | 26       if(x==major) { | 
|  | 27         res <- code[3] | 
|  | 28       } else { | 
|  | 29         res <- code[1] | 
|  | 30       } | 
|  | 31     } | 
|  | 32   } else { | 
|  | 33     # keep NA as NA | 
|  | 34     res <- NA | 
|  | 35   } | 
|  | 36   return(res) | 
|  | 37 } | 
|  | 38 | 
|  | 39 # rewrite a marker to make an exact count of the allele for the current marker | 
|  | 40 encodeGenotype.rewrite <- function(x, sep=""){ | 
|  | 41   res <- x | 
|  | 42   if(!is.na(x)) { | 
|  | 43     if(length(unlist(strsplit(x,sep)))==1) { | 
|  | 44       # in case of homozygous, must be counted 2 times so the caracter is written 2 times | 
|  | 45       res <- c(x,x) | 
|  | 46     } else { | 
|  | 47       # heterozygous | 
|  | 48       res <- unlist(strsplit(x, split=sep)) | 
|  | 49     } | 
|  | 50   } else { | 
|  | 51     res <- NA | 
|  | 52   } | 
|  | 53   return(res) | 
|  | 54 } | 
|  | 55 | 
|  | 56 # encode one individual | 
|  | 57 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ | 
|  | 58   # rewrite genotype to make sure everything is written as a double caracter value | 
|  | 59   newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) | 
|  | 60   # compute the occurcence of each genotype to determine major an minor allele | 
|  | 61   stat <- table(as.character(newIndiv)) | 
|  | 62   major <- names(stat)[which.max(stat)] | 
|  | 63   # Encode using the appropriate code | 
|  | 64   indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) | 
|  | 65   return(indiv) | 
|  | 66 } | 
|  | 67 | 
|  | 68 # determine if the genotype is an homozygous or heterozygous one | 
|  | 69 # genotype must be written with two characters, even homozygous | 
|  | 70 # (see encodeGenotype.rewrite() function ) | 
|  | 71 isHeterozygous <- function(genotype, sep=""){ | 
|  | 72   bool <- F | 
|  | 73   # case of NA, can't be determined | 
|  | 74   if(is.na(genotype)){ | 
|  | 75     bool <- NA | 
|  | 76   } else { | 
|  | 77     # check whether both element of the genotype are the same or not | 
|  | 78     x <- unlist(strsplit(genotype, sep)) | 
|  | 79     if(length(x) > 1 & !(x[1] %in% x[2])) { | 
|  | 80       bool <- T | 
|  | 81     } | 
|  | 82 | 
|  | 83   } | 
|  | 84   return(bool) | 
|  | 85 } | 
|  | 86 | 
|  | 87 # check if encoding has been made properly. return a boolean vector | 
|  | 88 # which has the same length that the number of columns of the input matrix | 
|  | 89 # at marker i, check[i] is true if code[3] is larger than code[1], false otherwise | 
|  | 90 # please note that this function is not used in the current tool and let | 
|  | 91 # by convinience for being used outside of galaxy | 
|  | 92 checkEncoding <- function(encoded, code=c(0,1,2)) { | 
|  | 93   check <- NULL | 
|  | 94   for(i in 1:ncol(encoded)) { | 
|  | 95     # find major an minor allele | 
|  | 96     major <- length(which(encoded[,i]==code[3])) | 
|  | 97     minor <- length(which(encoded[,i]==code[1])) | 
|  | 98     # comaprison | 
|  | 99     if(major >= minor) { | 
|  | 100       check <- c(check, T) | 
|  | 101     } else { | 
|  | 102       check <- c(check, F) | 
|  | 103     } | 
|  | 104   } | 
|  | 105   return(check) | 
|  | 106 } | 
|  | 107 | 
|  | 108 ################################## main function ########################### | 
|  | 109 # encode all individuals | 
|  | 110 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ | 
|  | 111   # encode genotype | 
|  | 112   encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) | 
|  | 113   # set all NA to -1 (thus encoding schems with -1 are not allowed) | 
|  | 114   encoded[is.na(encoded)] <- -1 | 
|  | 115   write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") | 
|  | 116 } | 
|  | 117 | 
|  | 118 ############################ main ############################# | 
|  | 119 # load argument from xml file | 
|  | 120 cmd <- commandArgs(T) | 
|  | 121 source(cmd[1]) | 
|  | 122 # load genotype | 
|  | 123 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) | 
|  | 124 code <- c(0,1,2) | 
|  | 125 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) | 
|  | 126 cat(paste(out,".csv", "\n", sep="")) |