| 2 | 1 ######################################################## | 
|  | 2 # | 
|  | 3 # creation date : 04/01/16 | 
|  | 4 # last modification : 17/09/16 | 
|  | 5 # author : Dr Nicolas Beaume | 
|  | 6 # owner : IRRI | 
|  | 7 # | 
|  | 8 ######################################################## | 
|  | 9 | 
|  | 10 log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt") | 
|  | 11 sink(file = log, type="message") | 
|  | 12 | 
|  | 13 ############################ helper functions ####################### | 
|  | 14 | 
|  | 15 # encode one position in one individual | 
|  | 16 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ | 
|  | 17   res <- x | 
|  | 18   if(!is.na(x)) { | 
|  | 19     if(isHeterozygous(x, sep = sep)) { | 
|  | 20       # heterozygous | 
|  | 21       res <- code[2] | 
|  | 22     } else { | 
|  | 23       # determine whether it is the minor or major allele | 
|  | 24       x <- unlist(strsplit(x, sep)) | 
|  | 25       # need to check only one element as we already know it is a homozygous | 
|  | 26       if(length(x) > 1) { | 
|  | 27         x <- x[1] | 
|  | 28       } | 
|  | 29       if(x==major) { | 
|  | 30         res <- code[3] | 
|  | 31       } else { | 
|  | 32         res <- code[1] | 
|  | 33       } | 
|  | 34     } | 
|  | 35   } else { | 
|  | 36     res <- NA | 
|  | 37   } | 
|  | 38   return(res) | 
|  | 39 } | 
|  | 40 | 
|  | 41 # rewrite a marker to determine the major allele | 
|  | 42 encodeGenotype.rewrite <- function(x, sep=""){ | 
|  | 43   res <- x | 
|  | 44   if(!is.na(x)) { | 
|  | 45     if(length(unlist(strsplit(x,sep)))==1) { | 
|  | 46       # in case of homozygous, must be counted 2 times | 
|  | 47       res <- c(x,x) | 
|  | 48     } else { | 
|  | 49       res <- unlist(strsplit(x, split=sep)) | 
|  | 50     } | 
|  | 51   } else { | 
|  | 52     res <- NA | 
|  | 53   } | 
|  | 54   return(res) | 
|  | 55 } | 
|  | 56 | 
|  | 57 # encode one individual | 
|  | 58 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ | 
|  | 59   newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) | 
|  | 60   stat <- table(as.character(newIndiv)) | 
|  | 61   major <- names(stat)[which.max(stat)] | 
|  | 62   indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) | 
|  | 63   return(indiv) | 
|  | 64 } | 
|  | 65 | 
|  | 66 | 
|  | 67 isHeterozygous <- function(genotype, sep=""){ | 
|  | 68   bool <- F | 
|  | 69   if(is.na(genotype)){ | 
|  | 70     bool <- NA | 
|  | 71   } else { | 
|  | 72     x <- unlist(strsplit(genotype, sep)) | 
|  | 73     if(length(x) > 1 & !(x[1] %in% x[2])) { | 
|  | 74       bool <- T | 
|  | 75     } | 
|  | 76 | 
|  | 77   } | 
|  | 78   return(bool) | 
|  | 79 } | 
|  | 80 | 
|  | 81 checkEncoding <- function(encoded, code=c(0,1,2)) { | 
|  | 82   check <- NULL | 
|  | 83   for(i in 1:ncol(encoded)) { | 
|  | 84     major <- length(which(encoded[,i]==code[3])) | 
|  | 85     minor <- length(which(encoded[,i]==code[1])) | 
|  | 86     if(major >= minor) { | 
|  | 87       check <- c(check, T) | 
|  | 88     } else { | 
|  | 89       check <- c(check, F) | 
|  | 90     } | 
|  | 91   } | 
|  | 92   return(check) | 
|  | 93 } | 
|  | 94 | 
|  | 95 ################################## main function ########################### | 
|  | 96 # encode all individuals | 
|  | 97 # encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous | 
|  | 98 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ | 
|  | 99   encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) | 
|  | 100   encoded[is.na(encoded)] <- -1 | 
|  | 101   write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") | 
|  | 102 } | 
|  | 103 | 
|  | 104 ############################ main ############################# | 
|  | 105 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : | 
|  | 106 # encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory | 
|  | 107 ## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R). | 
|  | 108 # please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) | 
|  | 109 | 
|  | 110 ## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character | 
|  | 111 # (most often "" or "/"). This argument specify which character | 
|  | 112 | 
|  | 113 ## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1} | 
|  | 114 | 
|  | 115 ## -o : path to the file of encoded genotype. the .rda extension is automatically added | 
|  | 116 | 
|  | 117 cmd <- commandArgs(T) | 
|  | 118 source(cmd[1]) | 
|  | 119 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) | 
|  | 120 # deal with optional argument | 
|  | 121 code <- strsplit(code, ",") | 
|  | 122 code <- unlist(lapply(code, as.numeric), use.names = F) | 
|  | 123 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) | 
|  | 124 cat(paste(out,".csv", "\n", sep="")) |