| 
2
 | 
     1 ########################################################
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # creation date : 04/01/16
 | 
| 
 | 
     4 # last modification : 17/09/16
 | 
| 
 | 
     5 # author : Dr Nicolas Beaume
 | 
| 
 | 
     6 # owner : IRRI
 | 
| 
 | 
     7 #
 | 
| 
 | 
     8 ########################################################
 | 
| 
 | 
     9 
 | 
| 
 | 
    10 log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt")
 | 
| 
 | 
    11 sink(file = log, type="message")
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 ############################ helper functions #######################
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 # encode one position in one individual
 | 
| 
 | 
    16 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
 | 
| 
 | 
    17   res <- x
 | 
| 
 | 
    18   if(!is.na(x)) {
 | 
| 
 | 
    19     if(isHeterozygous(x, sep = sep)) {
 | 
| 
 | 
    20       # heterozygous
 | 
| 
 | 
    21       res <- code[2]
 | 
| 
 | 
    22     } else {
 | 
| 
 | 
    23       # determine whether it is the minor or major allele
 | 
| 
 | 
    24       x <- unlist(strsplit(x, sep))
 | 
| 
 | 
    25       # need to check only one element as we already know it is a homozygous
 | 
| 
 | 
    26       if(length(x) > 1) {
 | 
| 
 | 
    27         x <- x[1]
 | 
| 
 | 
    28       }
 | 
| 
 | 
    29       if(x==major) {
 | 
| 
 | 
    30         res <- code[3]
 | 
| 
 | 
    31       } else {
 | 
| 
 | 
    32         res <- code[1]
 | 
| 
 | 
    33       }
 | 
| 
 | 
    34     }
 | 
| 
 | 
    35   } else {
 | 
| 
 | 
    36     res <- NA
 | 
| 
 | 
    37   }
 | 
| 
 | 
    38   return(res)
 | 
| 
 | 
    39 }
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 # rewrite a marker to determine the major allele
 | 
| 
 | 
    42 encodeGenotype.rewrite <- function(x, sep=""){
 | 
| 
 | 
    43   res <- x
 | 
| 
 | 
    44   if(!is.na(x)) {
 | 
| 
 | 
    45     if(length(unlist(strsplit(x,sep)))==1) {
 | 
| 
 | 
    46       # in case of homozygous, must be counted 2 times
 | 
| 
 | 
    47       res <- c(x,x)
 | 
| 
 | 
    48     } else {
 | 
| 
 | 
    49       res <- unlist(strsplit(x, split=sep))
 | 
| 
 | 
    50     }
 | 
| 
 | 
    51   } else {
 | 
| 
 | 
    52     res <- NA
 | 
| 
 | 
    53   }
 | 
| 
 | 
    54   return(res)
 | 
| 
 | 
    55 }
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 # encode one individual
 | 
| 
 | 
    58 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
 | 
| 
 | 
    59   newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
 | 
| 
 | 
    60   stat <- table(as.character(newIndiv))
 | 
| 
 | 
    61   major <- names(stat)[which.max(stat)]
 | 
| 
 | 
    62   indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
 | 
| 
 | 
    63   return(indiv)
 | 
| 
 | 
    64 }
 | 
| 
 | 
    65 
 | 
| 
 | 
    66 
 | 
| 
 | 
    67 isHeterozygous <- function(genotype, sep=""){
 | 
| 
 | 
    68   bool <- F
 | 
| 
 | 
    69   if(is.na(genotype)){
 | 
| 
 | 
    70     bool <- NA
 | 
| 
 | 
    71   } else {
 | 
| 
 | 
    72     x <- unlist(strsplit(genotype, sep))
 | 
| 
 | 
    73     if(length(x) > 1 & !(x[1] %in% x[2])) {
 | 
| 
 | 
    74       bool <- T
 | 
| 
 | 
    75     }
 | 
| 
 | 
    76     
 | 
| 
 | 
    77   }
 | 
| 
 | 
    78   return(bool)
 | 
| 
 | 
    79 }
 | 
| 
 | 
    80 
 | 
| 
 | 
    81 checkEncoding <- function(encoded, code=c(0,1,2)) {
 | 
| 
 | 
    82   check <- NULL
 | 
| 
 | 
    83   for(i in 1:ncol(encoded)) {
 | 
| 
 | 
    84     major <- length(which(encoded[,i]==code[3]))
 | 
| 
 | 
    85     minor <- length(which(encoded[,i]==code[1]))
 | 
| 
 | 
    86     if(major >= minor) {
 | 
| 
 | 
    87       check <- c(check, T)
 | 
| 
 | 
    88     } else {
 | 
| 
 | 
    89       check <- c(check, F)
 | 
| 
 | 
    90     }
 | 
| 
 | 
    91   }
 | 
| 
 | 
    92   return(check)
 | 
| 
 | 
    93 }
 | 
| 
 | 
    94 
 | 
| 
 | 
    95 ################################## main function ###########################
 | 
| 
 | 
    96 # encode all individuals
 | 
| 
 | 
    97 # encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous
 | 
| 
 | 
    98 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
 | 
| 
 | 
    99   encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
 | 
| 
 | 
   100   encoded[is.na(encoded)] <- -1
 | 
| 
 | 
   101   write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
 | 
| 
 | 
   102 }
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 ############################ main #############################
 | 
| 
 | 
   105 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) :
 | 
| 
 | 
   106 # encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory 
 | 
| 
 | 
   107 ## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R).
 | 
| 
 | 
   108 # please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used)
 | 
| 
 | 
   109 
 | 
| 
 | 
   110 ## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character
 | 
| 
 | 
   111 # (most often "" or "/"). This argument specify which character
 | 
| 
 | 
   112 
 | 
| 
 | 
   113 ## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1}
 | 
| 
 | 
   114 
 | 
| 
 | 
   115 ## -o : path to the file of encoded genotype. the .rda extension is automatically added
 | 
| 
 | 
   116 
 | 
| 
 | 
   117 cmd <- commandArgs(T)
 | 
| 
 | 
   118 source(cmd[1])
 | 
| 
 | 
   119 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
 | 
| 
 | 
   120 # deal with optional argument
 | 
| 
 | 
   121 code <- strsplit(code, ",")
 | 
| 
 | 
   122 code <- unlist(lapply(code, as.numeric), use.names = F)
 | 
| 
 | 
   123 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
 | 
| 
 | 
   124 cat(paste(out,".csv", "\n", sep="")) |