Previous changeset 28:40664d2d295f (2016-10-25) Next changeset 30:e85041eeda80 (2016-10-25) |
Commit message:
Uploaded |
modified:
encode.R |
b |
diff -r 40664d2d295f -r 89175737f16b encode.R --- a/encode.R Tue Oct 25 14:38:15 2016 -0400 +++ b/encode.R Tue Oct 25 14:38:58 2016 -0400 |
[ |
@@ -7,9 +7,6 @@ # ######################################################## -log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt") -sink(file = log, type="message") - ############################ helper functions ####################### # encode one position in one individual @@ -33,19 +30,21 @@ } } } else { + # keep NA as NA res <- NA } return(res) } -# rewrite a marker to determine the major allele +# rewrite a marker to make an exact count of the allele for the current marker encodeGenotype.rewrite <- function(x, sep=""){ res <- x if(!is.na(x)) { if(length(unlist(strsplit(x,sep)))==1) { - # in case of homozygous, must be counted 2 times + # in case of homozygous, must be counted 2 times so the caracter is written 2 times res <- c(x,x) } else { + # heterozygous res <- unlist(strsplit(x, split=sep)) } } else { @@ -56,19 +55,26 @@ # encode one individual encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ + # rewrite genotype to make sure everything is written as a double caracter value newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) + # compute the occurcence of each genotype to determine major an minor allele stat <- table(as.character(newIndiv)) major <- names(stat)[which.max(stat)] + # Encode using the appropriate code indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) return(indiv) } - +# determine if the genotype is an homozygous or heterozygous one +# genotype must be written with two characters, even homozygous +# (see encodeGenotype.rewrite() function ) isHeterozygous <- function(genotype, sep=""){ bool <- F + # case of NA, can't be determined if(is.na(genotype)){ bool <- NA } else { + # check whether both element of the genotype are the same or not x <- unlist(strsplit(genotype, sep)) if(length(x) > 1 & !(x[1] %in% x[2])) { bool <- T @@ -78,11 +84,18 @@ return(bool) } +# check if encoding has been made properly. return a boolean vector +# which has the same length that the number of columns of the input matrix +# at marker i, check[i] is true if code[3] is larger than code[1], false otherwise +# please note that this function is not used in the current tool and let +# by convinience for being used outside of galaxy checkEncoding <- function(encoded, code=c(0,1,2)) { check <- NULL for(i in 1:ncol(encoded)) { + # find major an minor allele major <- length(which(encoded[,i]==code[3])) minor <- length(which(encoded[,i]==code[1])) + # comaprison if(major >= minor) { check <- c(check, T) } else { @@ -94,31 +107,20 @@ ################################## main function ########################### # encode all individuals -# encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ + # encode genotype encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) + # set all NA to -1 (thus encoding schems with -1 are not allowed) encoded[is.na(encoded)] <- -1 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") } ############################ main ############################# -# running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : -# encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory -## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R). -# please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) - -## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character -# (most often "" or "/"). This argument specify which character - -## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1} - -## -o : path to the file of encoded genotype. the .rda extension is automatically added - +# load argument from xml file cmd <- commandArgs(T) source(cmd[1]) +# load genotype genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) -# deal with optional argument -code <- strsplit(code, ",") -code <- unlist(lapply(code, as.numeric), use.names = F) +code <- c(0,1,2) encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) cat(paste(out,".csv", "\n", sep="")) \ No newline at end of file |