Galaxy |

Changeset 29:89175737f16b (2016-10-25)

Previous changeset 28:40664d2d295f (2016-10-25) Next changeset 30:e85041eeda80 (2016-10-25)

Commit message:
Uploaded

modified:
encode.R

diff -r 40664d2d295f -r 89175737f16b encode.R
--- a/encode.R Tue Oct 25 14:38:15 2016 -0400
+++ b/encode.R Tue Oct 25 14:38:58 2016 -0400

[

@@ -7,9 +7,6 @@
#
########################################################

-log <- file(paste(getwd(), "log_encode.txt", sep="/"), open = "wt")
-sink(file = log, type="message")
-
############################ helper functions #######################

# encode one position in one individual
@@ -33,19 +30,21 @@
       }
     }
   } else {
+    # keep NA as NA
     res <- NA
   }
   return(res)
}

-# rewrite a marker to determine the major allele
+# rewrite a marker to make an exact count of the allele for the current marker
encodeGenotype.rewrite <- function(x, sep=""){
   res <- x
   if(!is.na(x)) {
     if(length(unlist(strsplit(x,sep)))==1) {
-      # in case of homozygous, must be counted 2 times
+      # in case of homozygous, must be counted 2 times so the caracter is written 2 times
       res <- c(x,x)
     } else {
+      # heterozygous
       res <- unlist(strsplit(x, split=sep))
     }
   } else {
@@ -56,19 +55,26 @@

# encode one individual
encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
+  # rewrite genotype to make sure everything is written as a double caracter value
   newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
+  # compute the occurcence of each genotype to determine major an minor allele
   stat <- table(as.character(newIndiv))
   major <- names(stat)[which.max(stat)]
+  # Encode using the appropriate code
   indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
   return(indiv)
}

-
+# determine if the genotype is an homozygous or heterozygous one
+# genotype must be written with two characters, even homozygous
+# (see encodeGenotype.rewrite() function )
isHeterozygous <- function(genotype, sep=""){
   bool <- F
+  # case of NA, can't be determined
   if(is.na(genotype)){
     bool <- NA
   } else {
+    # check whether both element of the genotype are the same or not
     x <- unlist(strsplit(genotype, sep))
     if(length(x) > 1 & !(x[1] %in% x[2])) {
       bool <- T
@@ -78,11 +84,18 @@
   return(bool)
}

+# check if encoding has been made properly. return a boolean vector
+# which has the same length that the number of columns of the input matrix
+# at marker i, check[i] is true if code[3] is larger than code[1], false otherwise
+# please note that this function is not used in the current tool and let
+# by convinience for being used outside of galaxy
checkEncoding <- function(encoded, code=c(0,1,2)) {
   check <- NULL
   for(i in 1:ncol(encoded)) {
+    # find major an minor allele
     major <- length(which(encoded[,i]==code[3]))
     minor <- length(which(encoded[,i]==code[1]))
+    # comaprison
     if(major >= minor) {
       check <- c(check, T)
     } else {
@@ -94,31 +107,20 @@

################################## main function ###########################
# encode all individuals
-# encode genotype into a {-1,0,1} scheme, where -1 = minor homozygous, 0 = heterozygous, 1 = major homozygous
encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
+  # encode genotype
   encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
+  # set all NA to -1 (thus encoding schems with -1 are not allowed)
   encoded[is.na(encoded)] <- -1
   write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
}

############################ main #############################
-# running from terminal (supposing the OghmaGalaxy/bin directory is in your path) :
-# encode.sh -i path_to_raw_data -s separator -c code -o path_to_result_directory
-## -i : path to the file that contains the genotypes to encode, must be a .rda file (as outputed by loadGenotype.R).
-# please note that the table must be called "genotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used)
-
-## -s : in case of heterozygous both allele are encoded in the same "cell" of the table and separated by a character
-# (most often "" or "/"). This argument specify which character
-
-## -c : the encoding of minor allele/heterozygous/major allele. by default {-1,0,1}
-
-## -o : path to the file of encoded genotype. the .rda extension is automatically added
-
+# load argument from xml file
cmd <- commandArgs(T)
source(cmd[1])
+# load genotype
genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
-# deal with optional argument
-code <- strsplit(code, ",")
-code <- unlist(lapply(code, as.numeric), use.names = F)
+code <- c(0,1,2)
encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
cat(paste(out,".csv", "\n", sep=""))
\ No newline at end of file