Mercurial > repos > nicolas > oghma
comparison encode.R @ 73:ca6af7b12073 draft
Uploaded
author | nicolas |
---|---|
date | Fri, 28 Oct 2016 08:45:13 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
72:366a9dbec192 | 73:ca6af7b12073 |
---|---|
1 ######################################################## | |
2 # | |
3 # creation date : 04/01/16 | |
4 # last modification : 17/09/16 | |
5 # author : Dr Nicolas Beaume | |
6 # owner : IRRI | |
7 # | |
8 ######################################################## | |
9 | |
10 ############################ helper functions ####################### | |
11 | |
12 # encode one position in one individual | |
13 encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ | |
14 res <- x | |
15 if(!is.na(x)) { | |
16 if(isHeterozygous(x, sep = sep)) { | |
17 # heterozygous | |
18 res <- code[2] | |
19 } else { | |
20 # determine whether it is the minor or major allele | |
21 x <- unlist(strsplit(x, sep)) | |
22 # need to check only one element as we already know it is a homozygous | |
23 if(length(x) > 1) { | |
24 x <- x[1] | |
25 } | |
26 if(x==major) { | |
27 res <- code[3] | |
28 } else { | |
29 res <- code[1] | |
30 } | |
31 } | |
32 } else { | |
33 # keep NA as NA | |
34 res <- NA | |
35 } | |
36 return(res) | |
37 } | |
38 | |
39 # rewrite a marker to make an exact count of the allele for the current marker | |
40 encodeGenotype.rewrite <- function(x, sep=""){ | |
41 res <- x | |
42 if(!is.na(x)) { | |
43 if(length(unlist(strsplit(x,sep)))==1) { | |
44 # in case of homozygous, must be counted 2 times so the caracter is written 2 times | |
45 res <- c(x,x) | |
46 } else { | |
47 # heterozygous | |
48 res <- unlist(strsplit(x, split=sep)) | |
49 } | |
50 } else { | |
51 res <- NA | |
52 } | |
53 return(res) | |
54 } | |
55 | |
56 # encode one individual | |
57 encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ | |
58 # rewrite genotype to make sure everything is written as a double caracter value | |
59 newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) | |
60 # compute the occurcence of each genotype to determine major an minor allele | |
61 stat <- table(as.character(newIndiv)) | |
62 major <- names(stat)[which.max(stat)] | |
63 # Encode using the appropriate code | |
64 indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) | |
65 return(indiv) | |
66 } | |
67 | |
68 # determine if the genotype is an homozygous or heterozygous one | |
69 # genotype must be written with two characters, even homozygous | |
70 # (see encodeGenotype.rewrite() function ) | |
71 isHeterozygous <- function(genotype, sep=""){ | |
72 bool <- F | |
73 # case of NA, can't be determined | |
74 if(is.na(genotype)){ | |
75 bool <- NA | |
76 } else { | |
77 # check whether both element of the genotype are the same or not | |
78 x <- unlist(strsplit(genotype, sep)) | |
79 if(length(x) > 1 & !(x[1] %in% x[2])) { | |
80 bool <- T | |
81 } | |
82 | |
83 } | |
84 return(bool) | |
85 } | |
86 | |
87 # check if encoding has been made properly. return a boolean vector | |
88 # which has the same length that the number of columns of the input matrix | |
89 # at marker i, check[i] is true if code[3] is larger than code[1], false otherwise | |
90 # please note that this function is not used in the current tool and let | |
91 # by convinience for being used outside of galaxy | |
92 checkEncoding <- function(encoded, code=c(0,1,2)) { | |
93 check <- NULL | |
94 for(i in 1:ncol(encoded)) { | |
95 # find major an minor allele | |
96 major <- length(which(encoded[,i]==code[3])) | |
97 minor <- length(which(encoded[,i]==code[1])) | |
98 # comaprison | |
99 if(major >= minor) { | |
100 check <- c(check, T) | |
101 } else { | |
102 check <- c(check, F) | |
103 } | |
104 } | |
105 return(check) | |
106 } | |
107 | |
108 ################################## main function ########################### | |
109 # encode all individuals | |
110 encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ | |
111 # encode genotype | |
112 encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) | |
113 # set all NA to -1 (thus encoding schems with -1 are not allowed) | |
114 encoded[is.na(encoded)] <- -1 | |
115 write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") | |
116 } | |
117 | |
118 ############################ main ############################# | |
119 # load argument from xml file | |
120 cmd <- commandArgs(T) | |
121 source(cmd[1]) | |
122 # load genotype | |
123 genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) | |
124 code <- c(0,1,2) | |
125 encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) | |
126 cat(paste(out,".csv", "\n", sep="")) |