6
|
1 #
|
|
2 # Copyright INRA-URGI 2009-2010
|
|
3 #
|
|
4 # This software is governed by the CeCILL license under French law and
|
|
5 # abiding by the rules of distribution of free software. You can use,
|
|
6 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
7 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
8 # "http://www.cecill.info".
|
|
9 #
|
|
10 # As a counterpart to the access to the source code and rights to copy,
|
|
11 # modify and redistribute granted by the license, users are provided only
|
|
12 # with a limited warranty and the software's author, the holder of the
|
|
13 # economic rights, and the successive licensors have only limited
|
|
14 # liability.
|
|
15 #
|
|
16 # In this respect, the user's attention is drawn to the risks associated
|
|
17 # with loading, using, modifying and/or developing or reproducing the
|
|
18 # software by the user in light of its specific status of free software,
|
|
19 # that may mean that it is complicated to manipulate, and that also
|
|
20 # therefore means that it is reserved for developers and experienced
|
|
21 # professionals having in-depth computer knowledge. Users are therefore
|
|
22 # encouraged to load and test the software's suitability as regards their
|
|
23 # requirements in conditions enabling the security of their systems and/or
|
|
24 # data to be ensured and, more generally, to use and operate it in the
|
|
25 # same conditions as regards security.
|
|
26 #
|
|
27 # The fact that you are presently reading this means that you have had
|
|
28 # knowledge of the CeCILL license and that you accept its terms.
|
|
29 #
|
|
30 """Some useful functions"""
|
|
31
|
|
32 import sys, os
|
|
33 import random
|
|
34 import subprocess
|
|
35
|
|
36
|
|
37 def writeFile(fileName, content):
|
|
38 """
|
|
39 Write the content of a file
|
|
40 """
|
|
41 handle = open(fileName, "w")
|
|
42 handle.write(content)
|
|
43 handle.close()
|
|
44
|
|
45 def sumOfLists(list1, list2):
|
|
46 """
|
|
47 Element by element sum
|
|
48 """
|
|
49 if len(list1) != len(list2):
|
|
50 sys.exit("Cannot sum list whose sizes are different!")
|
|
51 return [list1[i] + list2[i] for i in range(len(list1))]
|
|
52
|
|
53
|
|
54 def protectBackslashes(string):
|
|
55 """
|
|
56 Protect the backslashes in a path by adding another backslash
|
|
57 """
|
|
58 return string.replace("\\", "\\\\")
|
|
59
|
|
60
|
|
61 def getHammingDistance(string1, string2):
|
|
62 """
|
|
63 Compute Hamming distance between two strings
|
|
64 """
|
|
65 if len(string1) != len(string2):
|
|
66 raise Exception("Error, size of %s and %s differ" % (string1, string2))
|
|
67 return sum(ch1 != ch2 for ch1, ch2 in zip(string1, string2))
|
|
68
|
|
69
|
|
70 def getLevenshteinDistance(string1, string2):
|
|
71 """
|
|
72 Compute Levenshtein distance between two strings
|
|
73 """
|
|
74 if len(string1) < len(string2):
|
|
75 return getLevenshteinDistance(string2, string1)
|
|
76 if not string1:
|
|
77 return len(string2)
|
|
78 previousRow = xrange(len(string2) + 1)
|
|
79 for i, c1 in enumerate(string1):
|
|
80 currentRow = [i + 1]
|
|
81 for j, c2 in enumerate(string2):
|
|
82 insertions = previousRow[j + 1] + 1
|
|
83 deletions = currentRow[j] + 1
|
|
84 substitutions = previousRow[j] + (c1 != c2)
|
|
85 currentRow.append(min(insertions, deletions, substitutions))
|
|
86 previousRow = currentRow
|
|
87 return previousRow[-1]
|
|
88
|
|
89
|
|
90 def getMinAvgMedMax(values):
|
|
91 """
|
|
92 Get some stats about a dict
|
|
93 @param values: a distribution (the value being the number of occurrences of the key)
|
|
94 @type values: dict int to int
|
|
95 @return: a tuple
|
|
96 """
|
|
97 minValues = min(values.keys())
|
|
98 maxValues = max(values.keys())
|
|
99 sumValues = sum([value * values[value] for value in values])
|
|
100 nbValues = sum(values.values())
|
|
101 allValues = []
|
|
102 for key in values:
|
|
103 for i in range(values[key]):
|
|
104 allValues.append(key)
|
|
105 sortedValues = sorted(allValues)
|
|
106 sorted(values.values())
|
|
107 if (nbValues % 2 == 0):
|
|
108 medValues = (sortedValues[nbValues / 2 - 1] + sortedValues[nbValues / 2]) / 2.0
|
|
109 else:
|
|
110 medValues = sortedValues[(nbValues + 1) / 2 - 1]
|
|
111 return (minValues, float(sumValues) / nbValues, medValues, maxValues)
|
|
112
|
|
113
|
|
114 def xor(value1, value2):
|
|
115 """
|
|
116 Logical xor
|
|
117 @param value1: a value
|
|
118 @type value1: anything
|
|
119 @param value2: a value
|
|
120 @type value2: anything
|
|
121 """
|
|
122 return bool(value1) != bool(value2)
|
|
123
|
|
124
|
|
125 def diff(fileName1, fileName2):
|
|
126 """
|
|
127 Compare two files
|
|
128 @param fileName1: a file name
|
|
129 @type fileName1: string
|
|
130 @param fileName2: another file name
|
|
131 @type fileName2: string
|
|
132 @return: None if the files are the same, a string otherwise
|
|
133 """
|
|
134 handle1 = open(fileName1)
|
|
135 lines1 = handle1.readlines()
|
|
136 handle2 = open(fileName2)
|
|
137 lines2 = handle2.readlines()
|
|
138 if len(lines1) != len(lines2):
|
|
139 print "Sizes of files differ (%d != %d)" % (len(lines1), len(lines2))
|
|
140 return False
|
|
141 for i in xrange(len(lines1)):
|
|
142 if lines1[i] != lines2[i]:
|
|
143 print "Line %d differ ('%s' != '%s')" % (i, lines1[i].strip(), lines2[i].strip())
|
|
144 return False
|
|
145 return True
|
|
146
|
|
147
|
|
148 def binomialCoefficient(a, b):
|
|
149 """
|
|
150 Compute cumulated product from a to b
|
|
151 @param a: a value
|
|
152 @type a: int
|
|
153 @param b: a value
|
|
154 @type b: int
|
|
155 """
|
|
156 if a > b / 2:
|
|
157 a = b-a
|
|
158 p = 1.0
|
|
159 for i in range(b-a+1, b+1):
|
|
160 p *= i
|
|
161 q = 1.0
|
|
162 for i in range(1, a+1):
|
|
163 q *= i
|
|
164 return p / q
|
|
165
|
|
166
|
|
167 memory = {}
|
|
168
|
|
169 # def fisherExactPValue(a, b, c, d):
|
|
170 # """
|
|
171 # P-value of Fisher exact test for 2x2 contingency table
|
|
172 # """
|
|
173 # if (a, b, c, d) in memory:
|
|
174 # return memory[(a, b, c, d)]
|
|
175
|
|
176 # n = a + b + c + d
|
|
177 # i1 = binomialCoefficient(a, a+b)
|
|
178 # i2 = binomialCoefficient(c, a+c)
|
|
179 # i3 = binomialCoefficient(c+d, n)
|
|
180 # pValue = i1 * i2 / i3
|
|
181
|
|
182 # memory[(a, b, c, d)] = pValue
|
|
183
|
|
184 # return pValue
|
|
185
|
|
186
|
|
187 def fisherExactPValue(a, b, c, d):
|
|
188 if (a, b, c, d) in memory:
|
|
189 return memory[(a, b, c, d)]
|
|
190
|
|
191 scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
|
|
192 rScript = open(scriptFileName, "w")
|
|
193 rScript.write("data = matrix(c(%d, %d, %d, %d), nr=2)\n" % (a, b, c, d))
|
|
194 rScript.write("fisher.test(data)\n")
|
|
195 #rScript.write("chisq.test(data)\n")
|
|
196 rScript.close()
|
|
197
|
|
198 rCommand = "R"
|
|
199 if "SMARTRPATH" in os.environ:
|
|
200 rCommand = os.environ["SMARTRPATH"]
|
|
201 command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
|
|
202 status = subprocess.call(command, shell=True)
|
|
203
|
|
204 if status != 0:
|
|
205 sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))
|
|
206
|
|
207 outputRFileName = "%sout" % (scriptFileName)
|
|
208 outputRFile = open(outputRFileName)
|
|
209 pValue = None
|
|
210 pValueTag = "p-value "
|
|
211 for line in outputRFile:
|
|
212 line = line.strip()
|
|
213 if line == "": continue
|
|
214 for splittedLine in line.split(","):
|
|
215 splittedLine = splittedLine.strip()
|
|
216 if splittedLine.startswith(pValueTag):
|
|
217 pValue = float(splittedLine.split()[-1])
|
|
218 break
|
|
219
|
|
220 if pValue == None:
|
|
221 sys.exit("Problem with the cannot find p-value! File %s, values are: %d, %d, %d, %d" % (scriptFileName, a, b, c, d))
|
|
222
|
|
223 os.remove(scriptFileName)
|
|
224 os.remove(outputRFileName)
|
|
225
|
|
226 memory[(a, b, c, d)] = pValue
|
|
227
|
|
228 return pValue
|
|
229
|
|
230
|
|
231 def fisherExactPValueBulk(list):
|
|
232
|
|
233 scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
|
|
234 rScript = open(scriptFileName, "w")
|
|
235 for element in list:
|
|
236 rScript.write("fisher.test(matrix(c(%d, %d, %d, %d), nr=2))$p.value\n" % (int(element[0]), int(element[1]), int(element[2]), int(element[3])))
|
|
237 rScript.close()
|
|
238
|
|
239 rCommand = "R"
|
|
240 if "SMARTRPATH" in os.environ:
|
|
241 rCommand = os.environ["SMARTRPATH"]
|
|
242 command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
|
|
243 status = subprocess.call(command, shell=True)
|
|
244
|
|
245 if status != 0:
|
|
246 sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))
|
|
247
|
|
248 outputRFileName = "%sout" % (scriptFileName)
|
|
249 outputRFile = open(outputRFileName)
|
|
250 pValue = None
|
|
251 pValueTag = "[1] "
|
|
252 results = {}
|
|
253 cpt = 0
|
|
254 for line in outputRFile:
|
|
255 line = line.strip()
|
|
256 if line == "": continue
|
|
257 if line.startswith(pValueTag):
|
|
258 pValue = float(line.split()[-1])
|
|
259 results[list[cpt][0:2]] = pValue
|
|
260 cpt += 1
|
|
261
|
|
262 if pValue == None:
|
|
263 sys.exit("Problem with the cannot find p-value!")
|
|
264 if cpt != len(list):
|
|
265 sys.exit("Error in the number of p-values computed by R in file '%s'!" % (scriptFileName))
|
|
266
|
|
267 os.remove(scriptFileName)
|
|
268 os.remove(outputRFileName)
|
|
269
|
|
270 return results
|
|
271
|