Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/misc/Utils.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:ea3082881bf8 | 6:769e306b7933 |
---|---|
1 # | |
2 # Copyright INRA-URGI 2009-2010 | |
3 # | |
4 # This software is governed by the CeCILL license under French law and | |
5 # abiding by the rules of distribution of free software. You can use, | |
6 # modify and/ or redistribute the software under the terms of the CeCILL | |
7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
8 # "http://www.cecill.info". | |
9 # | |
10 # As a counterpart to the access to the source code and rights to copy, | |
11 # modify and redistribute granted by the license, users are provided only | |
12 # with a limited warranty and the software's author, the holder of the | |
13 # economic rights, and the successive licensors have only limited | |
14 # liability. | |
15 # | |
16 # In this respect, the user's attention is drawn to the risks associated | |
17 # with loading, using, modifying and/or developing or reproducing the | |
18 # software by the user in light of its specific status of free software, | |
19 # that may mean that it is complicated to manipulate, and that also | |
20 # therefore means that it is reserved for developers and experienced | |
21 # professionals having in-depth computer knowledge. Users are therefore | |
22 # encouraged to load and test the software's suitability as regards their | |
23 # requirements in conditions enabling the security of their systems and/or | |
24 # data to be ensured and, more generally, to use and operate it in the | |
25 # same conditions as regards security. | |
26 # | |
27 # The fact that you are presently reading this means that you have had | |
28 # knowledge of the CeCILL license and that you accept its terms. | |
29 # | |
30 """Some useful functions""" | |
31 | |
32 import sys, os | |
33 import random | |
34 import subprocess | |
35 | |
36 | |
def writeFile(fileName, content):
    """
    Write a string into a file, replacing any previous content
    @param fileName: path of the file to write
    @type  fileName: string
    @param content:  the text to store in the file
    @type  content:  string
    """
    # The context manager closes the handle even when write() raises.
    with open(fileName, "w") as handle:
        handle.write(content)
44 | |
def sumOfLists(list1, list2):
    """
    Element by element sum of two lists of the same size
    @param list1: first list
    @type  list1: list
    @param list2: second list
    @type  list2: list
    @return: a new list whose i-th element is list1[i] + list2[i]
    The program exits (sys.exit) when the two sizes differ.
    """
    if len(list1) != len(list2):
        sys.exit("Cannot sum list whose sizes are different!")
    return [value1 + value2 for value1, value2 in zip(list1, list2)]
52 | |
53 | |
def protectBackslashes(string):
    """
    Double every backslash found in a string (typically a path)
    @param string: the text to protect
    @type  string: string
    @return: the text where each backslash is followed by a second one
    """
    singleBackslash = "\\"
    doubledBackslash = singleBackslash * 2
    return string.replace(singleBackslash, doubledBackslash)
59 | |
60 | |
def getHammingDistance(string1, string2):
    """
    Count the positions at which two strings of the same size differ
    @param string1: a string
    @type  string1: string
    @param string2: another string, as long as the first one
    @type  string2: string
    @return: the number of mismatching positions
    An Exception is raised when the sizes differ.
    """
    if len(string1) == len(string2):
        nbMismatches = 0
        for ch1, ch2 in zip(string1, string2):
            if ch1 != ch2:
                nbMismatches += 1
        return nbMismatches
    raise Exception("Error, size of %s and %s differ" % (string1, string2))
68 | |
69 | |
def getLevenshteinDistance(string1, string2):
    """
    Compute Levenshtein distance between two strings
    @param string1: a string
    @type  string1: string
    @param string2: another string
    @type  string2: string
    @return: the minimal number of insertions, deletions and substitutions
             turning one string into the other
    """
    # The distance is symmetric: keep the shorter string in the inner loop so
    # that the rows stay as short as possible.
    if len(string1) < len(string2):
        string1, string2 = string2, string1
    # Row for the empty prefix of string1: j insertions to reach string2[:j].
    # range (not the Python-2-only xrange) keeps the function portable.
    previousRow = list(range(len(string2) + 1))
    for i, c1 in enumerate(string1):
        currentRow = [i + 1]
        for j, c2 in enumerate(string2):
            insertions = previousRow[j + 1] + 1
            deletions = currentRow[j] + 1
            substitutions = previousRow[j] + (c1 != c2)
            currentRow.append(min(insertions, deletions, substitutions))
        previousRow = currentRow
    # With two empty strings the loop never runs and the row is [0].
    return previousRow[-1]
88 | |
89 | |
def getMinAvgMedMax(values):
    """
    Get some stats about a distribution
    @param values: a distribution (the value being the number of occurrences of the key)
    @type  values: dict int to int
    @return: a tuple (minimum, average, median, maximum); the average is
             always a float, the median is a float when the number of
             observations is even
    """
    minValues = min(values)
    maxValues = max(values)
    sumValues = sum(key * count for key, count in values.items())
    nbValues = sum(values.values())
    # Expand the distribution into the sorted list of observations.
    sortedValues = []
    for key in sorted(values):
        sortedValues.extend([key] * values[key])
    # Floor division: a list index must stay an integer on Python 3 too.
    middle = nbValues // 2
    if nbValues % 2 == 0:
        medValues = (sortedValues[middle - 1] + sortedValues[middle]) / 2.0
    else:
        medValues = sortedValues[middle]
    return (minValues, float(sumValues) / nbValues, medValues, maxValues)
112 | |
113 | |
def xor(value1, value2):
    """
    Logical exclusive or, based on the truth value of the operands
    @param value1: a value
    @type  value1: anything
    @param value2: a value
    @type  value2: anything
    @return: True when exactly one of the two values is true, False otherwise
    """
    if value1:
        return not value2
    return bool(value2)
123 | |
124 | |
def diff(fileName1, fileName2):
    """
    Compare two files line by line
    @param fileName1: a file name
    @type  fileName1: string
    @param fileName2: another file name
    @type  fileName2: string
    @return: True if the files contain the same lines, False otherwise
             (the first difference found is printed on standard output)
    """
    # Read both files inside context managers so the handles are released.
    with open(fileName1) as handle1:
        lines1 = handle1.readlines()
    with open(fileName2) as handle2:
        lines2 = handle2.readlines()
    if len(lines1) != len(lines2):
        print("Sizes of files differ (%d != %d)" % (len(lines1), len(lines2)))
        return False
    # The reported line number is 0-based.
    for i, (line1, line2) in enumerate(zip(lines1, lines2)):
        if line1 != line2:
            print("Line %d differ ('%s' != '%s')" % (i, line1.strip(), line2.strip()))
            return False
    return True
146 | |
147 | |
def binomialCoefficient(a, b):
    """
    Number of ways to choose a elements among b, computed with floats
    @param a: the number of chosen elements
    @type  a: int
    @param b: the total number of elements
    @type  b: int
    @return: b! / (a! * (b-a)!) as a float
    """
    # Choosing a among b equals choosing b-a among b: keep the smaller count.
    if a > b / 2:
        a = b - a
    numerator = 1.0
    denominator = 1.0
    # numerator accumulates (b-a+1) * ... * b, denominator accumulates 1 * ... * a
    for k in range(1, a + 1):
        numerator *= b - a + k
        denominator *= k
    return numerator / denominator
165 | |
166 | |
# Cache of the p-values already computed by fisherExactPValue,
# keyed by the (a, b, c, d) tuple of its arguments.
memory = {}

# def fisherExactPValue(a, b, c, d):
#     """
#     P-value of Fisher exact test for 2x2 contingency table
#     """
#     if (a, b, c, d) in memory:
#         return memory[(a, b, c, d)]
#
#     n = a + b + c + d
#     i1 = binomialCoefficient(a, a+b)
#     i2 = binomialCoefficient(c, a+c)
#     i3 = binomialCoefficient(c+d, n)
#     pValue = i1 * i2 / i3
#
#     memory[(a, b, c, d)] = pValue
#
#     return pValue


def fisherExactPValue(a, b, c, d):
    """
    P-value of Fisher exact test for a 2x2 contingency table, computed by R
    The table is handed to R as matrix(c(a, b, c, d), nr=2). A temporary
    script is written with a relative name, run through "R CMD BATCH"
    (the R executable can be overridden with the SMARTRPATH environment
    variable), and the p-value is parsed from the "<script>out" file that R
    produces. Results are cached in the module-level dict "memory".
    @param a: first cell of the table
    @type  a: int
    @param b: second cell of the table
    @type  b: int
    @param c: third cell of the table
    @type  c: int
    @param d: fourth cell of the table
    @type  d: int
    @return: the p-value reported by fisher.test
    The program exits (sys.exit) when R fails or when no p-value is found;
    the temporary files are left in place in that case.
    """
    key = (a, b, c, d)
    if key in memory:
        return memory[key]

    scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
    # Context manager: the script is flushed and closed before R reads it,
    # even if one of the writes fails.
    with open(scriptFileName, "w") as rScript:
        rScript.write("data = matrix(c(%d, %d, %d, %d), nr=2)\n" % (a, b, c, d))
        rScript.write("fisher.test(data)\n")
        #rScript.write("chisq.test(data)\n")

    rCommand = os.environ.get("SMARTRPATH", "R")
    command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
    status = subprocess.call(command, shell=True)

    if status != 0:
        sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))

    outputRFileName = "%sout" % (scriptFileName)
    pValue = None
    pValueTag = "p-value "
    # The output must be closed before it is removed below (an open handle
    # prevents the deletion on some platforms).
    with open(outputRFileName) as outputRFile:
        for line in outputRFile:
            # The p-value sits in a comma-separated field such as
            # "p-value = 0.03"; its last token is the number.
            for splittedLine in line.strip().split(","):
                splittedLine = splittedLine.strip()
                if splittedLine.startswith(pValueTag):
                    pValue = float(splittedLine.split()[-1])
                    break

    if pValue is None:
        sys.exit("Problem with the cannot find p-value! File %s, values are: %d, %d, %d, %d" % (scriptFileName, a, b, c, d))

    os.remove(scriptFileName)
    os.remove(outputRFileName)

    memory[key] = pValue

    return pValue
229 | |
230 | |
def fisherExactPValueBulk(list):
    """
    P-values of Fisher exact tests for several 2x2 contingency tables,
    computed in a single run of R
    One "fisher.test(...)$p.value" line is written per element in a temporary
    script, which is run through "R CMD BATCH" (the R executable can be
    overridden with the SMARTRPATH environment variable). The p-values are
    read from the lines of the "<script>out" file starting with "[1] ".
    @param list: the tables; the first four items of each element are
                 converted with int() and passed to R as
                 matrix(c(e0, e1, e2, e3), nr=2)
    @type  list: sequence of sequences (presumably tuples, since a slice of
                 each element is used as a dict key -- verify against callers)
    @return: a dict mapping element[0:2] to the p-value of the element
    The program exits (sys.exit) when R fails, when no p-value is found, or
    when the number of p-values differs from the number of elements; the
    temporary files are left in place in that case.
    """
    scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
    # Context manager: the script is flushed and closed before R reads it.
    with open(scriptFileName, "w") as rScript:
        for element in list:
            rScript.write("fisher.test(matrix(c(%d, %d, %d, %d), nr=2))$p.value\n" % (int(element[0]), int(element[1]), int(element[2]), int(element[3])))

    rCommand = os.environ.get("SMARTRPATH", "R")
    command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
    status = subprocess.call(command, shell=True)

    if status != 0:
        sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))

    outputRFileName = "%sout" % (scriptFileName)
    pValueTag = "[1] "
    pValues = []
    # The output must be closed before it is removed below.
    with open(outputRFileName) as outputRFile:
        for line in outputRFile:
            line = line.strip()
            if line.startswith(pValueTag):
                pValues.append(float(line.split()[-1]))

    if not pValues:
        sys.exit("Problem with the cannot find p-value!")
    # Checked before pairing values with elements, so that surplus output
    # lines are reported here instead of failing on an out-of-range index.
    if len(pValues) != len(list):
        sys.exit("Error in the number of p-values computed by R in file '%s'!" % (scriptFileName))

    results = {}
    for element, pValue in zip(list, pValues):
        results[element[0:2]] = pValue

    os.remove(scriptFileName)
    os.remove(outputRFileName)

    return results
271 |