comparison smart_toolShed/SMART/Java/Python/misc/Utils.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #
2 # Copyright INRA-URGI 2009-2010
3 #
4 # This software is governed by the CeCILL license under French law and
5 # abiding by the rules of distribution of free software. You can use,
6 # modify and/ or redistribute the software under the terms of the CeCILL
7 # license as circulated by CEA, CNRS and INRIA at the following URL
8 # "http://www.cecill.info".
9 #
10 # As a counterpart to the access to the source code and rights to copy,
11 # modify and redistribute granted by the license, users are provided only
12 # with a limited warranty and the software's author, the holder of the
13 # economic rights, and the successive licensors have only limited
14 # liability.
15 #
16 # In this respect, the user's attention is drawn to the risks associated
17 # with loading, using, modifying and/or developing or reproducing the
18 # software by the user in light of its specific status of free software,
19 # that may mean that it is complicated to manipulate, and that also
20 # therefore means that it is reserved for developers and experienced
21 # professionals having in-depth computer knowledge. Users are therefore
22 # encouraged to load and test the software's suitability as regards their
23 # requirements in conditions enabling the security of their systems and/or
24 # data to be ensured and, more generally, to use and operate it in the
25 # same conditions as regards security.
26 #
27 # The fact that you are presently reading this means that you have had
28 # knowledge of the CeCILL license and that you accept its terms.
29 #
30 """Some useful functions"""
31
32 import sys, os
33 import random
34 import subprocess
35
36
37 def writeFile(fileName, content):
38 """
39 Write the content of a file
40 """
41 handle = open(fileName, "w")
42 handle.write(content)
43 handle.close()
44
45 def sumOfLists(list1, list2):
46 """
47 Element by element sum
48 """
49 if len(list1) != len(list2):
50 sys.exit("Cannot sum list whose sizes are different!")
51 return [list1[i] + list2[i] for i in range(len(list1))]
52
53
54 def protectBackslashes(string):
55 """
56 Protect the backslashes in a path by adding another backslash
57 """
58 return string.replace("\\", "\\\\")
59
60
61 def getHammingDistance(string1, string2):
62 """
63 Compute Hamming distance between two strings
64 """
65 if len(string1) != len(string2):
66 raise Exception("Error, size of %s and %s differ" % (string1, string2))
67 return sum(ch1 != ch2 for ch1, ch2 in zip(string1, string2))
68
69
70 def getLevenshteinDistance(string1, string2):
71 """
72 Compute Levenshtein distance between two strings
73 """
74 if len(string1) < len(string2):
75 return getLevenshteinDistance(string2, string1)
76 if not string1:
77 return len(string2)
78 previousRow = xrange(len(string2) + 1)
79 for i, c1 in enumerate(string1):
80 currentRow = [i + 1]
81 for j, c2 in enumerate(string2):
82 insertions = previousRow[j + 1] + 1
83 deletions = currentRow[j] + 1
84 substitutions = previousRow[j] + (c1 != c2)
85 currentRow.append(min(insertions, deletions, substitutions))
86 previousRow = currentRow
87 return previousRow[-1]
88
89
90 def getMinAvgMedMax(values):
91 """
92 Get some stats about a dict
93 @param values: a distribution (the value being the number of occurrences of the key)
94 @type values: dict int to int
95 @return: a tuple
96 """
97 minValues = min(values.keys())
98 maxValues = max(values.keys())
99 sumValues = sum([value * values[value] for value in values])
100 nbValues = sum(values.values())
101 allValues = []
102 for key in values:
103 for i in range(values[key]):
104 allValues.append(key)
105 sortedValues = sorted(allValues)
106 sorted(values.values())
107 if (nbValues % 2 == 0):
108 medValues = (sortedValues[nbValues / 2 - 1] + sortedValues[nbValues / 2]) / 2.0
109 else:
110 medValues = sortedValues[(nbValues + 1) / 2 - 1]
111 return (minValues, float(sumValues) / nbValues, medValues, maxValues)
112
113
114 def xor(value1, value2):
115 """
116 Logical xor
117 @param value1: a value
118 @type value1: anything
119 @param value2: a value
120 @type value2: anything
121 """
122 return bool(value1) != bool(value2)
123
124
125 def diff(fileName1, fileName2):
126 """
127 Compare two files
128 @param fileName1: a file name
129 @type fileName1: string
130 @param fileName2: another file name
131 @type fileName2: string
132 @return: None if the files are the same, a string otherwise
133 """
134 handle1 = open(fileName1)
135 lines1 = handle1.readlines()
136 handle2 = open(fileName2)
137 lines2 = handle2.readlines()
138 if len(lines1) != len(lines2):
139 print "Sizes of files differ (%d != %d)" % (len(lines1), len(lines2))
140 return False
141 for i in xrange(len(lines1)):
142 if lines1[i] != lines2[i]:
143 print "Line %d differ ('%s' != '%s')" % (i, lines1[i].strip(), lines2[i].strip())
144 return False
145 return True
146
147
148 def binomialCoefficient(a, b):
149 """
150 Compute cumulated product from a to b
151 @param a: a value
152 @type a: int
153 @param b: a value
154 @type b: int
155 """
156 if a > b / 2:
157 a = b-a
158 p = 1.0
159 for i in range(b-a+1, b+1):
160 p *= i
161 q = 1.0
162 for i in range(1, a+1):
163 q *= i
164 return p / q
165
166
167 memory = {}
168
169 # def fisherExactPValue(a, b, c, d):
170 # """
171 # P-value of Fisher exact test for 2x2 contingency table
172 # """
173 # if (a, b, c, d) in memory:
174 # return memory[(a, b, c, d)]
175
176 # n = a + b + c + d
177 # i1 = binomialCoefficient(a, a+b)
178 # i2 = binomialCoefficient(c, a+c)
179 # i3 = binomialCoefficient(c+d, n)
180 # pValue = i1 * i2 / i3
181
182 # memory[(a, b, c, d)] = pValue
183
184 # return pValue
185
186
187 def fisherExactPValue(a, b, c, d):
188 if (a, b, c, d) in memory:
189 return memory[(a, b, c, d)]
190
191 scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
192 rScript = open(scriptFileName, "w")
193 rScript.write("data = matrix(c(%d, %d, %d, %d), nr=2)\n" % (a, b, c, d))
194 rScript.write("fisher.test(data)\n")
195 #rScript.write("chisq.test(data)\n")
196 rScript.close()
197
198 rCommand = "R"
199 if "SMARTRPATH" in os.environ:
200 rCommand = os.environ["SMARTRPATH"]
201 command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
202 status = subprocess.call(command, shell=True)
203
204 if status != 0:
205 sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))
206
207 outputRFileName = "%sout" % (scriptFileName)
208 outputRFile = open(outputRFileName)
209 pValue = None
210 pValueTag = "p-value "
211 for line in outputRFile:
212 line = line.strip()
213 if line == "": continue
214 for splittedLine in line.split(","):
215 splittedLine = splittedLine.strip()
216 if splittedLine.startswith(pValueTag):
217 pValue = float(splittedLine.split()[-1])
218 break
219
220 if pValue == None:
221 sys.exit("Problem with the cannot find p-value! File %s, values are: %d, %d, %d, %d" % (scriptFileName, a, b, c, d))
222
223 os.remove(scriptFileName)
224 os.remove(outputRFileName)
225
226 memory[(a, b, c, d)] = pValue
227
228 return pValue
229
230
231 def fisherExactPValueBulk(list):
232
233 scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
234 rScript = open(scriptFileName, "w")
235 for element in list:
236 rScript.write("fisher.test(matrix(c(%d, %d, %d, %d), nr=2))$p.value\n" % (int(element[0]), int(element[1]), int(element[2]), int(element[3])))
237 rScript.close()
238
239 rCommand = "R"
240 if "SMARTRPATH" in os.environ:
241 rCommand = os.environ["SMARTRPATH"]
242 command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName)
243 status = subprocess.call(command, shell=True)
244
245 if status != 0:
246 sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))
247
248 outputRFileName = "%sout" % (scriptFileName)
249 outputRFile = open(outputRFileName)
250 pValue = None
251 pValueTag = "[1] "
252 results = {}
253 cpt = 0
254 for line in outputRFile:
255 line = line.strip()
256 if line == "": continue
257 if line.startswith(pValueTag):
258 pValue = float(line.split()[-1])
259 results[list[cpt][0:2]] = pValue
260 cpt += 1
261
262 if pValue == None:
263 sys.exit("Problem with the cannot find p-value!")
264 if cpt != len(list):
265 sys.exit("Error in the number of p-values computed by R in file '%s'!" % (scriptFileName))
266
267 os.remove(scriptFileName)
268 os.remove(outputRFileName)
269
270 return results
271