| 6 | 1 # | 
|  | 2 # Copyright INRA-URGI 2009-2010 | 
|  | 3 # | 
|  | 4 # This software is governed by the CeCILL license under French law and | 
|  | 5 # abiding by the rules of distribution of free software. You can use, | 
|  | 6 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 7 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 8 # "http://www.cecill.info". | 
|  | 9 # | 
|  | 10 # As a counterpart to the access to the source code and rights to copy, | 
|  | 11 # modify and redistribute granted by the license, users are provided only | 
|  | 12 # with a limited warranty and the software's author, the holder of the | 
|  | 13 # economic rights, and the successive licensors have only limited | 
|  | 14 # liability. | 
|  | 15 # | 
|  | 16 # In this respect, the user's attention is drawn to the risks associated | 
|  | 17 # with loading, using, modifying and/or developing or reproducing the | 
|  | 18 # software by the user in light of its specific status of free software, | 
|  | 19 # that may mean that it is complicated to manipulate, and that also | 
|  | 20 # therefore means that it is reserved for developers and experienced | 
|  | 21 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 22 # encouraged to load and test the software's suitability as regards their | 
|  | 23 # requirements in conditions enabling the security of their systems and/or | 
|  | 24 # data to be ensured and, more generally, to use and operate it in the | 
|  | 25 # same conditions as regards security. | 
|  | 26 # | 
|  | 27 # The fact that you are presently reading this means that you have had | 
|  | 28 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 29 # | 
|  | 30 """Some useful functions""" | 
|  | 31 | 
|  | 32 import sys, os | 
|  | 33 import random | 
|  | 34 import subprocess | 
|  | 35 | 
|  | 36 | 
|  | 37 def writeFile(fileName, content): | 
|  | 38     """ | 
|  | 39     Write the content of a file | 
|  | 40     """ | 
|  | 41     handle = open(fileName, "w") | 
|  | 42     handle.write(content) | 
|  | 43     handle.close() | 
|  | 44 | 
|  | 45 def sumOfLists(list1, list2): | 
|  | 46     """ | 
|  | 47     Element by element sum | 
|  | 48     """ | 
|  | 49     if len(list1) != len(list2): | 
|  | 50         sys.exit("Cannot sum list whose sizes are different!") | 
|  | 51     return [list1[i] + list2[i] for i in range(len(list1))] | 
|  | 52 | 
|  | 53 | 
|  | 54 def protectBackslashes(string): | 
|  | 55     """ | 
|  | 56     Protect the backslashes in a path by adding another backslash | 
|  | 57     """ | 
|  | 58     return string.replace("\\", "\\\\") | 
|  | 59 | 
|  | 60 | 
|  | 61 def getHammingDistance(string1, string2): | 
|  | 62     """ | 
|  | 63     Compute Hamming distance between two strings | 
|  | 64     """ | 
|  | 65     if len(string1) != len(string2): | 
|  | 66         raise Exception("Error, size of %s and %s differ" % (string1, string2)) | 
|  | 67     return sum(ch1 != ch2 for ch1, ch2 in zip(string1, string2)) | 
|  | 68 | 
|  | 69 | 
|  | 70 def getLevenshteinDistance(string1, string2): | 
|  | 71     """ | 
|  | 72     Compute Levenshtein distance between two strings | 
|  | 73     """ | 
|  | 74     if len(string1) < len(string2): | 
|  | 75         return getLevenshteinDistance(string2, string1) | 
|  | 76     if not string1: | 
|  | 77         return len(string2) | 
|  | 78     previousRow = xrange(len(string2) + 1) | 
|  | 79     for i, c1 in enumerate(string1): | 
|  | 80         currentRow = [i + 1] | 
|  | 81         for j, c2 in enumerate(string2): | 
|  | 82             insertions    = previousRow[j + 1] + 1 | 
|  | 83             deletions     = currentRow[j] + 1 | 
|  | 84             substitutions = previousRow[j] + (c1 != c2) | 
|  | 85             currentRow.append(min(insertions, deletions, substitutions)) | 
|  | 86         previousRow = currentRow | 
|  | 87     return previousRow[-1] | 
|  | 88 | 
|  | 89 | 
|  | 90 def getMinAvgMedMax(values): | 
|  | 91     """ | 
|  | 92     Get some stats about a dict | 
|  | 93     @param values: a distribution (the value being the number of occurrences of the key) | 
|  | 94     @type values: dict int to int | 
|  | 95     @return: a tuple | 
|  | 96     """ | 
|  | 97     minValues = min(values.keys()) | 
|  | 98     maxValues = max(values.keys()) | 
|  | 99     sumValues = sum([value * values[value] for value in values]) | 
|  | 100     nbValues = sum(values.values()) | 
|  | 101     allValues = [] | 
|  | 102     for key in values: | 
|  | 103         for i in range(values[key]): | 
|  | 104             allValues.append(key) | 
|  | 105     sortedValues = sorted(allValues) | 
|  | 106     sorted(values.values()) | 
|  | 107     if (nbValues % 2 == 0): | 
|  | 108         medValues = (sortedValues[nbValues / 2 - 1] + sortedValues[nbValues / 2]) / 2.0 | 
|  | 109     else: | 
|  | 110         medValues = sortedValues[(nbValues + 1) / 2 - 1] | 
|  | 111     return (minValues, float(sumValues) / nbValues, medValues, maxValues) | 
|  | 112 | 
|  | 113 | 
|  | 114 def xor(value1, value2): | 
|  | 115     """ | 
|  | 116     Logical xor | 
|  | 117     @param value1: a value | 
|  | 118     @type value1: anything | 
|  | 119     @param value2: a value | 
|  | 120     @type value2: anything | 
|  | 121     """ | 
|  | 122     return bool(value1) != bool(value2) | 
|  | 123 | 
|  | 124 | 
|  | 125 def diff(fileName1, fileName2): | 
|  | 126     """ | 
|  | 127     Compare two files | 
|  | 128     @param fileName1: a file name | 
|  | 129     @type fileName1: string | 
|  | 130     @param fileName2: another file name | 
|  | 131     @type fileName2: string | 
|  | 132     @return: None if the files are the same, a string otherwise | 
|  | 133     """ | 
|  | 134     handle1 = open(fileName1) | 
|  | 135     lines1 = handle1.readlines() | 
|  | 136     handle2 = open(fileName2) | 
|  | 137     lines2 = handle2.readlines() | 
|  | 138     if len(lines1) != len(lines2): | 
|  | 139         print "Sizes of files differ (%d != %d)" % (len(lines1), len(lines2)) | 
|  | 140         return False | 
|  | 141     for i in xrange(len(lines1)): | 
|  | 142         if lines1[i] != lines2[i]: | 
|  | 143             print "Line %d differ ('%s' != '%s')" % (i, lines1[i].strip(), lines2[i].strip()) | 
|  | 144             return False | 
|  | 145     return True | 
|  | 146 | 
|  | 147 | 
|  | 148 def binomialCoefficient(a, b): | 
|  | 149     """ | 
|  | 150     Compute cumulated product from a to b | 
|  | 151     @param a: a value | 
|  | 152     @type    a: int | 
|  | 153     @param b: a value | 
|  | 154     @type    b: int | 
|  | 155     """ | 
|  | 156     if a > b / 2: | 
|  | 157         a = b-a | 
|  | 158     p = 1.0 | 
|  | 159     for i in range(b-a+1, b+1): | 
|  | 160         p *= i | 
|  | 161     q = 1.0 | 
|  | 162     for i in range(1, a+1): | 
|  | 163         q *= i | 
|  | 164     return p / q | 
|  | 165 | 
|  | 166 | 
|  | 167 memory = {} | 
|  | 168 | 
|  | 169 # def fisherExactPValue(a, b, c, d): | 
|  | 170 #     """ | 
|  | 171 #     P-value of Fisher exact test for 2x2 contingency table | 
|  | 172 #     """ | 
|  | 173 #     if (a, b, c, d) in memory: | 
|  | 174 #         return memory[(a, b, c, d)] | 
|  | 175 | 
|  | 176 #     n = a + b + c + d | 
|  | 177 #     i1 = binomialCoefficient(a, a+b) | 
|  | 178 #     i2 = binomialCoefficient(c, a+c) | 
|  | 179 #     i3 = binomialCoefficient(c+d, n) | 
|  | 180 #     pValue = i1 * i2 / i3 | 
|  | 181 | 
|  | 182 #     memory[(a, b, c, d)] = pValue | 
|  | 183 | 
|  | 184 #     return pValue | 
|  | 185 | 
|  | 186 | 
|  | 187 def fisherExactPValue(a, b, c, d): | 
|  | 188     if (a, b, c, d) in memory: | 
|  | 189         return memory[(a, b, c, d)] | 
|  | 190 | 
|  | 191     scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000)) | 
|  | 192     rScript = open(scriptFileName, "w") | 
|  | 193     rScript.write("data = matrix(c(%d, %d, %d, %d), nr=2)\n" % (a, b, c, d)) | 
|  | 194     rScript.write("fisher.test(data)\n") | 
|  | 195     #rScript.write("chisq.test(data)\n") | 
|  | 196     rScript.close() | 
|  | 197 | 
|  | 198     rCommand = "R" | 
|  | 199     if "SMARTRPATH" in os.environ: | 
|  | 200         rCommand = os.environ["SMARTRPATH"] | 
|  | 201     command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName) | 
|  | 202     status = subprocess.call(command, shell=True) | 
|  | 203 | 
|  | 204     if status != 0: | 
|  | 205         sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status)) | 
|  | 206 | 
|  | 207     outputRFileName = "%sout" % (scriptFileName) | 
|  | 208     outputRFile = open(outputRFileName) | 
|  | 209     pValue = None | 
|  | 210     pValueTag = "p-value " | 
|  | 211     for line in outputRFile: | 
|  | 212         line = line.strip() | 
|  | 213         if line == "": continue | 
|  | 214         for splittedLine in line.split(","): | 
|  | 215             splittedLine = splittedLine.strip() | 
|  | 216             if splittedLine.startswith(pValueTag): | 
|  | 217                 pValue = float(splittedLine.split()[-1]) | 
|  | 218                 break | 
|  | 219 | 
|  | 220     if pValue == None: | 
|  | 221         sys.exit("Problem with the cannot find p-value! File %s, values are: %d, %d, %d, %d" % (scriptFileName, a, b, c, d)) | 
|  | 222 | 
|  | 223     os.remove(scriptFileName) | 
|  | 224     os.remove(outputRFileName) | 
|  | 225 | 
|  | 226     memory[(a, b, c, d)] = pValue | 
|  | 227 | 
|  | 228     return pValue | 
|  | 229 | 
|  | 230 | 
|  | 231 def fisherExactPValueBulk(list): | 
|  | 232 | 
|  | 233     scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000)) | 
|  | 234     rScript = open(scriptFileName, "w") | 
|  | 235     for element in list: | 
|  | 236         rScript.write("fisher.test(matrix(c(%d, %d, %d, %d), nr=2))$p.value\n" % (int(element[0]), int(element[1]), int(element[2]), int(element[3]))) | 
|  | 237     rScript.close() | 
|  | 238 | 
|  | 239     rCommand = "R" | 
|  | 240     if "SMARTRPATH" in os.environ: | 
|  | 241         rCommand = os.environ["SMARTRPATH"] | 
|  | 242     command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName) | 
|  | 243     status = subprocess.call(command, shell=True) | 
|  | 244 | 
|  | 245     if status != 0: | 
|  | 246         sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status)) | 
|  | 247 | 
|  | 248     outputRFileName = "%sout" % (scriptFileName) | 
|  | 249     outputRFile = open(outputRFileName) | 
|  | 250     pValue = None | 
|  | 251     pValueTag = "[1] " | 
|  | 252     results = {} | 
|  | 253     cpt = 0 | 
|  | 254     for line in outputRFile: | 
|  | 255         line = line.strip() | 
|  | 256         if line == "": continue | 
|  | 257         if line.startswith(pValueTag): | 
|  | 258             pValue = float(line.split()[-1]) | 
|  | 259             results[list[cpt][0:2]] = pValue | 
|  | 260             cpt += 1 | 
|  | 261 | 
|  | 262     if pValue == None: | 
|  | 263         sys.exit("Problem with the cannot find p-value!") | 
|  | 264     if cpt != len(list): | 
|  | 265         sys.exit("Error in the number of p-values computed by R in file '%s'!" % (scriptFileName)) | 
|  | 266 | 
|  | 267     os.remove(scriptFileName) | 
|  | 268     os.remove(outputRFileName) | 
|  | 269 | 
|  | 270     return results | 
|  | 271 |