Mercurial > repos > fcaramia > contra
comparison Contra/scripts/average_count.py @ 0:7564f3b1e675
Uploaded
| author | fcaramia | 
|---|---|
| date | Thu, 13 Sep 2012 02:31:43 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:7564f3b1e675 | 
|---|---|
| 1 # ----------------------------------------------------------------------# | |
| 2 # Copyright (c) 2011, Richard Lupat & Jason Li. | |
| 3 # | |
| 4 # > Source License < | |
| 5 # This file is part of CONTRA. | |
| 6 # | |
| 7 # CONTRA is free software: you can redistribute it and/or modify | |
| 8 # it under the terms of the GNU General Public License as published by | |
| 9 # the Free Software Foundation, either version 3 of the License, or | |
| 10 # (at your option) any later version. | |
| 11 # | |
| 12 # CONTRA is distributed in the hope that it will be useful, | |
| 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 15 # GNU General Public License for more details. | |
| 16 # | |
| 17 # You should have received a copy of the GNU General Public License | |
| 18 # along with CONTRA. If not, see <http://www.gnu.org/licenses/>. | |
| 19 # | |
| 20 # | |
| 21 #-----------------------------------------------------------------------# | |
| 22 # Last Updated : 28 Sep 2011 11:00AM | |
| 23 | |
| 24 import sys | |
| 25 import math | |
| 26 | |
| 27 def getAverage(list1): | |
| 28 if len(list1) > 0: | |
| 29 return float(sum(list1))/len(list1) | |
| 30 | |
| 31 return 0.0 | |
| 32 | |
| 33 def getStdDev(list1, avg): | |
| 34 var = 0.0 | |
| 35 for x in list1: | |
| 36 var += (avg - x) ** 2 | |
| 37 | |
| 38 if (len(list1)-1) > 0: | |
| 39 var /= (len(list1)-1) | |
| 40 | |
| 41 return math.sqrt(var) | |
| 42 | |
| 43 def getMinMax(list1): | |
| 44 length = len(list1) | |
| 45 if length != 0: | |
| 46 min = list1[0] | |
| 47 max = list1[length-1] | |
| 48 else: | |
| 49 min = 0 | |
| 50 max = 0 | |
| 51 | |
| 52 return min, max | |
| 53 | |
| 54 def getMedian(list1): | |
| 55 length = len(list1) | |
| 56 if length == 0: | |
| 57 median = 0 | |
| 58 elif length % 2 == 0: | |
| 59 median = (list1[length/2]+list1[(length/2) - 1])/2 | |
| 60 else: | |
| 61 median = list1[length/2] | |
| 62 return median | |
| 63 | |
| 64 def createDataDict(count, list1, r, offset, id_check, exon_check): | |
| 65 tDict = {} | |
| 66 tDictOri = {} | |
| 67 | |
| 68 while count < len(list1): | |
| 69 t = list1[count].split() | |
| 70 tId = t[5] | |
| 71 tExon = t[6] | |
| 72 | |
| 73 if (tId != id_check) or (tExon != exon_check): | |
| 74 return count, tDict, tDictOri | |
| 75 | |
| 76 tStart = int(t[2]) | |
| 77 tEnd = int(t[3]) | |
| 78 tCov = float(t[4]) / r + offset #GeoMean Normalisation | |
| 79 tCovOri = float(t[4]) + offset #without scaling | |
| 80 | |
| 81 #filling dict | |
| 82 while tStart < tEnd: | |
| 83 tDict[tStart] = tCov | |
| 84 tDictOri[tStart] = tCovOri #without scaling | |
| 85 tStart += 1 | |
| 86 | |
| 87 count += 1 | |
| 88 | |
| 89 return count, tDict, tDictOri | |
| 90 | |
| 91 def getFactor (val1, val2): | |
| 92 r = math.sqrt(val1 * val2) | |
| 93 r1 = val1/r | |
| 94 r2 = val2/r | |
| 95 return r1, r2 | |
| 96 | |
| 97 def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases): | |
| 98 tList = file.readlines(open(tFile)) | |
| 99 nList = file.readlines(open(nFile)) | |
| 100 # constant & counter | |
| 101 OFF = 1 | |
| 102 tCount = 0 | |
| 103 nCount = 0 | |
| 104 | |
| 105 # create and open files | |
| 106 output = open(averageOut, "w") | |
| 107 | |
| 108 # Offset and Ratio for Geometric Mean Normalisation | |
| 109 r1, r2 = getFactor(tReadCount, nReadCount) | |
| 110 if rd_threshold > 0: | |
| 111 #OFF = 0 | |
| 112 OFF = 0.5 | |
| 113 | |
| 114 #big loop | |
| 115 while (nCount < len(nList)): | |
| 116 # initialisation, get the chr, geneID, geneName | |
| 117 init = tList[tCount].split() | |
| 118 initial = init[5] | |
| 119 _exon = init[6] | |
| 120 chr = init[1] | |
| 121 gene = init[0] | |
| 122 _start = int(init[2]) | |
| 123 | |
| 124 # check if t-gene and n-gene refer to the same gene | |
| 125 check_init = nList[nCount].split() | |
| 126 if check_init[5] != initial or check_init[6] != _exon: | |
| 127 print "Initial: ", initial | |
| 128 print "Check_Init.id: ", check_init[5] | |
| 129 print "_Exon: ", _exon | |
| 130 print "Check_Init.exon: ", check_init[6] | |
| 131 print "Error. Comparing different Gene" | |
| 132 sys.exit(1) | |
| 133 | |
| 134 # create data dictionary for tumour and normal data (per each regions/ exon) | |
| 135 tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon) | |
| 136 nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon) | |
| 137 # check number of bases in the both gene dict | |
| 138 if len(nDict) != len(tDict): | |
| 139 print "N:", len(nDict) | |
| 140 print "T:", len(tDict) | |
| 141 print "Error. Different length of dict" | |
| 142 sys.exit(1) | |
| 143 | |
| 144 # compare coverage | |
| 145 count = _start | |
| 146 _max = max(nDict.keys()) | |
| 147 ratioList = [] | |
| 148 tumourList = [] | |
| 149 normalList = [] | |
| 150 tumourOriList = [] | |
| 151 normalOriList = [] | |
| 152 while count <= _max: | |
| 153 # get ratio | |
| 154 if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold): | |
| 155 ratio = 0.0 | |
| 156 else: | |
| 157 if tDict[count] == 0: | |
| 158 tDict[count] = 0.5 | |
| 159 | |
| 160 ratio = math.log((float(tDict[count]) / nDict[count]),2) | |
| 161 tumourList.append(tDict[count]) | |
| 162 tumourOriList.append(tDictOri[count]) | |
| 163 normalList.append(nDict[count]) | |
| 164 normalOriList.append(nDictOri[count]) | |
| 165 ratioList.append(ratio) | |
| 166 | |
| 167 count += 1 | |
| 168 | |
| 169 ratioLen = len(ratioList) | |
| 170 | |
| 171 # get average | |
| 172 avg = getAverage(ratioList) | |
| 173 sd = getStdDev(ratioList, avg) | |
| 174 tumourAvg= str(round(getAverage(tumourList),3)) | |
| 175 normalAvg= str(round(getAverage(normalList),3)) | |
| 176 tumourOriAvg = str(round(getAverage(tumourOriList),3)) | |
| 177 normalOriAvg = str(round(getAverage(normalOriList),3)) | |
| 178 | |
| 179 # get median | |
| 180 ratioList.sort() | |
| 181 min_logratio, max_logratio = getMinMax(ratioList) | |
| 182 median = getMedian(ratioList) | |
| 183 | |
| 184 # write output | |
| 185 if ratioLen >= minNBases: | |
| 186 output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t") | |
| 187 output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t") | |
| 188 output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t") | |
| 189 output.write(tumourOriAvg + "\t" + normalOriAvg + "\t") | |
| 190 output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t") | |
| 191 output.write(str(round(max_logratio,3)) + "\n") | |
| 192 | |
| 193 output.close() | |
| 194 | |
| 195 #print "End of averageCount.py with the last target = '%s'" %(initial) | 
