0
|
1 # ----------------------------------------------------------------------#
|
|
2 # Copyright (c) 2011, Richard Lupat & Jason Li.
|
|
3 #
|
|
4 # > Source License <
|
|
5 # This file is part of CONTRA.
|
|
6 #
|
|
7 # CONTRA is free software: you can redistribute it and/or modify
|
|
8 # it under the terms of the GNU General Public License as published by
|
|
9 # the Free Software Foundation, either version 3 of the License, or
|
|
10 # (at your option) any later version.
|
|
11 #
|
|
12 # CONTRA is distributed in the hope that it will be useful,
|
|
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 # GNU General Public License for more details.
|
|
16 #
|
|
17 # You should have received a copy of the GNU General Public License
|
|
18 # along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
|
|
19 #
|
|
20 #
|
|
21 #-----------------------------------------------------------------------#
|
|
22 # Last Updated : 28 Sep 2011 11:00AM
|
|
23
|
|
24 import sys
|
|
25 import math
|
|
26
|
|
27 def getAverage(list1):
|
|
28 if len(list1) > 0:
|
|
29 return float(sum(list1))/len(list1)
|
|
30
|
|
31 return 0.0
|
|
32
|
|
33 def getStdDev(list1, avg):
|
|
34 var = 0.0
|
|
35 for x in list1:
|
|
36 var += (avg - x) ** 2
|
|
37
|
|
38 if (len(list1)-1) > 0:
|
|
39 var /= (len(list1)-1)
|
|
40
|
|
41 return math.sqrt(var)
|
|
42
|
|
43 def getMinMax(list1):
|
|
44 length = len(list1)
|
|
45 if length != 0:
|
|
46 min = list1[0]
|
|
47 max = list1[length-1]
|
|
48 else:
|
|
49 min = 0
|
|
50 max = 0
|
|
51
|
|
52 return min, max
|
|
53
|
|
54 def getMedian(list1):
|
|
55 length = len(list1)
|
|
56 if length == 0:
|
|
57 median = 0
|
|
58 elif length % 2 == 0:
|
|
59 median = (list1[length/2]+list1[(length/2) - 1])/2
|
|
60 else:
|
|
61 median = list1[length/2]
|
|
62 return median
|
|
63
|
|
64 def createDataDict(count, list1, r, offset, id_check, exon_check):
|
|
65 tDict = {}
|
|
66 tDictOri = {}
|
|
67
|
|
68 while count < len(list1):
|
|
69 t = list1[count].split()
|
|
70 tId = t[5]
|
|
71 tExon = t[6]
|
|
72
|
|
73 if (tId != id_check) or (tExon != exon_check):
|
|
74 return count, tDict, tDictOri
|
|
75
|
|
76 tStart = int(t[2])
|
|
77 tEnd = int(t[3])
|
|
78 tCov = float(t[4]) / r + offset #GeoMean Normalisation
|
|
79 tCovOri = float(t[4]) + offset #without scaling
|
|
80
|
|
81 #filling dict
|
|
82 while tStart < tEnd:
|
|
83 tDict[tStart] = tCov
|
|
84 tDictOri[tStart] = tCovOri #without scaling
|
|
85 tStart += 1
|
|
86
|
|
87 count += 1
|
|
88
|
|
89 return count, tDict, tDictOri
|
|
90
|
|
91 def getFactor (val1, val2):
|
|
92 r = math.sqrt(val1 * val2)
|
|
93 r1 = val1/r
|
|
94 r2 = val2/r
|
|
95 return r1, r2
|
|
96
|
|
97 def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases):
|
|
98 tList = file.readlines(open(tFile))
|
|
99 nList = file.readlines(open(nFile))
|
|
100 # constant & counter
|
|
101 OFF = 1
|
|
102 tCount = 0
|
|
103 nCount = 0
|
|
104
|
|
105 # create and open files
|
|
106 output = open(averageOut, "w")
|
|
107
|
|
108 # Offset and Ratio for Geometric Mean Normalisation
|
|
109 r1, r2 = getFactor(tReadCount, nReadCount)
|
|
110 if rd_threshold > 0:
|
|
111 #OFF = 0
|
|
112 OFF = 0.5
|
|
113
|
|
114 #big loop
|
|
115 while (nCount < len(nList)):
|
|
116 # initialisation, get the chr, geneID, geneName
|
|
117 init = tList[tCount].split()
|
|
118 initial = init[5]
|
|
119 _exon = init[6]
|
|
120 chr = init[1]
|
|
121 gene = init[0]
|
|
122 _start = int(init[2])
|
|
123
|
|
124 # check if t-gene and n-gene refer to the same gene
|
|
125 check_init = nList[nCount].split()
|
|
126 if check_init[5] != initial or check_init[6] != _exon:
|
|
127 print "Initial: ", initial
|
|
128 print "Check_Init.id: ", check_init[5]
|
|
129 print "_Exon: ", _exon
|
|
130 print "Check_Init.exon: ", check_init[6]
|
|
131 print "Error. Comparing different Gene"
|
|
132 sys.exit(1)
|
|
133
|
|
134 # create data dictionary for tumour and normal data (per each regions/ exon)
|
|
135 tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon)
|
|
136 nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon)
|
|
137 # check number of bases in the both gene dict
|
|
138 if len(nDict) != len(tDict):
|
|
139 print "N:", len(nDict)
|
|
140 print "T:", len(tDict)
|
|
141 print "Error. Different length of dict"
|
|
142 sys.exit(1)
|
|
143
|
|
144 # compare coverage
|
|
145 count = _start
|
|
146 _max = max(nDict.keys())
|
|
147 ratioList = []
|
|
148 tumourList = []
|
|
149 normalList = []
|
|
150 tumourOriList = []
|
|
151 normalOriList = []
|
|
152 while count <= _max:
|
|
153 # get ratio
|
|
154 if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold):
|
|
155 ratio = 0.0
|
|
156 else:
|
|
157 if tDict[count] == 0:
|
|
158 tDict[count] = 0.5
|
|
159
|
|
160 ratio = math.log((float(tDict[count]) / nDict[count]),2)
|
|
161 tumourList.append(tDict[count])
|
|
162 tumourOriList.append(tDictOri[count])
|
|
163 normalList.append(nDict[count])
|
|
164 normalOriList.append(nDictOri[count])
|
|
165 ratioList.append(ratio)
|
|
166
|
|
167 count += 1
|
|
168
|
|
169 ratioLen = len(ratioList)
|
|
170
|
|
171 # get average
|
|
172 avg = getAverage(ratioList)
|
|
173 sd = getStdDev(ratioList, avg)
|
|
174 tumourAvg= str(round(getAverage(tumourList),3))
|
|
175 normalAvg= str(round(getAverage(normalList),3))
|
|
176 tumourOriAvg = str(round(getAverage(tumourOriList),3))
|
|
177 normalOriAvg = str(round(getAverage(normalOriList),3))
|
|
178
|
|
179 # get median
|
|
180 ratioList.sort()
|
|
181 min_logratio, max_logratio = getMinMax(ratioList)
|
|
182 median = getMedian(ratioList)
|
|
183
|
|
184 # write output
|
|
185 if ratioLen >= minNBases:
|
|
186 output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t")
|
|
187 output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t")
|
|
188 output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t")
|
|
189 output.write(tumourOriAvg + "\t" + normalOriAvg + "\t")
|
|
190 output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t")
|
|
191 output.write(str(round(max_logratio,3)) + "\n")
|
|
192
|
|
193 output.close()
|
|
194
|
|
195 #print "End of averageCount.py with the last target = '%s'" %(initial)
|