comparison cluster_onConnctdComps.py @ 31:a631c2f6d913

Update to Miller Lab devshed revision 3c4110ffacc3
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:25:27 -0400
parents
children
comparison
equal deleted inserted replaced
30:4188853b940b 31:a631c2f6d913
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Cluster_GOKEGG.py
5 #
6 # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 # MA 02110-1301, USA.
22
23 import argparse
24 import os
25 from networkx import connected_components,Graph,clustering
26 from numpy import percentile
27 from decimal import Decimal,getcontext
28 from itertools import permutations,combinations
29 import sys
30
31 def rtrnClustrsOnCltrCoff(dNodesWeightMin,threshold,perctile=True):
32 """
33 From a file with three columns: nodeA, nodeB and a score, it returns
34 the strong and weak connected components produced when the edges
35 below the percentage threshold (or value) are excluded.
36 """
37 #~
38 Gmin = Graph()
39 for nodeA,nodeB in dNodesWeightMin:
40 wMin=dNodesWeightMin[nodeA,nodeB]
41 Gmin.add_edge(nodeA,nodeB,weight=wMin)
42 #~
43 clstrCoffcMin=clustering(Gmin,weight='weight')
44 #~
45 if perctile:
46 umbralMin=percentile(clstrCoffcMin.values(),threshold)
47 else:
48 umbralMin=threshold
49 #~
50 GminNdsRmv=[x for x in clstrCoffcMin if clstrCoffcMin[x]<umbralMin]
51 #~
52 Gmin.remove_nodes_from(GminNdsRmv)
53 #~
54 dTermCmptNumbWkMin=rtrndata(Gmin)
55 #~
56 salelClustr=[]
57 srtdterms=sorted(dTermCmptNumbWkMin.keys())
58 for echTerm in srtdterms:
59 try:
60 MinT=dTermCmptNumbWkMin[echTerm]
61 except:
62 MinT='-'
63 salelClustr.append('\t'.join([echTerm,MinT]))
64 #~
65 return salelClustr
66
67 def rtrndata(G):
68 """
69 returna list of terms and its clustering, as well as clusters from
70 a networkx formatted file.
71 """
72 #~
73 cntCompnts=0
74 dTermCmptNumbWk={}
75 for echCompnt in connected_components(G):
76 cntCompnts+=1
77 #print '.'.join(echCompnt)
78 for echTerm in echCompnt:
79 dTermCmptNumbWk[echTerm]=str(cntCompnts)
80 #~
81 return dTermCmptNumbWk
82
83 def rtrnCATcENSEMBLc(inCATfile,classClmns,ENSEMBLTcolmn,nonHdr=True):
84 """
85 return a dictionary of all the categories in an input file with
86 a set of genes. Takes as input a file with categories an genes.
87 """
88 dCAT={}
89 dENSEMBLTCAT={}
90 for eachl in open(inCATfile,'r'):
91 if nonHdr and eachl.strip():
92 ENSEMBLT=eachl.splitlines()[0].split('\t')[ENSEMBLTcolmn]
93 sCAT=set()
94 for CATcolmn in classClmns:
95 sCAT.update(set([x for x in eachl.splitlines()[0].split('\t')[CATcolmn].split('.')]))
96 sCAT=sCAT.difference(set(['','U','N']))
97 if len(sCAT)>0:
98 dENSEMBLTCAT[ENSEMBLT]=sCAT
99 for CAT in sCAT:
100 try:
101 dCAT[CAT].add(ENSEMBLT)
102 except:
103 dCAT[CAT]=set([ENSEMBLT])
104 nonHdr=True
105 #~
106 dCAT=dict([(x,len(dCAT[x])) for x in dCAT.keys()])
107 #~
108 return dCAT,dENSEMBLTCAT
109
110
111 def calcDistance(sCAT1,sCAT2):
112 """
113 takes as input two set of genesin different categories and returns
114 a value 1-percentage of gene shared cat1->cat2, and cat2->cat1.
115 """
116 getcontext().prec=5
117 lgensS1=Decimal(len(sCAT1))
118 lgensS2=Decimal(len(sCAT2))
119 shrdGns=sCAT1.intersection(sCAT2)
120 lenshrdGns=len(shrdGns)
121 #~
122 dC1C2=1-(lenshrdGns/lgensS1)
123 dC2C1=1-(lenshrdGns/lgensS2)
124 #~
125 return dC1C2,dC2C1
126
127 def rtnPrwsdtncs(dCAT,dENSEMBLTCAT):
128 """
129 return a mcl formated pairwise distances from a list of categories
130 """
131 #~
132 getcontext().prec=5
133 dCATdst={}
134 lENSEMBL=dENSEMBLTCAT.keys()
135 l=len(lENSEMBL)
136 c=0
137 for ENSEMBL in lENSEMBL:
138 c+=1
139 lCAT=dENSEMBLTCAT.pop(ENSEMBL)
140 for CAT1,CAT2 in combinations(lCAT, 2):
141 try:
142 dCATdst[CAT1,CAT2]+=1
143 except:
144 dCATdst[CAT1,CAT2]=1
145 try:
146 dCATdst[CAT2,CAT1]+=1
147 except:
148 dCATdst[CAT2,CAT1]=1
149 #~
150 dNodesWeightMin={}
151 l=len(dCATdst)
152 for CAT1,CAT2 in dCATdst.keys():
153 shrdGns=dCATdst.pop((CAT1,CAT2))
154 dC1C2=float(shrdGns)
155 nodeA,nodeB=sorted([CAT1,CAT2])
156 try:
157 cscor=dNodesWeightMin[nodeA,nodeB]
158 if cscor>=dC1C2:
159 dNodesWeightMin[nodeA,nodeB]=dC1C2
160 except:
161 dNodesWeightMin[nodeA,nodeB]=dC1C2
162 #
163 return dNodesWeightMin
164
165 def parse_class_columns(val, max_col):
166 int_list = []
167
168 for elem in [x.strip() for x in val.split(',')]:
169 if elem[0].lower() != 'c':
170 print >> sys.stderr, "bad column format:", elem
171 sys.exit(1)
172
173 int_val = as_int(elem[1:])
174
175 if int_val is None:
176 print >> sys.stderr, "bad column format:", elem
177 sys.exit(1)
178 elif not 1 <= int_val <= max_col:
179 print >> sys.stderr, "column out of range:", elem
180 sys.exit(1)
181
182 int_list.append(int_val - 1)
183
184 return int_list
185
186 def as_int(val):
187 try:
188 return int(val)
189 except ValueError:
190 return None
191 else:
192 raise
193
194 def main():
195 """
196 """
197 #~ bpython cluster_onConnctdComps.py --input=../conctFinal_CEU.tsv --outfile=../borrar.txt --threshold=90 --ENSEMBLTcolmn=1 --classClmns='20 22'
198 parser = argparse.ArgumentParser(description='Returns the count of genes in ...')
199 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True)
200 parser.add_argument('--input_columns',metavar='input INT value',type=int,help='the number of columns in the input file.',required=True)
201 parser.add_argument('--outfile',metavar='input TXT file',type=str,help='the output file with the connected components.',required=True)
202 parser.add_argument('--threshold',metavar='input FLOAT value',type=float,help='the threshold to disconnect the nodes.',required=True)
203 parser.add_argument('--ENSEMBLTcolmn',metavar='input INT file',type=int,help='the column with the ENSEMBLE code in the input.',required=True)
204 parser.add_argument('--classClmns',metavar='input STR value',type=str,help='the list of columns with the gene categories separated by space.',required=True)
205 args = parser.parse_args()
206 infile = args.input
207 threshold = args.threshold
208 outfile = args.outfile
209 ENSEMBLTcolmn = args.ENSEMBLTcolmn
210 classClmns = parse_class_columns(args.classClmns, args.input_columns)
211 #~
212 dCAT,dENSEMBLTCAT=rtrnCATcENSEMBLc(infile,classClmns,ENSEMBLTcolmn)
213 dNodesWeightMin=rtnPrwsdtncs(dCAT,dENSEMBLTCAT)
214 salelClustr=rtrnClustrsOnCltrCoff(dNodesWeightMin,threshold)
215 #~
216 with open(outfile, 'w') as salef:
217 print >> salef, '\n'.join(salelClustr)
218 #~
219 #~
220
221 if __name__ == '__main__':
222 main()
223