Mercurial > repos > miller-lab > genome_diversity
comparison cluster_onConnctdComps.py @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:25:27 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
30:4188853b940b | 31:a631c2f6d913 |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 # | |
4 # Cluster_GOKEGG.py | |
5 # | |
6 # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> | |
7 # | |
8 # This program is free software; you can redistribute it and/or modify | |
9 # it under the terms of the GNU General Public License as published by | |
10 # the Free Software Foundation; either version 2 of the License, or | |
11 # (at your option) any later version. | |
12 # | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU General Public License for more details. | |
17 # | |
18 # You should have received a copy of the GNU General Public License | |
19 # along with this program; if not, write to the Free Software | |
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
21 # MA 02110-1301, USA. | |
22 | |
23 import argparse | |
24 import os | |
25 from networkx import connected_components,Graph,clustering | |
26 from numpy import percentile | |
27 from decimal import Decimal,getcontext | |
28 from itertools import permutations,combinations | |
29 import sys | |
30 | |
31 def rtrnClustrsOnCltrCoff(dNodesWeightMin,threshold,perctile=True): | |
32 """ | |
33 From a file with three columns: nodeA, nodeB and a score, it returns | |
34 the strong and weak connected components produced when the edges | |
35 below the percentage threshold (or value) are excluded. | |
36 """ | |
37 #~ | |
38 Gmin = Graph() | |
39 for nodeA,nodeB in dNodesWeightMin: | |
40 wMin=dNodesWeightMin[nodeA,nodeB] | |
41 Gmin.add_edge(nodeA,nodeB,weight=wMin) | |
42 #~ | |
43 clstrCoffcMin=clustering(Gmin,weight='weight') | |
44 #~ | |
45 if perctile: | |
46 umbralMin=percentile(clstrCoffcMin.values(),threshold) | |
47 else: | |
48 umbralMin=threshold | |
49 #~ | |
50 GminNdsRmv=[x for x in clstrCoffcMin if clstrCoffcMin[x]<umbralMin] | |
51 #~ | |
52 Gmin.remove_nodes_from(GminNdsRmv) | |
53 #~ | |
54 dTermCmptNumbWkMin=rtrndata(Gmin) | |
55 #~ | |
56 salelClustr=[] | |
57 srtdterms=sorted(dTermCmptNumbWkMin.keys()) | |
58 for echTerm in srtdterms: | |
59 try: | |
60 MinT=dTermCmptNumbWkMin[echTerm] | |
61 except: | |
62 MinT='-' | |
63 salelClustr.append('\t'.join([echTerm,MinT])) | |
64 #~ | |
65 return salelClustr | |
66 | |
67 def rtrndata(G): | |
68 """ | |
69 returna list of terms and its clustering, as well as clusters from | |
70 a networkx formatted file. | |
71 """ | |
72 #~ | |
73 cntCompnts=0 | |
74 dTermCmptNumbWk={} | |
75 for echCompnt in connected_components(G): | |
76 cntCompnts+=1 | |
77 #print '.'.join(echCompnt) | |
78 for echTerm in echCompnt: | |
79 dTermCmptNumbWk[echTerm]=str(cntCompnts) | |
80 #~ | |
81 return dTermCmptNumbWk | |
82 | |
83 def rtrnCATcENSEMBLc(inCATfile,classClmns,ENSEMBLTcolmn,nonHdr=True): | |
84 """ | |
85 return a dictionary of all the categories in an input file with | |
86 a set of genes. Takes as input a file with categories an genes. | |
87 """ | |
88 dCAT={} | |
89 dENSEMBLTCAT={} | |
90 for eachl in open(inCATfile,'r'): | |
91 if nonHdr and eachl.strip(): | |
92 ENSEMBLT=eachl.splitlines()[0].split('\t')[ENSEMBLTcolmn] | |
93 sCAT=set() | |
94 for CATcolmn in classClmns: | |
95 sCAT.update(set([x for x in eachl.splitlines()[0].split('\t')[CATcolmn].split('.')])) | |
96 sCAT=sCAT.difference(set(['','U','N'])) | |
97 if len(sCAT)>0: | |
98 dENSEMBLTCAT[ENSEMBLT]=sCAT | |
99 for CAT in sCAT: | |
100 try: | |
101 dCAT[CAT].add(ENSEMBLT) | |
102 except: | |
103 dCAT[CAT]=set([ENSEMBLT]) | |
104 nonHdr=True | |
105 #~ | |
106 dCAT=dict([(x,len(dCAT[x])) for x in dCAT.keys()]) | |
107 #~ | |
108 return dCAT,dENSEMBLTCAT | |
109 | |
110 | |
111 def calcDistance(sCAT1,sCAT2): | |
112 """ | |
113 takes as input two set of genesin different categories and returns | |
114 a value 1-percentage of gene shared cat1->cat2, and cat2->cat1. | |
115 """ | |
116 getcontext().prec=5 | |
117 lgensS1=Decimal(len(sCAT1)) | |
118 lgensS2=Decimal(len(sCAT2)) | |
119 shrdGns=sCAT1.intersection(sCAT2) | |
120 lenshrdGns=len(shrdGns) | |
121 #~ | |
122 dC1C2=1-(lenshrdGns/lgensS1) | |
123 dC2C1=1-(lenshrdGns/lgensS2) | |
124 #~ | |
125 return dC1C2,dC2C1 | |
126 | |
127 def rtnPrwsdtncs(dCAT,dENSEMBLTCAT): | |
128 """ | |
129 return a mcl formated pairwise distances from a list of categories | |
130 """ | |
131 #~ | |
132 getcontext().prec=5 | |
133 dCATdst={} | |
134 lENSEMBL=dENSEMBLTCAT.keys() | |
135 l=len(lENSEMBL) | |
136 c=0 | |
137 for ENSEMBL in lENSEMBL: | |
138 c+=1 | |
139 lCAT=dENSEMBLTCAT.pop(ENSEMBL) | |
140 for CAT1,CAT2 in combinations(lCAT, 2): | |
141 try: | |
142 dCATdst[CAT1,CAT2]+=1 | |
143 except: | |
144 dCATdst[CAT1,CAT2]=1 | |
145 try: | |
146 dCATdst[CAT2,CAT1]+=1 | |
147 except: | |
148 dCATdst[CAT2,CAT1]=1 | |
149 #~ | |
150 dNodesWeightMin={} | |
151 l=len(dCATdst) | |
152 for CAT1,CAT2 in dCATdst.keys(): | |
153 shrdGns=dCATdst.pop((CAT1,CAT2)) | |
154 dC1C2=float(shrdGns) | |
155 nodeA,nodeB=sorted([CAT1,CAT2]) | |
156 try: | |
157 cscor=dNodesWeightMin[nodeA,nodeB] | |
158 if cscor>=dC1C2: | |
159 dNodesWeightMin[nodeA,nodeB]=dC1C2 | |
160 except: | |
161 dNodesWeightMin[nodeA,nodeB]=dC1C2 | |
162 # | |
163 return dNodesWeightMin | |
164 | |
165 def parse_class_columns(val, max_col): | |
166 int_list = [] | |
167 | |
168 for elem in [x.strip() for x in val.split(',')]: | |
169 if elem[0].lower() != 'c': | |
170 print >> sys.stderr, "bad column format:", elem | |
171 sys.exit(1) | |
172 | |
173 int_val = as_int(elem[1:]) | |
174 | |
175 if int_val is None: | |
176 print >> sys.stderr, "bad column format:", elem | |
177 sys.exit(1) | |
178 elif not 1 <= int_val <= max_col: | |
179 print >> sys.stderr, "column out of range:", elem | |
180 sys.exit(1) | |
181 | |
182 int_list.append(int_val - 1) | |
183 | |
184 return int_list | |
185 | |
186 def as_int(val): | |
187 try: | |
188 return int(val) | |
189 except ValueError: | |
190 return None | |
191 else: | |
192 raise | |
193 | |
194 def main(): | |
195 """ | |
196 """ | |
197 #~ bpython cluster_onConnctdComps.py --input=../conctFinal_CEU.tsv --outfile=../borrar.txt --threshold=90 --ENSEMBLTcolmn=1 --classClmns='20 22' | |
198 parser = argparse.ArgumentParser(description='Returns the count of genes in ...') | |
199 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) | |
200 parser.add_argument('--input_columns',metavar='input INT value',type=int,help='the number of columns in the input file.',required=True) | |
201 parser.add_argument('--outfile',metavar='input TXT file',type=str,help='the output file with the connected components.',required=True) | |
202 parser.add_argument('--threshold',metavar='input FLOAT value',type=float,help='the threshold to disconnect the nodes.',required=True) | |
203 parser.add_argument('--ENSEMBLTcolmn',metavar='input INT file',type=int,help='the column with the ENSEMBLE code in the input.',required=True) | |
204 parser.add_argument('--classClmns',metavar='input STR value',type=str,help='the list of columns with the gene categories separated by space.',required=True) | |
205 args = parser.parse_args() | |
206 infile = args.input | |
207 threshold = args.threshold | |
208 outfile = args.outfile | |
209 ENSEMBLTcolmn = args.ENSEMBLTcolmn | |
210 classClmns = parse_class_columns(args.classClmns, args.input_columns) | |
211 #~ | |
212 dCAT,dENSEMBLTCAT=rtrnCATcENSEMBLc(infile,classClmns,ENSEMBLTcolmn) | |
213 dNodesWeightMin=rtnPrwsdtncs(dCAT,dENSEMBLTCAT) | |
214 salelClustr=rtrnClustrsOnCltrCoff(dNodesWeightMin,threshold) | |
215 #~ | |
216 with open(outfile, 'w') as salef: | |
217 print >> salef, '\n'.join(salelClustr) | |
218 #~ | |
219 #~ | |
220 | |
221 if __name__ == '__main__': | |
222 main() | |
223 |