diff cluster_onConnctdComps.py @ 31:a631c2f6d913

Update to Miller Lab devshed revision 3c4110ffacc3
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:25:27 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster_onConnctdComps.py	Fri Sep 20 13:25:27 2013 -0400
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#       Cluster_GOKEGG.py
+#       
+#       Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
+#       
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#       
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#       GNU General Public License for more details.
+#       
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+
+import argparse
+import os
+from networkx import connected_components,Graph,clustering
+from numpy import percentile
+from decimal import Decimal,getcontext
+from itertools import permutations,combinations
+import sys
+
+def rtrnClustrsOnCltrCoff(dNodesWeightMin,threshold,perctile=True):
+	"""
+	From a file with three columns: nodeA, nodeB and a score, it returns
+	the strong and weak connected components produced when the edges 
+	below the percentage threshold (or value) are excluded.
+	"""
+	#~ 
+	Gmin = Graph()
+	for nodeA,nodeB in dNodesWeightMin:
+		wMin=dNodesWeightMin[nodeA,nodeB]
+		Gmin.add_edge(nodeA,nodeB,weight=wMin)
+	#~ 
+	clstrCoffcMin=clustering(Gmin,weight='weight')
+	#~ 
+	if perctile:
+		umbralMin=percentile(clstrCoffcMin.values(),threshold)
+	else:
+		umbralMin=threshold
+	#~ 
+	GminNdsRmv=[x for x in clstrCoffcMin if clstrCoffcMin[x]<umbralMin]
+	#~ 
+	Gmin.remove_nodes_from(GminNdsRmv)
+	#~ 
+	dTermCmptNumbWkMin=rtrndata(Gmin)
+	#~
+	salelClustr=[]
+	srtdterms=sorted(dTermCmptNumbWkMin.keys())
+	for echTerm in srtdterms:
+		try:
+			MinT=dTermCmptNumbWkMin[echTerm]
+		except:
+			MinT='-'
+		salelClustr.append('\t'.join([echTerm,MinT]))
+	#~ 
+	return salelClustr
+	
+def rtrndata(G):
+	"""
+	returna list of terms and its clustering, as well as clusters from
+	a networkx formatted file.
+	"""
+	#~ 
+	cntCompnts=0
+	dTermCmptNumbWk={}
+	for echCompnt in connected_components(G):
+		cntCompnts+=1
+		#print '.'.join(echCompnt)
+		for echTerm in echCompnt:
+			dTermCmptNumbWk[echTerm]=str(cntCompnts)
+	#~ 
+	return dTermCmptNumbWk
+
+def rtrnCATcENSEMBLc(inCATfile,classClmns,ENSEMBLTcolmn,nonHdr=True):
+	"""
+	return a dictionary of all the categories in an input file with 
+	a set of genes. Takes as input a file with categories an genes.
+	"""
+	dCAT={}
+	dENSEMBLTCAT={}
+	for eachl in open(inCATfile,'r'):
+		if nonHdr and eachl.strip():
+			ENSEMBLT=eachl.splitlines()[0].split('\t')[ENSEMBLTcolmn]
+			sCAT=set()
+			for CATcolmn in classClmns:
+				sCAT.update(set([x for x in eachl.splitlines()[0].split('\t')[CATcolmn].split('.')]))
+			sCAT=sCAT.difference(set(['','U','N']))
+			if len(sCAT)>0:
+				dENSEMBLTCAT[ENSEMBLT]=sCAT
+			for CAT in sCAT:
+				try:
+					dCAT[CAT].add(ENSEMBLT)
+				except:
+					dCAT[CAT]=set([ENSEMBLT])
+		nonHdr=True
+	#~ 
+	dCAT=dict([(x,len(dCAT[x])) for x in dCAT.keys()])
+	#~ 
+	return dCAT,dENSEMBLTCAT
+
+
+def calcDistance(sCAT1,sCAT2):
+	"""
+	takes as input two set of genesin different categories and returns
+	a value 1-percentage of gene shared cat1->cat2, and cat2->cat1.
+	"""
+	getcontext().prec=5
+	lgensS1=Decimal(len(sCAT1))
+	lgensS2=Decimal(len(sCAT2))
+	shrdGns=sCAT1.intersection(sCAT2)
+	lenshrdGns=len(shrdGns)
+	#~ 
+	dC1C2=1-(lenshrdGns/lgensS1)
+	dC2C1=1-(lenshrdGns/lgensS2)
+	#~ 
+	return dC1C2,dC2C1
+
+def rtnPrwsdtncs(dCAT,dENSEMBLTCAT):
+	"""
+	return a mcl formated pairwise distances from a list of categories
+	"""	
+	#~ 
+	getcontext().prec=5
+	dCATdst={}
+	lENSEMBL=dENSEMBLTCAT.keys()
+	l=len(lENSEMBL)
+	c=0
+	for ENSEMBL in lENSEMBL:
+		c+=1
+		lCAT=dENSEMBLTCAT.pop(ENSEMBL)
+		for CAT1,CAT2 in combinations(lCAT, 2):
+			try:
+				dCATdst[CAT1,CAT2]+=1
+			except:
+				dCATdst[CAT1,CAT2]=1
+			try:
+				dCATdst[CAT2,CAT1]+=1
+			except:
+				dCATdst[CAT2,CAT1]=1
+	#~ 
+	dNodesWeightMin={}
+	l=len(dCATdst)
+	for CAT1,CAT2 in dCATdst.keys():
+		shrdGns=dCATdst.pop((CAT1,CAT2))
+		dC1C2=float(shrdGns)
+		nodeA,nodeB=sorted([CAT1,CAT2])
+		try:
+			cscor=dNodesWeightMin[nodeA,nodeB]
+			if cscor>=dC1C2:
+				dNodesWeightMin[nodeA,nodeB]=dC1C2
+		except:
+			dNodesWeightMin[nodeA,nodeB]=dC1C2
+	#
+	return dNodesWeightMin
+
+def parse_class_columns(val, max_col):
+	int_list = []
+
+	for elem in [x.strip() for x in val.split(',')]:
+		if elem[0].lower() != 'c':
+			print >> sys.stderr, "bad column format:", elem
+			sys.exit(1)
+
+        int_val = as_int(elem[1:])
+
+        if int_val is None:
+			print >> sys.stderr, "bad column format:", elem
+			sys.exit(1)
+        elif not 1 <= int_val <= max_col:
+			print >> sys.stderr, "column out of range:", elem
+			sys.exit(1)
+
+        int_list.append(int_val - 1)
+
+	return int_list
+
+def as_int(val):
+    try:
+        return int(val)
+    except ValueError:
+        return None
+    else:
+        raise
+
+def main():
+	"""
+	"""
+	#~ bpython cluster_onConnctdComps.py --input=../conctFinal_CEU.tsv --outfile=../borrar.txt --threshold=90 --ENSEMBLTcolmn=1 --classClmns='20 22'
+	parser = argparse.ArgumentParser(description='Returns the count of genes in ...')
+	parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True)
+	parser.add_argument('--input_columns',metavar='input INT value',type=int,help='the number of columns in the input file.',required=True)
+	parser.add_argument('--outfile',metavar='input TXT file',type=str,help='the output file with the connected components.',required=True)
+	parser.add_argument('--threshold',metavar='input FLOAT value',type=float,help='the threshold to disconnect the nodes.',required=True)
+	parser.add_argument('--ENSEMBLTcolmn',metavar='input INT file',type=int,help='the column with the ENSEMBLE code in the input.',required=True)
+	parser.add_argument('--classClmns',metavar='input STR value',type=str,help='the list of columns with the gene categories separated by space.',required=True)
+	args = parser.parse_args()
+	infile = args.input
+	threshold = args.threshold
+	outfile = args.outfile
+	ENSEMBLTcolmn = args.ENSEMBLTcolmn
+	classClmns = parse_class_columns(args.classClmns, args.input_columns)
+	#~ 
+	dCAT,dENSEMBLTCAT=rtrnCATcENSEMBLc(infile,classClmns,ENSEMBLTcolmn)
+	dNodesWeightMin=rtnPrwsdtncs(dCAT,dENSEMBLTCAT)
+	salelClustr=rtrnClustrsOnCltrCoff(dNodesWeightMin,threshold)
+	#~ 
+	with open(outfile, 'w') as salef:
+		print >> salef, '\n'.join(salelClustr)
+	#~ 
+	#~ 
+
+if __name__ == '__main__':
+	main()
+