Mercurial > repos > miller-lab > genome_diversity
view cluster_onConnctdComps.py @ 32:03c22b722882
remove BeautifulSoup dependency
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:54:23 -0400 |
parents | a631c2f6d913 |
children |
line wrap: on
line source
#!/usr/bin/env python # -*- coding: utf-8 -*- # # Cluster_GOKEGG.py # # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. import argparse import os from networkx import connected_components,Graph,clustering from numpy import percentile from decimal import Decimal,getcontext from itertools import permutations,combinations import sys def rtrnClustrsOnCltrCoff(dNodesWeightMin,threshold,perctile=True): """ From a file with three columns: nodeA, nodeB and a score, it returns the strong and weak connected components produced when the edges below the percentage threshold (or value) are excluded. """ #~ Gmin = Graph() for nodeA,nodeB in dNodesWeightMin: wMin=dNodesWeightMin[nodeA,nodeB] Gmin.add_edge(nodeA,nodeB,weight=wMin) #~ clstrCoffcMin=clustering(Gmin,weight='weight') #~ if perctile: umbralMin=percentile(clstrCoffcMin.values(),threshold) else: umbralMin=threshold #~ GminNdsRmv=[x for x in clstrCoffcMin if clstrCoffcMin[x]<umbralMin] #~ Gmin.remove_nodes_from(GminNdsRmv) #~ dTermCmptNumbWkMin=rtrndata(Gmin) #~ salelClustr=[] srtdterms=sorted(dTermCmptNumbWkMin.keys()) for echTerm in srtdterms: try: MinT=dTermCmptNumbWkMin[echTerm] except: MinT='-' salelClustr.append('\t'.join([echTerm,MinT])) #~ return salelClustr def rtrndata(G): """ returna list of terms and its clustering, as well as clusters from a networkx formatted file. """ #~ cntCompnts=0 dTermCmptNumbWk={} for echCompnt in connected_components(G): cntCompnts+=1 #print '.'.join(echCompnt) for echTerm in echCompnt: dTermCmptNumbWk[echTerm]=str(cntCompnts) #~ return dTermCmptNumbWk def rtrnCATcENSEMBLc(inCATfile,classClmns,ENSEMBLTcolmn,nonHdr=True): """ return a dictionary of all the categories in an input file with a set of genes. Takes as input a file with categories an genes. """ dCAT={} dENSEMBLTCAT={} for eachl in open(inCATfile,'r'): if nonHdr and eachl.strip(): ENSEMBLT=eachl.splitlines()[0].split('\t')[ENSEMBLTcolmn] sCAT=set() for CATcolmn in classClmns: sCAT.update(set([x for x in eachl.splitlines()[0].split('\t')[CATcolmn].split('.')])) sCAT=sCAT.difference(set(['','U','N'])) if len(sCAT)>0: dENSEMBLTCAT[ENSEMBLT]=sCAT for CAT in sCAT: try: dCAT[CAT].add(ENSEMBLT) except: dCAT[CAT]=set([ENSEMBLT]) nonHdr=True #~ dCAT=dict([(x,len(dCAT[x])) for x in dCAT.keys()]) #~ return dCAT,dENSEMBLTCAT def calcDistance(sCAT1,sCAT2): """ takes as input two set of genesin different categories and returns a value 1-percentage of gene shared cat1->cat2, and cat2->cat1. """ getcontext().prec=5 lgensS1=Decimal(len(sCAT1)) lgensS2=Decimal(len(sCAT2)) shrdGns=sCAT1.intersection(sCAT2) lenshrdGns=len(shrdGns) #~ dC1C2=1-(lenshrdGns/lgensS1) dC2C1=1-(lenshrdGns/lgensS2) #~ return dC1C2,dC2C1 def rtnPrwsdtncs(dCAT,dENSEMBLTCAT): """ return a mcl formated pairwise distances from a list of categories """ #~ getcontext().prec=5 dCATdst={} lENSEMBL=dENSEMBLTCAT.keys() l=len(lENSEMBL) c=0 for ENSEMBL in lENSEMBL: c+=1 lCAT=dENSEMBLTCAT.pop(ENSEMBL) for CAT1,CAT2 in combinations(lCAT, 2): try: dCATdst[CAT1,CAT2]+=1 except: dCATdst[CAT1,CAT2]=1 try: dCATdst[CAT2,CAT1]+=1 except: dCATdst[CAT2,CAT1]=1 #~ dNodesWeightMin={} l=len(dCATdst) for CAT1,CAT2 in dCATdst.keys(): shrdGns=dCATdst.pop((CAT1,CAT2)) dC1C2=float(shrdGns) nodeA,nodeB=sorted([CAT1,CAT2]) try: cscor=dNodesWeightMin[nodeA,nodeB] if cscor>=dC1C2: dNodesWeightMin[nodeA,nodeB]=dC1C2 except: dNodesWeightMin[nodeA,nodeB]=dC1C2 # return dNodesWeightMin def parse_class_columns(val, max_col): int_list = [] for elem in [x.strip() for x in val.split(',')]: if elem[0].lower() != 'c': print >> sys.stderr, "bad column format:", elem sys.exit(1) int_val = as_int(elem[1:]) if int_val is None: print >> sys.stderr, "bad column format:", elem sys.exit(1) elif not 1 <= int_val <= max_col: print >> sys.stderr, "column out of range:", elem sys.exit(1) int_list.append(int_val - 1) return int_list def as_int(val): try: return int(val) except ValueError: return None else: raise def main(): """ """ #~ bpython cluster_onConnctdComps.py --input=../conctFinal_CEU.tsv --outfile=../borrar.txt --threshold=90 --ENSEMBLTcolmn=1 --classClmns='20 22' parser = argparse.ArgumentParser(description='Returns the count of genes in ...') parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) parser.add_argument('--input_columns',metavar='input INT value',type=int,help='the number of columns in the input file.',required=True) parser.add_argument('--outfile',metavar='input TXT file',type=str,help='the output file with the connected components.',required=True) parser.add_argument('--threshold',metavar='input FLOAT value',type=float,help='the threshold to disconnect the nodes.',required=True) parser.add_argument('--ENSEMBLTcolmn',metavar='input INT file',type=int,help='the column with the ENSEMBLE code in the input.',required=True) parser.add_argument('--classClmns',metavar='input STR value',type=str,help='the list of columns with the gene categories separated by space.',required=True) args = parser.parse_args() infile = args.input threshold = args.threshold outfile = args.outfile ENSEMBLTcolmn = args.ENSEMBLTcolmn classClmns = parse_class_columns(args.classClmns, args.input_columns) #~ dCAT,dENSEMBLTCAT=rtrnCATcENSEMBLc(infile,classClmns,ENSEMBLTcolmn) dNodesWeightMin=rtnPrwsdtncs(dCAT,dENSEMBLTCAT) salelClustr=rtrnClustrsOnCltrCoff(dNodesWeightMin,threshold) #~ with open(outfile, 'w') as salef: print >> salef, '\n'.join(salelClustr) #~ #~ if __name__ == '__main__': main()