Mercurial > repos > bornea > saint_interactions
view ProteinInteractions_v2.py @ 5:fd6f8df5a043 draft
Uploaded
author | bornea |
---|---|
date | Thu, 19 Nov 2015 11:17:15 -0500 |
parents | cd2c68e1b1ae |
children | 6ff557cd705f |
line wrap: on
line source
################################################################################ # This program will read in a SAINT 'list.txt' file and the interactions from # the consensus path db database and return all the interactions that we saw in # our experiment in a format suitable for cytoscape. This allows us to filter # before getting PPIs so that it doesn't affect our SAINT score or include # interactions that don't score well ################################################################################ import urllib2 import itertools import sys ################################################################################ ## REQUIRED INPUT ## # 1) listfile: SAINTexpress output # 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1) # 3) Int_conf: Confidence of PPI from CPDB to include # - low: no filtering # - medium: >0.5 # - high: >0.7 # - very high: >0.9 # 4) Species: Human, Yeast, or Mouse ############################################################################### listfile = sys.argv[1] SAINT_cutoff = sys.argv[2] Int_conf = sys.argv[3] Species = sys.argv[4] cyto_file = sys.argv[5] db_path = sys.arv[6] ############################################################################### class ReturnValue1(object): def __init__(self, uniprot_acc, gene, swissprot): self.up = uniprot_acc self.gn = gene self.sp = swissprot class ReturnValue2(object): def __init__(self, getdata, getproteins, getheader): self.data = getdata self.proteins = getproteins self.header = getheader def main(listfile, SAINT_cutoff, Int_conf, Species): cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff) def readtab(infile): with open(infile,'r') as x: # read in tab-delim text output = [] for line in x: line = line.strip() temp = line.split('\t') output.append(temp) return output def read_listfile(listfile): # Get data, proteins and header from scaffold output dupes = readtab(listfile) header = dupes[0] prot_start = header.index("PreyGene")-1 data = dupes[1:] # cut off blank line and END OF FILE proteins = [] for protein in data: proteins.append(protein[prot_start]) return ReturnValue2(data, proteins, header) def get_info(uniprot_accession_in): #get aa lengths and gene name error = open('error proteins.txt', 'a+') i=0 while i==0: try: data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") break except urllib2.HTTPError, err: i = i + 1 if i == 50: sys.exit("More than 50 errors. Check your file or try again later.") if err.code == 404: error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') seqlength = 'NA' genename = 'NA' return ReturnValue1(seqlength, genename) elif err.code == 302: sys.exit("Request timed out. Check connection and try again.") else: sys.exit("Uniprot had some other error") lines = data.readlines() header = lines[0] lst = header.split('|') lst2 = lst[2].split(' ') swissprot = lst2[0] uniprot_acc = lst[1] if lines == []: error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') error.close uniprot_acc = 'NA' genename = 'NA' return ReturnValue1(uniprot_acc, genename, swissprot) if lines != []: seqlength = 0 header = lines[0] if 'GN=' in header: lst = header.split('GN=') lst2 = lst[1].split(' ') genename = lst2[0] error.close return ReturnValue1(uniprot_acc, genename, swissprot) if 'GN=' not in header: genename = 'NA' error.close return ReturnValue1(uniprot_acc, genename, swissprot) def dd_network(listfile, SAINTscore, CPDB_filter): ## Filter by SS and CPDB data = read_listfile(listfile).data # change to filtered list SS = (read_listfile(listfile).header).index("SaintScore") filt_data = [] for i in data: if i[SS] >= SAINTscore: filt_data.append(i) accessions = [] for i in filt_data: accessions.append(get_info(i[1]).sp) GO=[] for i in CPDB[2:]: if i[3] >= CPDB_filter: # filter interaction confidence GO.append(i[2]) # all known interactions GO2 = [] for i in GO: GO2.append(i.split(',')) # make interactions list friendly unfiltered_network = {} for i in accessions: interactions = [] for j in GO2: if i in j: # find the interactions if j not in interactions:# dont add duplicate interactions interactions.append(j) merged = list(itertools.chain(*interactions)) # flatten list of lists unfiltered_network[i]=merged # assign all possible interactions to protein in a dictionary dd_network = {} #data dependent network for i in unfiltered_network: temp = [] for j in unfiltered_network[i]: if j in accessions: if j not in temp: if j != i: temp.append(j) dd_network[i]=temp return dd_network def cytoscape(dd_network, listfile, SAINTscore): with open('network.sif','wt') as y: data = read_listfile(listfile).data SS = (read_listfile(listfile).header).index("SaintScore") filt_data = [] for i in data: if i[SS] >= SAINTscore: filt_data.append(i) header = ["Prey", "Interactions"] header = '\t'.join(header) y.write(header + '\n') for i in filt_data: if dd_network[i[1]] != []: lst = [] #x='\t'.join(i) for j in dd_network[i[1]]: lst.append(j) for j in lst: y.write(i[1]+'\t'+'pp'+'\t' + j+'\n') if Species == "Human": CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt') if Species == "Yeast": CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt') if Species == "Mouse": CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt') if __name__ == '__main__': main(listfile, SAINT_cutoff, Int_conf, Species) #main("Crizo_list.txt", 0.7, 0.7, 'Human') os.rename('network.sif', str(cyto_file))