diff ProteinInteractions_v2.py @ 4:cd2c68e1b1ae draft
Uploaded
author | bornea
date | Thu, 19 Nov 2015 11:17:03 -0500
parents |
children | 6ff557cd705f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ProteinInteractions_v2.py	Thu Nov 19 11:17:03 2015 -0500
@@ -0,0 +1,170 @@
+################################################################################
+# This program reads a SAINT 'list.txt' file and the interactions from the
+# ConsensusPathDB (CPDB) database, and returns all interactions seen in our
+# experiment in a format suitable for Cytoscape. This lets us filter before
+# getting PPIs, so the filtering does not affect our SAINT scores or include
+# interactions that do not score well.
+################################################################################
+import urllib2
+import itertools
+import sys
+import os
+################################################################################
+## REQUIRED INPUT ##
+
+# 1) listfile: SAINTexpress output
+# 2) SAINT_cutoff: SAINT score cutoff for import (between 0 and 1)
+# 3) Int_conf: confidence of PPIs from CPDB to include
+#    - low: no filtering
+#    - medium: >0.5
+#    - high: >0.7
+#    - very high: >0.9
+# 4) Species: Human, Yeast, or Mouse
+###############################################################################
+listfile = sys.argv[1]
+SAINT_cutoff = sys.argv[2]
+Int_conf = sys.argv[3]
+Species = sys.argv[4]
+cyto_file = sys.argv[5]
+db_path = sys.argv[6]
+###############################################################################
+class ReturnValue1(object):
+    def __init__(self, uniprot_acc, gene, swissprot):
+        self.up = uniprot_acc
+        self.gn = gene
+        self.sp = swissprot
+class ReturnValue2(object):
+    def __init__(self, getdata, getproteins, getheader):
+        self.data = getdata
+        self.proteins = getproteins
+        self.header = getheader
+def main(listfile, SAINT_cutoff, Int_conf, Species):
+    cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff)
+def readtab(infile):
+    with open(infile, 'r') as x:  # read in tab-delimited text
+        output = []
+        for line in x:
+            line = line.strip()
+            temp = line.split('\t')
+            output.append(temp)
+    return output
+def read_listfile(listfile):  # get data, proteins and header from the SAINT output
+    dupes = readtab(listfile)
+    header = dupes[0]
+    prot_start = header.index("PreyGene") - 1
+    data = dupes[1:]  # cut off blank line and END OF FILE
+    proteins = []
+    for protein in data:
+        proteins.append(protein[prot_start])
+    return ReturnValue2(data, proteins, header)
+
+def get_info(uniprot_accession_in):  # get gene name and SwissProt ID from UniProt
+    error = open('error proteins.txt', 'a+')
+    i = 0
+    while i == 0:
+        try:
+            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
+            break
+        except urllib2.HTTPError, err:
+            i = i + 1
+            if i == 50:
+                sys.exit("More than 50 errors. Check your file or try again later.")
+            if err.code == 404:
+                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
+                error.close()
+                return ReturnValue1('NA', 'NA', 'NA')
+            elif err.code == 302:
+                sys.exit("Request timed out. Check connection and try again.")
+            else:
+                sys.exit("Uniprot had some other error")
+    lines = data.readlines()
+    if lines == []:
+        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
+        error.close()
+        return ReturnValue1('NA', 'NA', 'NA')
+    header = lines[0]  # FASTA header, e.g. >sp|accession|entry_name ... GN=gene ...
+    lst = header.split('|')
+    lst2 = lst[2].split(' ')
+    swissprot = lst2[0]
+    uniprot_acc = lst[1]
+    if 'GN=' in header:
+        lst = header.split('GN=')
+        lst2 = lst[1].split(' ')
+        genename = lst2[0]
+    else:
+        genename = 'NA'
+    error.close()
+    return ReturnValue1(uniprot_acc, genename, swissprot)
+
+def dd_network(listfile, SAINTscore, CPDB_filter):  # filter by SAINT score and CPDB confidence
+    data = read_listfile(listfile).data
+    SS = (read_listfile(listfile).header).index("SaintScore")
+    filt_data = []
+    for i in data:
+        if float(i[SS]) >= float(SAINTscore):
+            filt_data.append(i)
+    accessions = []
+    for i in filt_data:
+        accessions.append(get_info(i[1]).sp)
+    GO = []
+    for i in CPDB[2:]:
+        if i[3] >= CPDB_filter:  # filter on interaction confidence
+            GO.append(i[2])  # all known interactions
+    GO2 = []
+    for i in GO:
+        GO2.append(i.split(','))  # make interactions list friendly
+    unfiltered_network = {}
+    for i in accessions:
+        interactions = []
+        for j in GO2:
+            if i in j:  # find the interactions
+                if j not in interactions:  # don't add duplicate interactions
+                    interactions.append(j)
+        merged = list(itertools.chain(*interactions))  # flatten list of lists
+        unfiltered_network[i] = merged  # assign all possible interactions to each protein
+    dd_network = {}  # data-dependent network
+    for i in unfiltered_network:
+        temp = []
+        for j in unfiltered_network[i]:
+            if j in accessions:
+                if j not in temp:
+                    if j != i:
+                        temp.append(j)
+        dd_network[i] = temp
+    return dd_network
+def cytoscape(dd_network, listfile, SAINTscore):  # write the filtered network as a Cytoscape .sif file
+    with open('network.sif', 'wt') as y:
+        data = read_listfile(listfile).data
+        SS = (read_listfile(listfile).header).index("SaintScore")
+        filt_data = []
+        for i in data:
+            if float(i[SS]) >= float(SAINTscore):
+                filt_data.append(i)
+        header = ["Prey", "Interactions"]
+        header = '\t'.join(header)
+        y.write(header + '\n')
+        for i in filt_data:
+            if dd_network[i[1]] != []:
+                lst = []
+                #x='\t'.join(i)
+                for j in dd_network[i[1]]:
+                    lst.append(j)
+                for j in lst:
+                    y.write(i[1] + '\t' + 'pp' + '\t' + j + '\n')
+if Species == "Human":
+    CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt')
+if Species == "Yeast":
+    CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt')
+if Species == "Mouse":
+    CPDB = readtab(str(db_path) + 'ConsensusPathDB_mouse_PPI.txt')
+if __name__ == '__main__':
+    main(listfile, SAINT_cutoff, Int_conf, Species)
+    #main("Crizo_list.txt", 0.7, 0.7, 'Human')
+    os.rename('network.sif', str(cyto_file))
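
For reference, the six positional arguments are read in the order shown above (listfile, SAINT_cutoff, Int_conf, Species, cyto_file, db_path). A hypothetical invocation, with placeholder file names, cutoffs, and database directory, would look like:

    python ProteinInteractions_v2.py list.txt 0.8 0.7 Human my_network.sif /path/to/cpdb/

Because the CPDB filename is appended directly to db_path, the directory argument needs a trailing separator, and the ConsensusPathDB_*_PPI.txt file for the chosen species must be present there; get_info also requires network access to uniprot.org. The script writes network.sif (a header line followed by tab-separated rows of prey accession, the interaction type 'pp', and interactor accession) and then renames it to the requested cyto_file.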