Mercurial > repos > bornea > saint_interactions

diff ProteinInteractions_v2.py @ 4:cd2c68e1b1ae draft
Uploaded
author: bornea
date: Thu, 19 Nov 2015 11:17:03 -0500
children: 6ff557cd705f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ProteinInteractions_v2.py	Thu Nov 19 11:17:03 2015 -0500
@@ -0,0 +1,170 @@
+################################################################################
+# This program will read in a SAINT 'list.txt' file and the interactions from
+# the consensus path db database and return all the interactions that we saw in
+# our experiment in a format suitable for cytoscape. This allows us to filter
+# before getting PPIs so that it doesn't affect our SAINT score or include
+# interactions that don't score well
+################################################################################
+import urllib2
+import itertools
+import sys
+################################################################################
+## REQUIRED INPUT ##
+
+# 1) listfile: SAINTexpress output
+# 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1)
+# 3) Int_conf: Confidence of PPI from CPDB to include
+#       - low: no filtering
+#       - medium: >0.5
+#       - high: >0.7
+#       - very high: >0.9
+# 4) Species: Human, Yeast, or Mouse
+###############################################################################
+listfile = sys.argv[1]
+SAINT_cutoff = sys.argv[2]
+Int_conf = sys.argv[3]
+Species = sys.argv[4]
+cyto_file = sys.argv[5]
+db_path = sys.arv[6]
+###############################################################################
+class ReturnValue1(object):
+    def __init__(self, uniprot_acc, gene, swissprot):
+        self.up = uniprot_acc
+        self.gn = gene
+        self.sp = swissprot
+class ReturnValue2(object):
+    def __init__(self, getdata, getproteins, getheader):
+        self.data = getdata
+        self.proteins = getproteins
+        self.header = getheader
+def main(listfile, SAINT_cutoff, Int_conf, Species):
+    cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff)
+def readtab(infile):
+    with open(infile,'r') as x: # read in tab-delim text
+        output = []
+        for line in x:
+            line = line.strip()
+            temp = line.split('\t')
+            output.append(temp)
+    return output
+def read_listfile(listfile): # Get data, proteins and header from scaffold output
+    dupes = readtab(listfile)
+    header = dupes[0]
+    prot_start = header.index("PreyGene")-1
+    data = dupes[1:] # cut off blank line and END OF FILE
+    proteins = []
+    for protein in data:
+        proteins.append(protein[prot_start])
+    return ReturnValue2(data, proteins, header)
+
+def get_info(uniprot_accession_in): #get aa lengths and gene name
+    error = open('error proteins.txt', 'a+')
+    i=0
+    while i==0:
+        try:
+            data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
+            break
+        except urllib2.HTTPError, err:
+            i = i + 1
+            if i == 50:
+                sys.exit("More than 50 errors. Check your file or try again later.")
+            if err.code == 404:
+                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
+                seqlength = 'NA'
+                genename = 'NA'
+                return ReturnValue1(seqlength, genename)
+            elif err.code == 302:
+                sys.exit("Request timed out. Check connection and try again.")
+            else:
+                sys.exit("Uniprot had some other error")
+    lines = data.readlines()
+    header = lines[0] 
+    lst = header.split('|')
+    lst2 = lst[2].split(' ')
+    swissprot = lst2[0]
+    uniprot_acc = lst[1]
+    if lines == []:
+        error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
+        error.close
+        uniprot_acc = 'NA'
+        genename = 'NA'
+        return ReturnValue1(uniprot_acc, genename, swissprot)
+    if lines != []:
+        seqlength = 0
+        header = lines[0] 
+        if 'GN=' in header:
+            lst = header.split('GN=')
+            lst2 = lst[1].split(' ')
+            genename = lst2[0]
+            error.close
+            return ReturnValue1(uniprot_acc, genename, swissprot)
+        if 'GN=' not in header:
+            genename = 'NA'
+            error.close
+            return ReturnValue1(uniprot_acc, genename, swissprot)
+
+def dd_network(listfile, SAINTscore, CPDB_filter): ## Filter by SS and CPDB
+    data = read_listfile(listfile).data # change to filtered list
+    SS = (read_listfile(listfile).header).index("SaintScore")
+    filt_data = []
+    for i in data: 
+        if i[SS] >= SAINTscore:
+            filt_data.append(i)
+    accessions = []
+    for i in filt_data:
+        accessions.append(get_info(i[1]).sp)
+    GO=[]
+    for i in CPDB[2:]:
+        if i[3] >= CPDB_filter: # filter interaction confidence
+            GO.append(i[2]) # all known interactions
+    GO2 = []
+    for i in GO:
+        GO2.append(i.split(',')) # make interactions list friendly
+    unfiltered_network = {}
+    for i in accessions:
+        interactions = []
+        for j in GO2:
+            if i in j: # find the interactions
+                if j not in interactions:# dont add duplicate interactions
+                    interactions.append(j)
+        merged = list(itertools.chain(*interactions)) # flatten list of lists
+        unfiltered_network[i]=merged # assign all possible interactions to protein in a dictionary
+    dd_network = {} #data dependent network
+    for i in unfiltered_network:
+        temp = []
+        for j in unfiltered_network[i]:
+            if j in accessions:
+                if j not in temp:
+                    if j != i:
+                        temp.append(j)
+        dd_network[i]=temp
+    return dd_network
+def cytoscape(dd_network, listfile, SAINTscore):
+    with open('network.sif','wt') as y:
+        data = read_listfile(listfile).data
+        SS = (read_listfile(listfile).header).index("SaintScore")
+        filt_data = []
+        for i in data: 
+            if i[SS] >= SAINTscore:
+                filt_data.append(i)
+        header = ["Prey", "Interactions"]
+        header = '\t'.join(header)
+        y.write(header + '\n')
+        for i in filt_data:
+            if dd_network[i[1]] != []:
+                lst = []
+                #x='\t'.join(i)
+                for j in dd_network[i[1]]:
+                    lst.append(j)
+                for j in lst:
+                    y.write(i[1]+'\t'+'pp'+'\t' + j+'\n')
+if Species == "Human":
+        CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt')
+if Species == "Yeast":
+        CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt')
+if Species == "Mouse":
+        CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt')
+if __name__ == '__main__':
+    main(listfile, SAINT_cutoff, Int_conf, Species)
+    #main("Crizo_list.txt", 0.7, 0.7, 'Human')
+    os.rename('network.sif', str(cyto_file))
author	bornea
date	Thu, 19 Nov 2015 11:17:03 -0500
parents
children	6ff557cd705f