| 
4
 | 
     1 ################################################################################
 | 
| 
 | 
     2 # This program will read in a SAINT 'list.txt' file and the interactions from
 | 
| 
 | 
     3 # the consensus path db database and return all the interactions that we saw in
 | 
| 
 | 
     4 # our experiment in a format suitable for cytoscape. This allows us to filter
 | 
| 
 | 
     5 # before getting PPIs so that it doesn't affect our SAINT score or include
 | 
| 
 | 
     6 # interactions that don't score well
 | 
| 
 | 
     7 ################################################################################
 | 
| 
 | 
     8 import urllib2
 | 
| 
 | 
     9 import itertools
 | 
| 
 | 
    10 import sys
 | 
| 
10
 | 
    11 import os
 | 
| 
4
 | 
    12 ################################################################################
 | 
| 
 | 
    13 ## REQUIRED INPUT ##
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 # 1) listfile: SAINTexpress output
 | 
| 
 | 
    16 # 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1)
 | 
| 
 | 
    17 # 3) Int_conf: Confidence of PPI from CPDB to include
 | 
| 
 | 
    18 #       - low: no filtering
 | 
| 
 | 
    19 #       - medium: >0.5
 | 
| 
 | 
    20 #       - high: >0.7
 | 
| 
 | 
    21 #       - very high: >0.9
 | 
| 
 | 
    22 # 4) Species: Human, Yeast, or Mouse
 | 
| 
 | 
    23 ###############################################################################
 | 
| 
 | 
    24 listfile = sys.argv[1]
 | 
| 
 | 
    25 SAINT_cutoff = sys.argv[2]
 | 
| 
 | 
    26 Int_conf = sys.argv[3]
 | 
| 
 | 
    27 Species = sys.argv[4]
 | 
| 
 | 
    28 cyto_file = sys.argv[5]
 | 
| 
9
 | 
    29 db_path = sys.argv[6]
 | 
| 
4
 | 
    30 ###############################################################################
 | 
| 
 | 
    31 class ReturnValue1(object):
 | 
| 
 | 
    32     def __init__(self, uniprot_acc, gene, swissprot):
 | 
| 
 | 
    33         self.up = uniprot_acc
 | 
| 
 | 
    34         self.gn = gene
 | 
| 
 | 
    35         self.sp = swissprot
 | 
| 
 | 
    36 class ReturnValue2(object):
 | 
| 
 | 
    37     def __init__(self, getdata, getproteins, getheader):
 | 
| 
 | 
    38         self.data = getdata
 | 
| 
 | 
    39         self.proteins = getproteins
 | 
| 
 | 
    40         self.header = getheader
 | 
| 
 | 
    41 def main(listfile, SAINT_cutoff, Int_conf, Species):
 | 
| 
 | 
    42     cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff)
 | 
| 
 | 
    43 def readtab(infile):
 | 
| 
 | 
    44     with open(infile,'r') as x: # read in tab-delim text
 | 
| 
 | 
    45         output = []
 | 
| 
 | 
    46         for line in x:
 | 
| 
 | 
    47             line = line.strip()
 | 
| 
 | 
    48             temp = line.split('\t')
 | 
| 
 | 
    49             output.append(temp)
 | 
| 
 | 
    50     return output
 | 
| 
 | 
    51 def read_listfile(listfile): # Get data, proteins and header from scaffold output
 | 
| 
 | 
    52     dupes = readtab(listfile)
 | 
| 
 | 
    53     header = dupes[0]
 | 
| 
 | 
    54     prot_start = header.index("PreyGene")-1
 | 
| 
 | 
    55     data = dupes[1:] # cut off blank line and END OF FILE
 | 
| 
 | 
    56     proteins = []
 | 
| 
 | 
    57     for protein in data:
 | 
| 
 | 
    58         proteins.append(protein[prot_start])
 | 
| 
 | 
    59     return ReturnValue2(data, proteins, header)
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 def get_info(uniprot_accession_in): #get aa lengths and gene name
 | 
| 
 | 
    62     error = open('error proteins.txt', 'a+')
 | 
| 
 | 
    63     i=0
 | 
| 
 | 
    64     while i==0:
 | 
| 
 | 
    65         try:
 | 
| 
 | 
    66             data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
 | 
| 
 | 
    67             break
 | 
| 
 | 
    68         except urllib2.HTTPError, err:
 | 
| 
 | 
    69             i = i + 1
 | 
| 
 | 
    70             if i == 50:
 | 
| 
 | 
    71                 sys.exit("More than 50 errors. Check your file or try again later.")
 | 
| 
 | 
    72             if err.code == 404:
 | 
| 
 | 
    73                 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
 | 
| 
 | 
    74                 seqlength = 'NA'
 | 
| 
 | 
    75                 genename = 'NA'
 | 
| 
 | 
    76                 return ReturnValue1(seqlength, genename)
 | 
| 
 | 
    77             elif err.code == 302:
 | 
| 
 | 
    78                 sys.exit("Request timed out. Check connection and try again.")
 | 
| 
 | 
    79             else:
 | 
| 
 | 
    80                 sys.exit("Uniprot had some other error")
 | 
| 
 | 
    81     lines = data.readlines()
 | 
| 
 | 
    82     header = lines[0] 
 | 
| 
 | 
    83     lst = header.split('|')
 | 
| 
 | 
    84     lst2 = lst[2].split(' ')
 | 
| 
 | 
    85     swissprot = lst2[0]
 | 
| 
 | 
    86     uniprot_acc = lst[1]
 | 
| 
 | 
    87     if lines == []:
 | 
| 
 | 
    88         error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
 | 
| 
 | 
    89         error.close
 | 
| 
 | 
    90         uniprot_acc = 'NA'
 | 
| 
 | 
    91         genename = 'NA'
 | 
| 
 | 
    92         return ReturnValue1(uniprot_acc, genename, swissprot)
 | 
| 
 | 
    93     if lines != []:
 | 
| 
 | 
    94         seqlength = 0
 | 
| 
 | 
    95         header = lines[0] 
 | 
| 
 | 
    96         if 'GN=' in header:
 | 
| 
 | 
    97             lst = header.split('GN=')
 | 
| 
 | 
    98             lst2 = lst[1].split(' ')
 | 
| 
 | 
    99             genename = lst2[0]
 | 
| 
 | 
   100             error.close
 | 
| 
 | 
   101             return ReturnValue1(uniprot_acc, genename, swissprot)
 | 
| 
 | 
   102         if 'GN=' not in header:
 | 
| 
 | 
   103             genename = 'NA'
 | 
| 
 | 
   104             error.close
 | 
| 
 | 
   105             return ReturnValue1(uniprot_acc, genename, swissprot)
 | 
| 
 | 
   106 
 | 
| 
 | 
   107 def dd_network(listfile, SAINTscore, CPDB_filter): ## Filter by SS and CPDB
 | 
| 
 | 
   108     data = read_listfile(listfile).data # change to filtered list
 | 
| 
 | 
   109     SS = (read_listfile(listfile).header).index("SaintScore")
 | 
| 
 | 
   110     filt_data = []
 | 
| 
 | 
   111     for i in data: 
 | 
| 
 | 
   112         if i[SS] >= SAINTscore:
 | 
| 
 | 
   113             filt_data.append(i)
 | 
| 
 | 
   114     accessions = []
 | 
| 
 | 
   115     for i in filt_data:
 | 
| 
 | 
   116         accessions.append(get_info(i[1]).sp)
 | 
| 
 | 
   117     GO=[]
 | 
| 
 | 
   118     for i in CPDB[2:]:
 | 
| 
 | 
   119         if i[3] >= CPDB_filter: # filter interaction confidence
 | 
| 
 | 
   120             GO.append(i[2]) # all known interactions
 | 
| 
 | 
   121     GO2 = []
 | 
| 
 | 
   122     for i in GO:
 | 
| 
 | 
   123         GO2.append(i.split(',')) # make interactions list friendly
 | 
| 
 | 
   124     unfiltered_network = {}
 | 
| 
 | 
   125     for i in accessions:
 | 
| 
 | 
   126         interactions = []
 | 
| 
 | 
   127         for j in GO2:
 | 
| 
 | 
   128             if i in j: # find the interactions
 | 
| 
 | 
   129                 if j not in interactions:# dont add duplicate interactions
 | 
| 
 | 
   130                     interactions.append(j)
 | 
| 
 | 
   131         merged = list(itertools.chain(*interactions)) # flatten list of lists
 | 
| 
 | 
   132         unfiltered_network[i]=merged # assign all possible interactions to protein in a dictionary
 | 
| 
 | 
   133     dd_network = {} #data dependent network
 | 
| 
 | 
   134     for i in unfiltered_network:
 | 
| 
 | 
   135         temp = []
 | 
| 
 | 
   136         for j in unfiltered_network[i]:
 | 
| 
 | 
   137             if j in accessions:
 | 
| 
 | 
   138                 if j not in temp:
 | 
| 
 | 
   139                     if j != i:
 | 
| 
 | 
   140                         temp.append(j)
 | 
| 
 | 
   141         dd_network[i]=temp
 | 
| 
 | 
   142     return dd_network
 | 
| 
 | 
   143 def cytoscape(dd_network, listfile, SAINTscore):
 | 
| 
 | 
   144     with open('network.sif','wt') as y:
 | 
| 
 | 
   145         data = read_listfile(listfile).data
 | 
| 
 | 
   146         SS = (read_listfile(listfile).header).index("SaintScore")
 | 
| 
 | 
   147         filt_data = []
 | 
| 
 | 
   148         for i in data: 
 | 
| 
 | 
   149             if i[SS] >= SAINTscore:
 | 
| 
 | 
   150                 filt_data.append(i)
 | 
| 
 | 
   151         header = ["Prey", "Interactions"]
 | 
| 
 | 
   152         header = '\t'.join(header)
 | 
| 
 | 
   153         y.write(header + '\n')
 | 
| 
 | 
   154         for i in filt_data:
 | 
| 
 | 
   155             if dd_network[i[1]] != []:
 | 
| 
 | 
   156                 lst = []
 | 
| 
 | 
   157                 #x='\t'.join(i)
 | 
| 
 | 
   158                 for j in dd_network[i[1]]:
 | 
| 
 | 
   159                     lst.append(j)
 | 
| 
 | 
   160                 for j in lst:
 | 
| 
 | 
   161                     y.write(i[1]+'\t'+'pp'+'\t' + j+'\n')
 | 
| 
 | 
   162 if Species == "Human":
 | 
| 
 | 
   163         CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt')
 | 
| 
 | 
   164 if Species == "Yeast":
 | 
| 
 | 
   165         CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt')
 | 
| 
 | 
   166 if Species == "Mouse":
 | 
| 
 | 
   167         CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt')
 | 
| 
 | 
   168 if __name__ == '__main__':
 | 
| 
 | 
   169     main(listfile, SAINT_cutoff, Int_conf, Species)
 | 
| 
 | 
   170     #main("Crizo_list.txt", 0.7, 0.7, 'Human')
 | 
| 
 | 
   171     os.rename('network.sif', str(cyto_file))
 |