Mercurial > repos > bornea > saint_protein_interactions
comparison ProteinInteractions.py @ 5:5935066604a7 draft
Uploaded
| author | bornea |
|---|---|
| date | Tue, 15 Mar 2016 15:36:27 -0400 |
| parents | |
| children | 355ad1ce474b |
comparison
equal
deleted
inserted
replaced
| 4:1c89c5b8fadf | 5:5935066604a7 |
|---|---|
| 1 ################################################################################ | |
| 2 # This program will read in a SAINT 'list.txt' file and the interactions from | |
| 3 # the consensus path db database and return all the interactions that we saw in | |
| 4 # our experiment in a format suitable for cytoscape. This allows us to filter | |
| 5 # before getting PPIs so that it doesn't affect our SAINT score or include | |
| 6 # interactions that don't score well | |
| 7 ################################################################################ | |
| 8 # Copyright (C) Brent Kuenzi. | |
| 9 # Permission is granted to copy, distribute and/or modify this document | |
| 10 # under the terms of the GNU Free Documentation License, Version 1.3 | |
| 11 # or any later version published by the Free Software Foundation; | |
| 12 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. | |
| 13 # A copy of the license is included in the section entitled "GNU | |
| 14 # Free Documentation License". | |
| 15 ################################################################################ | |
| 16 ## REQUIRED INPUT ## | |
| 17 | |
| 18 # 1) listfile: SAINTexpress output | |
| 19 # 2) SAINT_cutoff: Saint score cutoff for import (between 0 and 1) | |
| 20 # 3) Int_conf: Confidence of PPI from CPDB to include | |
| 21 # - low: no filtering | |
| 22 # - medium: >0.5 | |
| 23 # - high: >0.7 | |
| 24 # - very high: >0.9 | |
| 25 # 4) Species: Human, Yeast, or Mouse | |
| 26 ################################################################################ | |
| 27 | |
| 28 | |
| 29 import urllib2 | |
| 30 import itertools | |
| 31 import sys | |
| 32 import os | |
| 33 | |
| 34 | |
| 35 listfile = sys.argv[1] | |
| 36 SAINT_cutoff = sys.argv[2] | |
| 37 Int_conf = sys.argv[3] | |
| 38 Species = sys.argv[4] | |
| 39 cyto_file = sys.argv[5] | |
| 40 db_path = sys.argv[6] | |
| 41 | |
| 42 class ReturnValue1(object): | |
| 43 def __init__(self, uniprot_acc, gene, swissprot): | |
| 44 self.up = uniprot_acc | |
| 45 self.gn = gene | |
| 46 self.sp = swissprot | |
| 47 class ReturnValue2(object): | |
| 48 def __init__(self, getdata, getproteins, getheader): | |
| 49 self.data = getdata | |
| 50 self.proteins = getproteins | |
| 51 self.header = getheader | |
| 52 | |
| 53 | |
| 54 def main(listfile, SAINT_cutoff, Int_conf, Species): | |
| 55 cytoscape(dd_network(listfile, SAINT_cutoff, Int_conf), listfile, SAINT_cutoff) | |
| 56 | |
| 57 | |
| 58 def readtab(infile): | |
| 59 with open(infile, 'r') as file_to_read: | |
| 60 # Read in tab-delim text. | |
| 61 output = [] | |
| 62 for line in file_to_read: | |
| 63 line = line.strip() | |
| 64 temp = line.split('\t') | |
| 65 output.append(temp) | |
| 66 return output | |
| 67 | |
| 68 | |
| 69 def read_listfile(listfile): | |
| 70 # Get data, proteins and header from scaffold output | |
| 71 dupes = readtab(listfile) | |
| 72 header = dupes[0] | |
| 73 prot_start = header.index("PreyGene")-1 | |
| 74 data = dupes[1:] | |
| 75 # Cut off blank line and END OF FILE. | |
| 76 proteins = [] | |
| 77 for protein in data: | |
| 78 proteins.append(protein[prot_start]) | |
| 79 return ReturnValue2(data, proteins, header) | |
| 80 | |
| 81 | |
| 82 def get_info(uniprot_accession_in): | |
| 83 # Get aa lengths and gene name. | |
| 84 error = open('error proteins.txt', 'a+') | |
| 85 i = 0 | |
| 86 while i == 0: | |
| 87 try: | |
| 88 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in | |
| 89 + ".fasta") | |
| 90 break | |
| 91 except urllib2.HTTPError, err: | |
| 92 i = i + 1 | |
| 93 if i == 50: | |
| 94 sys.exit("More than 50 errors. Check your file or try again later.") | |
| 95 if err.code == 404: | |
| 96 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') | |
| 97 seqlength = 'NA' | |
| 98 genename = 'NA' | |
| 99 return ReturnValue1(seqlength, genename) | |
| 100 elif err.code == 302: | |
| 101 sys.exit("Request timed out. Check connection and try again.") | |
| 102 else: | |
| 103 sys.exit("Uniprot had some other error") | |
| 104 lines = data.readlines() | |
| 105 header = lines[0] | |
| 106 lst = header.split('|') | |
| 107 lst2 = lst[2].split(' ') | |
| 108 swissprot = lst2[0] | |
| 109 uniprot_acc = lst[1] | |
| 110 if lines == []: | |
| 111 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') | |
| 112 error.close | |
| 113 uniprot_acc = 'NA' | |
| 114 genename = 'NA' | |
| 115 return ReturnValue1(uniprot_acc, genename, swissprot) | |
| 116 if lines != []: | |
| 117 seqlength = 0 | |
| 118 header = lines[0] | |
| 119 if 'GN=' in header: | |
| 120 lst = header.split('GN=') | |
| 121 lst2 = lst[1].split(' ') | |
| 122 genename = lst2[0] | |
| 123 error.close | |
| 124 return ReturnValue1(uniprot_acc, genename, swissprot) | |
| 125 if 'GN=' not in header: | |
| 126 genename = 'NA' | |
| 127 error.close | |
| 128 return ReturnValue1(uniprot_acc, genename, swissprot) | |
| 129 | |
| 130 | |
| 131 def dd_network(listfile, SAINTscore, CPDB_filter): | |
| 132 # Filter by SS and CPDB. | |
| 133 data = read_listfile(listfile).data | |
| 134 # Change to filtered list. | |
| 135 SS = (read_listfile(listfile).header).index("SaintScore") | |
| 136 filt_data = [] | |
| 137 for i in data: | |
| 138 if i[SS] >= SAINTscore: | |
| 139 filt_data.append(i) | |
| 140 accessions = [] | |
| 141 for i in filt_data: | |
| 142 accessions.append(get_info(i[1]).sp) | |
| 143 GO = [] | |
| 144 for i in CPDB[2:]: | |
| 145 if i[3] >= CPDB_filter: | |
| 146 # Filter interaction confidence. | |
| 147 GO.append(i[2]) | |
| 148 # All known interactions. | |
| 149 GO2 = [] | |
| 150 for i in GO: | |
| 151 GO2.append(i.split(',')) | |
| 152 # Make interactions list friendly. | |
| 153 unfiltered_network = {} | |
| 154 for i in accessions: | |
| 155 interactions = [] | |
| 156 for j in GO2: | |
| 157 if i in j: | |
| 158 # Find the interactions. | |
| 159 if j not in interactions: | |
| 160 # Dont add duplicate interactions. | |
| 161 interactions.append(j) | |
| 162 merged = list(itertools.chain(*interactions)) | |
| 163 # Flatten list of lists. | |
| 164 unfiltered_network[i] = merged | |
| 165 # Assign all possible interactions to protein in a dictionary. | |
| 166 dd_network = {} | |
| 167 # Data dependent network. | |
| 168 for i in unfiltered_network: | |
| 169 temp = [] | |
| 170 for j in unfiltered_network[i]: | |
| 171 if j in accessions: | |
| 172 if j not in temp: | |
| 173 if j != i: | |
| 174 temp.append(j) | |
| 175 dd_network[i] = temp | |
| 176 return dd_network | |
| 177 | |
| 178 | |
| 179 def cytoscape(dd_network, listfile, SAINTscore): | |
| 180 with open('network.sif', 'wt') as y: | |
| 181 data = read_listfile(listfile).data | |
| 182 SS = (read_listfile(listfile).header).index("SaintScore") | |
| 183 filt_data = [] | |
| 184 for i in data: | |
| 185 if i[SS] >= SAINTscore: | |
| 186 filt_data.append(get_info(i[1]).sp) | |
| 187 for i in filt_data: | |
| 188 if dd_network[i] != []: | |
| 189 lst = [] | |
| 190 for j in dd_network[i]: | |
| 191 lst.append(j) | |
| 192 for j in lst: | |
| 193 y.write(i+'\t'+'pp'+'\t' + j+'\n') | |
| 194 | |
| 195 | |
| 196 if Species == "Human": | |
| 197 CPDB = readtab(str(db_path) + 'ConsensusPathDB_human_PPI.txt') | |
| 198 if Species == "Yeast": | |
| 199 CPDB = readtab(str(db_path) + 'ConsensusPathDB_yeast_PPI.txt') | |
| 200 if Species == "Mouse": | |
| 201 CPDB = readtab(str(db_path) +'ConsensusPathDB_mouse_PPI.txt') | |
| 202 if __name__ == '__main__': | |
| 203 main(listfile, SAINT_cutoff, Int_conf, Species) | |
| 204 os.rename('network.sif', str(cyto_file)) |
