comparison SAINT_preprocessing_v6.py @ 8:30d53a378141 draft

Uploaded
author bornea
date Tue, 10 Nov 2015 13:15:26 -0500
parents
children 13383ba55336
comparison
equal deleted inserted replaced
7:c88212194b4e 8:30d53a378141
1 #######################################################################################
2 # Python-code: SAINT pre-processing from Scaffold "Samples Report" output
3 # Author: Brent Kuenzi
4 #######################################################################################
5 # This program reads in a raw Scaffold "Samples Report" output and a user generated
6 # bait file and autoformats it into prey and interaction files for SAINTexpress
7 # analysis
8 #######################################################################################
9 import sys
10 import urllib2
11 import os.path
12 #######################################################################################
13 ## REQUIRED INPUT ##
14
15 # 1) infile: Scaffold "Samples Report" output
16 # 2) baitfile: SAINT formatted bait file generated in Galaxy
17 # 3) prey: Y or N for generating a prey file (requires internet connection)
18 #######################################################################################
19 infile = sys.argv[1] #Scaffold "Samples Report" output
20 prey = sys.argv[2] # Y or N
21 fasta_db = sys.argv[3]
22 tool_path = sys.argv[8]
23 if fasta_db == "None":
24 fasta_db = str(tool_path) + "/SwissProt_HUMAN_2014_08.fasta"
25 make_bait= sys.argv[5]
26
27
28 baits = make_bait.split()
29 i = 0
30 bait_file_tmp = open("bait.txt", "wr")
31 order = []
32 bait_cache = []
33
34 while i < len(baits):
35 if baits[i+2] == "true":
36 T_C = "C"
37 else:
38 T_C = "T"
39 line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n"
40 q = open(infile,"r")
41 for line2 in q:
42 line2 = line2.strip()
43 temp = line2.split('\t')
44 if "Quantitative Variance" in str(temp):
45 if baits[i] in temp:
46 number_bait = temp.index(str(baits[i]))
47 number_bait = number_bait - 9
48 bait_cache.append((number_bait, str(line1)))
49 else:
50 print "Error: bad bait " + str(baits[i])
51 sys.exit()
52 else:
53 pass
54 i = i + 3
55
56 bait_cache.sort()
57 for line in bait_cache:
58 bait_file_tmp.write(line[1])
59
60 bait_file_tmp.close()
61 baitfile = "bait.txt"
62
63 class ReturnValue1(object):
64 def __init__(self, sequence, gene):
65 self.seqlength = sequence
66 self.genename = gene
67 class ReturnValue2(object):
68 def __init__(self, getdata, getproteins, getheader):
69 self.data = getdata
70 self.proteins = getproteins
71 self.header = getheader
72
73 def main(scaffold_input, baitfile):
74 bait_check(baitfile, scaffold_input)
75 make_inter(scaffold_input)
76 if prey == 'true':
77 make_prey(scaffold_input)
78 no_error_inter(scaffold_input)
79 os.rename('prey.txt', sys.argv[5])
80 elif prey == 'false':
81 if os.path.isfile('error proteins.txt') == True:
82 no_error_inter(scaffold_input)
83 pass
84 elif prey != 'true' or 'false':
85 sys.exit("Invalid Prey Argument: Y or N")
86
87 def get_info(uniprot_accession_in): # get aa lengths and gene name
88 error = open('error proteins.txt', 'a+')
89 # while True:
90 # i = 0
91 # try:
92 # data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta")
93 # break
94 # except urllib2.HTTPError, err:
95 # i = i + 1
96 # if i == 50:
97 # sys.exit("More than 50 errors. Check your file or try again later.")
98 # if err.code == 404:
99 # error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
100 # seqlength = 'NA'
101 # genename = 'NA'
102 # return ReturnValue1(seqlength, genename)
103 # elif err.code == 302:
104 # sys.exit("Request timed out. Check connection and try again.")
105 # else:
106 # sys.exit("Uniprot had some other error")
107 # lines = data.readlines()
108 # if lines == []:
109 # error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
110 # error.close
111 # seqlength = 'NA'
112 # genename = 'NA'
113 # return ReturnValue1(seqlength, genename)
114 # if lines != []:
115 # seqlength = 0
116 # header = lines[0]
117 # for line in lines[1:]:
118 # line = line.replace("\n","") # strip \n or else it gets counted in the length
119 # seqlength += len(line)
120 # if 'GN=' in header:
121 # lst = header.split('GN=')
122 # lst2 = lst[1].split(' ')
123 # genename = lst2[0]
124 # error.close
125 # return ReturnValue1(seqlength, genename)
126 # if 'GN=' not in header:
127 # genename = 'NA'
128 # error.close
129 # return ReturnValue1(seqlength, genename)
130 data = open(fasta_db,'r')
131 lines = data.readlines()
132 db_len = len(lines)
133 seqlength = 0
134 count = 0
135 for i in lines:
136 if ">sp" in i:
137 if uniprot_accession_in == i.split("|")[1]:
138 match = count+1
139 if 'GN=' in i:
140 lst = i.split('GN=')
141 lst2 = lst[1].split(' ')
142 genename = lst2[0]
143 if 'GN=' not in i:
144 genename = 'NA'
145 while ">sp" not in lines[match]:
146 if match <= db_len:
147 seqlength = seqlength + len(lines[match].strip())
148 match = match + 1
149 else:
150 break
151 return ReturnValue1(seqlength, genename)
152 count = count + 1
153
154
155 if seqlength == 0:
156 error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n')
157 error.close
158 seqlength = 'NA'
159 genename = 'NA'
160 return ReturnValue1(seqlength, genename)
161
162 def readtab(infile):
163 with open(infile,'r') as x: # read in tab-delim text
164 output = []
165 for line in x:
166 line = line.strip()
167 temp = line.split('\t')
168 output.append(temp)
169 return output
170 def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output
171 dupes = readtab(scaffold_input)
172 cnt = 0
173 for i in dupes:
174 cnt += 1
175 if i[0] == '#': # finds the start of second header
176 header_start = cnt-1
177 header = dupes[header_start]
178 prot_start = header.index("Accession Number")
179 data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE
180 proteins = []
181 for i in data:
182 i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached
183 for protein in data:
184 proteins.append(protein[prot_start])
185 return ReturnValue2(data, proteins, header)
186 def make_inter(scaffold_input):
187 bait = readtab(baitfile)
188 data = read_scaffold(scaffold_input).data
189 header = read_scaffold(scaffold_input).header
190 proteins = read_scaffold(scaffold_input).proteins
191 bait_index = []
192 for i in bait:
193 bait_index.append(header.index(i[0])) # Find just the baits defined in bait file
194 with open('inter.txt', 'w') as y:
195 a = 0; l=0
196 for bb in bait:
197 for lst in data:
198 y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
199 a+=1
200 if a == len(proteins):
201 a = 0; l+=1
202 def make_prey(scaffold_input):
203 proteins = read_scaffold(scaffold_input).proteins
204 output_file = open("prey.txt",'w')
205 for a in proteins:
206 a = a.replace("\n","") # remove \n for input into function
207 a = a.replace("\r","") # ditto for \r
208 seq = get_info(a).seqlength
209 GN = get_info(a).genename
210 if seq != 'NA':
211 output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n")
212 output_file.close()
213 def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot
214 err = readtab("error proteins.txt")
215 bait = readtab(baitfile)
216 data = read_scaffold(scaffold_input).data
217 header = read_scaffold(scaffold_input).header
218 bait_index = []
219 for i in bait:
220 bait_index.append(header.index(i[0]))
221 proteins = read_scaffold(scaffold_input).proteins
222 errors = []
223 for e in err:
224 errors.append(e[0])
225 with open('inter.txt', 'w') as y:
226 l = 0; a = 0
227 for bb in bait:
228 for lst in data:
229 if proteins[a] not in errors:
230 y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n')
231 a+=1
232 if a == len(proteins):
233 l += 1; a = 0
234 def bait_check(bait, scaffold_input): # check that bait names share header titles
235 bait_in = readtab(bait)
236 header = read_scaffold(scaffold_input).header
237 for i in bait_in:
238 if i[0] not in header:
239 sys.exit("Bait must share header titles with Scaffold output")
240
241 if __name__ == '__main__':
242 main(infile, baitfile)
243
244 os.rename('inter.txt', sys.argv[4])
245 os.rename("bait.txt", sys.argv[7])