Mercurial > repos > bornea > saint_preproc
comparison SAINT_preprocessing_v6.py @ 8:30d53a378141 draft
Uploaded
author | bornea |
---|---|
date | Tue, 10 Nov 2015 13:15:26 -0500 |
parents | |
children | 13383ba55336 |
comparison
equal
deleted
inserted
replaced
7:c88212194b4e | 8:30d53a378141 |
---|---|
1 ####################################################################################### | |
2 # Python-code: SAINT pre-processing from Scaffold "Samples Report" output | |
3 # Author: Brent Kuenzi | |
4 ####################################################################################### | |
5 # This program reads in a raw Scaffold "Samples Report" output and a user generated | |
6 # bait file and autoformats it into prey and interaction files for SAINTexpress | |
7 # analysis | |
8 ####################################################################################### | |
9 import sys | |
10 import urllib2 | |
11 import os.path | |
12 ####################################################################################### | |
13 ## REQUIRED INPUT ## | |
14 | |
15 # 1) infile: Scaffold "Samples Report" output | |
16 # 2) baitfile: SAINT formatted bait file generated in Galaxy | |
17 # 3) prey: Y or N for generating a prey file (requires internet connection) | |
18 ####################################################################################### | |
19 infile = sys.argv[1] #Scaffold "Samples Report" output | |
20 prey = sys.argv[2] # Y or N | |
21 fasta_db = sys.argv[3] | |
22 tool_path = sys.argv[8] | |
23 if fasta_db == "None": | |
24 fasta_db = str(tool_path) + "/SwissProt_HUMAN_2014_08.fasta" | |
25 make_bait= sys.argv[5] | |
26 | |
27 | |
28 baits = make_bait.split() | |
29 i = 0 | |
30 bait_file_tmp = open("bait.txt", "wr") | |
31 order = [] | |
32 bait_cache = [] | |
33 | |
34 while i < len(baits): | |
35 if baits[i+2] == "true": | |
36 T_C = "C" | |
37 else: | |
38 T_C = "T" | |
39 line1 = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" | |
40 q = open(infile,"r") | |
41 for line2 in q: | |
42 line2 = line2.strip() | |
43 temp = line2.split('\t') | |
44 if "Quantitative Variance" in str(temp): | |
45 if baits[i] in temp: | |
46 number_bait = temp.index(str(baits[i])) | |
47 number_bait = number_bait - 9 | |
48 bait_cache.append((number_bait, str(line1))) | |
49 else: | |
50 print "Error: bad bait " + str(baits[i]) | |
51 sys.exit() | |
52 else: | |
53 pass | |
54 i = i + 3 | |
55 | |
56 bait_cache.sort() | |
57 for line in bait_cache: | |
58 bait_file_tmp.write(line[1]) | |
59 | |
60 bait_file_tmp.close() | |
61 baitfile = "bait.txt" | |
62 | |
63 class ReturnValue1(object): | |
64 def __init__(self, sequence, gene): | |
65 self.seqlength = sequence | |
66 self.genename = gene | |
67 class ReturnValue2(object): | |
68 def __init__(self, getdata, getproteins, getheader): | |
69 self.data = getdata | |
70 self.proteins = getproteins | |
71 self.header = getheader | |
72 | |
73 def main(scaffold_input, baitfile): | |
74 bait_check(baitfile, scaffold_input) | |
75 make_inter(scaffold_input) | |
76 if prey == 'true': | |
77 make_prey(scaffold_input) | |
78 no_error_inter(scaffold_input) | |
79 os.rename('prey.txt', sys.argv[5]) | |
80 elif prey == 'false': | |
81 if os.path.isfile('error proteins.txt') == True: | |
82 no_error_inter(scaffold_input) | |
83 pass | |
84 elif prey != 'true' or 'false': | |
85 sys.exit("Invalid Prey Argument: Y or N") | |
86 | |
87 def get_info(uniprot_accession_in): # get aa lengths and gene name | |
88 error = open('error proteins.txt', 'a+') | |
89 # while True: | |
90 # i = 0 | |
91 # try: | |
92 # data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") | |
93 # break | |
94 # except urllib2.HTTPError, err: | |
95 # i = i + 1 | |
96 # if i == 50: | |
97 # sys.exit("More than 50 errors. Check your file or try again later.") | |
98 # if err.code == 404: | |
99 # error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') | |
100 # seqlength = 'NA' | |
101 # genename = 'NA' | |
102 # return ReturnValue1(seqlength, genename) | |
103 # elif err.code == 302: | |
104 # sys.exit("Request timed out. Check connection and try again.") | |
105 # else: | |
106 # sys.exit("Uniprot had some other error") | |
107 # lines = data.readlines() | |
108 # if lines == []: | |
109 # error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') | |
110 # error.close | |
111 # seqlength = 'NA' | |
112 # genename = 'NA' | |
113 # return ReturnValue1(seqlength, genename) | |
114 # if lines != []: | |
115 # seqlength = 0 | |
116 # header = lines[0] | |
117 # for line in lines[1:]: | |
118 # line = line.replace("\n","") # strip \n or else it gets counted in the length | |
119 # seqlength += len(line) | |
120 # if 'GN=' in header: | |
121 # lst = header.split('GN=') | |
122 # lst2 = lst[1].split(' ') | |
123 # genename = lst2[0] | |
124 # error.close | |
125 # return ReturnValue1(seqlength, genename) | |
126 # if 'GN=' not in header: | |
127 # genename = 'NA' | |
128 # error.close | |
129 # return ReturnValue1(seqlength, genename) | |
130 data = open(fasta_db,'r') | |
131 lines = data.readlines() | |
132 db_len = len(lines) | |
133 seqlength = 0 | |
134 count = 0 | |
135 for i in lines: | |
136 if ">sp" in i: | |
137 if uniprot_accession_in == i.split("|")[1]: | |
138 match = count+1 | |
139 if 'GN=' in i: | |
140 lst = i.split('GN=') | |
141 lst2 = lst[1].split(' ') | |
142 genename = lst2[0] | |
143 if 'GN=' not in i: | |
144 genename = 'NA' | |
145 while ">sp" not in lines[match]: | |
146 if match <= db_len: | |
147 seqlength = seqlength + len(lines[match].strip()) | |
148 match = match + 1 | |
149 else: | |
150 break | |
151 return ReturnValue1(seqlength, genename) | |
152 count = count + 1 | |
153 | |
154 | |
155 if seqlength == 0: | |
156 error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') | |
157 error.close | |
158 seqlength = 'NA' | |
159 genename = 'NA' | |
160 return ReturnValue1(seqlength, genename) | |
161 | |
162 def readtab(infile): | |
163 with open(infile,'r') as x: # read in tab-delim text | |
164 output = [] | |
165 for line in x: | |
166 line = line.strip() | |
167 temp = line.split('\t') | |
168 output.append(temp) | |
169 return output | |
170 def read_scaffold(scaffold_input): # Get data, proteins and header from scaffold output | |
171 dupes = readtab(scaffold_input) | |
172 cnt = 0 | |
173 for i in dupes: | |
174 cnt += 1 | |
175 if i[0] == '#': # finds the start of second header | |
176 header_start = cnt-1 | |
177 header = dupes[header_start] | |
178 prot_start = header.index("Accession Number") | |
179 data = dupes[header_start+1:len(dupes)-2] # cut off blank line and END OF FILE | |
180 proteins = [] | |
181 for i in data: | |
182 i[4] = i[4].split()[0] # removes the (+##) that sometimes is attached | |
183 for protein in data: | |
184 proteins.append(protein[prot_start]) | |
185 return ReturnValue2(data, proteins, header) | |
186 def make_inter(scaffold_input): | |
187 bait = readtab(baitfile) | |
188 data = read_scaffold(scaffold_input).data | |
189 header = read_scaffold(scaffold_input).header | |
190 proteins = read_scaffold(scaffold_input).proteins | |
191 bait_index = [] | |
192 for i in bait: | |
193 bait_index.append(header.index(i[0])) # Find just the baits defined in bait file | |
194 with open('inter.txt', 'w') as y: | |
195 a = 0; l=0 | |
196 for bb in bait: | |
197 for lst in data: | |
198 y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n') | |
199 a+=1 | |
200 if a == len(proteins): | |
201 a = 0; l+=1 | |
202 def make_prey(scaffold_input): | |
203 proteins = read_scaffold(scaffold_input).proteins | |
204 output_file = open("prey.txt",'w') | |
205 for a in proteins: | |
206 a = a.replace("\n","") # remove \n for input into function | |
207 a = a.replace("\r","") # ditto for \r | |
208 seq = get_info(a).seqlength | |
209 GN = get_info(a).genename | |
210 if seq != 'NA': | |
211 output_file.write(a+"\t"+str(seq)+ "\t" + str(GN) + "\n") | |
212 output_file.close() | |
213 def no_error_inter(scaffold_input): # remake inter file without protein errors from Uniprot | |
214 err = readtab("error proteins.txt") | |
215 bait = readtab(baitfile) | |
216 data = read_scaffold(scaffold_input).data | |
217 header = read_scaffold(scaffold_input).header | |
218 bait_index = [] | |
219 for i in bait: | |
220 bait_index.append(header.index(i[0])) | |
221 proteins = read_scaffold(scaffold_input).proteins | |
222 errors = [] | |
223 for e in err: | |
224 errors.append(e[0]) | |
225 with open('inter.txt', 'w') as y: | |
226 l = 0; a = 0 | |
227 for bb in bait: | |
228 for lst in data: | |
229 if proteins[a] not in errors: | |
230 y.write(header[bait_index[l]] + '\t' + bb[1] + '\t' + proteins[a] + '\t' + lst[bait_index[l]] + '\n') | |
231 a+=1 | |
232 if a == len(proteins): | |
233 l += 1; a = 0 | |
234 def bait_check(bait, scaffold_input): # check that bait names share header titles | |
235 bait_in = readtab(bait) | |
236 header = read_scaffold(scaffold_input).header | |
237 for i in bait_in: | |
238 if i[0] not in header: | |
239 sys.exit("Bait must share header titles with Scaffold output") | |
240 | |
241 if __name__ == '__main__': | |
242 main(infile, baitfile) | |
243 | |
244 os.rename('inter.txt', sys.argv[4]) | |
245 os.rename("bait.txt", sys.argv[7]) |