Mercurial > repos > cpt > cpt_putative_osp
comparison spaninFuncs.py @ 1:05b97a4dce94 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:51:44 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:03670eba3480 | 1:05b97a4dce94 |
---|---|
1 """ | |
2 PREMISE | |
3 ### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py | |
4 ###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution. | |
5 ###### Documentation will ATTEMPT to be thourough here | |
6 """ | |
7 | |
8 import re | |
9 from Bio import SeqIO | |
10 from Bio import Seq | |
11 from collections import OrderedDict | |
12 | |
13 # Not written in OOP for a LITTLE bit of trying to keep the complication down in case adjustments are needed by someone else. | |
14 # Much of the manipulation is string based; so it should be straightforward as well as moderately quick | |
15 ################## GLobal Variables | |
16 Lys = "K" | |
17 | |
18 | |
19 def check_back_end_snorkels(seq, tmsize): | |
20 """ | |
21 Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. | |
22 --> seq : should be the sequence fed from the "search_region" portion of the sequence | |
23 --> tmsize : size of the potential TMD being investigated | |
24 """ | |
25 found = [] | |
26 if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]): | |
27 found = "match" | |
28 return found | |
29 elif seq[tmsize - 3] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 4]): | |
30 found = "match" | |
31 return found | |
32 elif seq[tmsize - 2] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 3]): | |
33 found = "match" | |
34 return found | |
35 elif seq[tmsize - 1] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 2]): | |
36 found = "match" | |
37 return found | |
38 else: | |
39 found = "NOTmatch" | |
40 return found | |
41 | |
42 | |
43 def prep_a_gff3(fa, spanin_type, org): | |
44 """ | |
45 Function parses an input detailed 'fa' file and outputs a 'gff3' file | |
46 ---> fa = input .fa file | |
47 ---> output = output a returned list of data, easily portable to a gff3 next | |
48 ---> spanin_type = 'isp' or 'osp' | |
49 """ | |
50 with org as f: | |
51 header = f.readline() | |
52 orgacc = header.split(" ") | |
53 orgacc = orgacc[0].split(">")[1].strip() | |
54 fa_zip = tuple_fasta(fa) | |
55 data = [] | |
56 for a_pair in fa_zip: | |
57 # print(a_pair) | |
58 if re.search(("(\[1\])"), a_pair[0]): | |
59 strand = "+" | |
60 elif re.search(("(\[-1\])"), a_pair[0]): | |
61 strand = "-" # column 7 | |
62 start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0] # column 4 | |
63 end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1] # column 5 | |
64 orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0) # column 1 | |
65 if spanin_type == "isp": | |
66 methodtype = "CDS" # column 3 | |
67 spanin = "isp" | |
68 elif spanin_type == "osp": | |
69 methodtype = "CDS" # column 3 | |
70 spanin = "osp" | |
71 elif spanin_type == "usp": | |
72 methodtype = "CDS" | |
73 spanin = "usp" | |
74 else: | |
75 raise "need to input spanin type" | |
76 source = "cpt.py|putative-*.py" # column 2 | |
77 score = "." # column 6 | |
78 phase = "." # column 8 | |
79 attributes = ( | |
80 "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1] | |
81 ) # column 9 | |
82 sequence = [ | |
83 [orgacc, source, methodtype, start, end, score, strand, phase, attributes] | |
84 ] | |
85 data += sequence | |
86 return data | |
87 | |
88 | |
89 def write_gff3(data, output="results.gff3"): | |
90 """ | |
91 Parses results from prep_a_gff3 into a gff3 file | |
92 ---> input : list from prep_a_gff3 | |
93 ---> output : gff3 file | |
94 """ | |
95 data = data | |
96 filename = output | |
97 with filename as f: | |
98 f.write("#gff-version 3\n") | |
99 for value in data: | |
100 f.write( | |
101 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( | |
102 value[0], | |
103 value[1], | |
104 value[2], | |
105 value[3], | |
106 value[4], | |
107 value[5], | |
108 value[6], | |
109 value[7], | |
110 value[8], | |
111 ) | |
112 ) | |
113 f.close() | |
114 | |
115 | |
116 def find_tmd( | |
117 pair, | |
118 minimum=10, | |
119 maximum=30, | |
120 TMDmin=10, | |
121 TMDmax=20, | |
122 isp_mode=False, | |
123 peri_min=18, | |
124 peri_max=206, | |
125 ): | |
126 """ | |
127 Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD | |
128 ---> pair : Input of tuple with description and AA sequence (str) | |
129 ---> minimum : How close from the initial start codon a TMD can be within | |
130 ---> maximum : How far from the initial start codon a TMD can be within | |
131 ---> TMDmin : The minimum size that a transmembrane can be (default = 10) | |
132 ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20) | |
133 """ | |
134 # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S'] | |
135 tmd = [] | |
136 s = str(pair[1]) # sequence being analyzed | |
137 # print(s) # for trouble shooting | |
138 if maximum > len(s): | |
139 maximum = len(s) | |
140 search_region = s[minimum - 1 : maximum + 1] | |
141 # print(f"this is the search region: {search_region}") | |
142 # print(search_region) # for trouble shooting | |
143 | |
144 for tmsize in range(TMDmin, TMDmax + 1, 1): | |
145 # print(f"this is the current tmsize we're trying: {tmsize}") | |
146 # print('==============='+str(tmsize)+'================') # print for troubleshooting | |
147 pattern = ( | |
148 "[PFIWLVMYCATGS]{" + str(tmsize) + "}" | |
149 ) # searches for these hydrophobic residues tmsize total times | |
150 # print(pattern) | |
151 # print(f"sending to regex: {search_region}") | |
152 if re.search( | |
153 ("[K]"), search_region[1:8] | |
154 ): # grabbing one below with search region, so I want to grab one ahead here when I query. | |
155 store_search = re.search( | |
156 ("[K]"), search_region[1:8] | |
157 ) # storing regex object | |
158 where_we_are = store_search.start() # finding where we got the hit | |
159 if re.search( | |
160 ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] | |
161 ) and re.search( | |
162 ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1] | |
163 ): # hydrophobic neighbor | |
164 # try: | |
165 g = re.search( | |
166 ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] | |
167 ).group() | |
168 backend = check_back_end_snorkels(search_region, tmsize) | |
169 if backend == "match": | |
170 if isp_mode: | |
171 g = re.search((pattern), search_region).group() | |
172 end_of_tmd = re.search((g), s).end() + 1 | |
173 amt_peri = len(s) - end_of_tmd | |
174 if peri_min <= amt_peri <= peri_max: | |
175 pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) | |
176 new_pair = (pair_desc, pair[1]) | |
177 tmd.append(new_pair) | |
178 else: | |
179 tmd.append(pair) | |
180 else: | |
181 continue | |
182 # else: | |
183 # print("I'm continuing out of snorkel loop") | |
184 # print(f"{search_region}") | |
185 # continue | |
186 if re.search((pattern), search_region): | |
187 # print(f"found match: {}") | |
188 # print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") | |
189 # try: | |
190 if isp_mode: | |
191 g = re.search((pattern), search_region).group() | |
192 end_of_tmd = re.search((g), s).end() + 1 | |
193 amt_peri = len(s) - end_of_tmd | |
194 if peri_min <= amt_peri <= peri_max: | |
195 pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) | |
196 new_pair = (pair_desc, pair[1]) | |
197 tmd.append(new_pair) | |
198 else: | |
199 tmd.append(pair) | |
200 else: | |
201 continue | |
202 | |
203 return tmd | |
204 | |
205 | |
206 def find_lipobox( | |
207 pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False | |
208 ): | |
209 """ | |
210 Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox | |
211 ---> minimum - min distance from start codon to first AA of lipobox | |
212 ---> maximum - max distance from start codon to first AA of lipobox | |
213 ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy | |
214 | |
215 """ | |
216 if regex == 1: | |
217 pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl | |
218 elif regex == 2: | |
219 pattern = "[ACGSILMFTV][^REKD][GAS]C" # regex for Lipobox from LipoRy | |
220 | |
221 candidates = [] | |
222 s = str(pair[1]) | |
223 # print(s) # trouble shooting | |
224 search_region = s[ | |
225 minimum - 1 : maximum + 5 | |
226 ] # properly slice the input... add 4 to catch if it hangs off at max input | |
227 # print(search_region) # trouble shooting | |
228 patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"] | |
229 for pattern in patterns: | |
230 # print(pattern) # trouble shooting | |
231 if re.search((pattern), search_region): # lipobox must be WITHIN the range... | |
232 # searches the sequence with the input RegEx AND omits if | |
233 g = re.search( | |
234 (pattern), search_region | |
235 ).group() # find the exact group match | |
236 amt_peri = len(s) - re.search((g), s).end() + 1 | |
237 if min_after <= amt_peri <= max_after: # find the lipobox end region | |
238 if osp_mode: | |
239 pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) | |
240 new_pair = (pair_desc, pair[1]) | |
241 candidates.append(new_pair) | |
242 else: | |
243 candidates.append(pair) | |
244 | |
245 return candidates | |
246 | |
247 | |
248 def tuple_fasta(fasta_file): | |
249 """ | |
250 #### INPUT: Fasta File | |
251 #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence | |
252 #### | |
253 """ | |
254 fasta = SeqIO.parse(fasta_file, "fasta") | |
255 descriptions = [] | |
256 sequences = [] | |
257 for r in fasta: # iterates and stores each description and sequence | |
258 description = r.description | |
259 sequence = str(r.seq) | |
260 if ( | |
261 sequence[0] != "I" | |
262 ): # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I | |
263 descriptions.append(description) | |
264 sequences.append(sequence) | |
265 else: | |
266 continue | |
267 | |
268 return zip(descriptions, sequences) | |
269 | |
270 | |
271 def lineWrapper(text, charactersize=60): | |
272 | |
273 if len(text) <= charactersize: | |
274 return text | |
275 else: | |
276 return ( | |
277 text[:charactersize] | |
278 + "\n" | |
279 + lineWrapper(text[charactersize:], charactersize) | |
280 ) | |
281 | |
282 | |
283 def getDescriptions(fasta): | |
284 """ | |
285 Takes an output FASTA file, and parses retrieves the description headers. These headers contain information needed | |
286 for finding locations of a potential i-spanin and o-spanin proximity to one another. | |
287 """ | |
288 desc = [] | |
289 with fasta as f: | |
290 for line in f: | |
291 if line.startswith(">"): | |
292 desc.append(line) | |
293 return desc | |
294 | |
295 | |
296 def splitStrands(text, strand="+"): | |
297 # positive_strands = [] | |
298 # negative_strands = [] | |
299 if strand == "+": | |
300 if re.search(("(\[1\])"), text): | |
301 return text | |
302 elif strand == "-": | |
303 if re.search(("(\[-1\])"), text): | |
304 return text | |
305 # return positive_strands, negative_strands | |
306 | |
307 | |
308 def parse_a_range(pair, start, end): | |
309 """ | |
310 Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range | |
311 ---> data : fasta tuple data | |
312 ---> start : start range to keep | |
313 ---> end : end range to keep (will need to + 1) | |
314 """ | |
315 matches = [] | |
316 for each_pair in pair: | |
317 | |
318 s = re.search(("[\d]+\.\."), each_pair[0]).group(0) # Start of the sequence | |
319 s = int(s.split("..")[0]) | |
320 e = re.search(("\.\.[\d]+"), each_pair[0]).group(0) | |
321 e = int(e.split("..")[1]) | |
322 if start - 1 <= s and e <= end + 1: | |
323 matches.append(each_pair) | |
324 else: | |
325 continue | |
326 # else: | |
327 # continue | |
328 # if matches != []: | |
329 return matches | |
330 # else: | |
331 # print('no candidates within selected range') | |
332 | |
333 | |
334 def grabLocs(text): | |
335 """ | |
336 Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module | |
337 from cpt.py | |
338 """ | |
339 start = re.search(("[\d]+\.\."), text).group( | |
340 0 | |
341 ) # Start of the sequence ; looks for [numbers].. | |
342 end = re.search(("\.\.[\d]+"), text).group( | |
343 0 | |
344 ) # End of the sequence ; Looks for ..[numbers] | |
345 orf = re.search(("(ORF)[\d]+"), text).group( | |
346 0 | |
347 ) # Looks for ORF and the numbers that are after it | |
348 if re.search(("(\[1\])"), text): # stores strand | |
349 strand = "+" | |
350 elif re.search(("(\[-1\])"), text): # stores strand | |
351 strand = "-" | |
352 start = int(start.split("..")[0]) | |
353 end = int(end.split("..")[1]) | |
354 vals = [start, end, orf, strand] | |
355 | |
356 return vals | |
357 | |
358 | |
359 def spaninProximity(isp, osp, max_dist=30): | |
360 """ | |
361 _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_ | |
362 Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site | |
363 to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary (<user_input> * 3) | |
364 I modified this on 07.30.2020 to bypass the pick + or - strand. To | |
365 INPUT: list of OSP and ISP candidates | |
366 OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list | |
367 """ | |
368 | |
369 embedded = {} | |
370 overlap = {} | |
371 separate = {} | |
372 for iseq in isp: | |
373 embedded[iseq[2]] = [] | |
374 overlap[iseq[2]] = [] | |
375 separate[iseq[2]] = [] | |
376 for oseq in osp: | |
377 if iseq[3] == "+": | |
378 if oseq[3] == "+": | |
379 if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]: | |
380 ### EMBEDDED ### | |
381 combo = [ | |
382 iseq[0], | |
383 iseq[1], | |
384 oseq[2], | |
385 oseq[0], | |
386 oseq[1], | |
387 iseq[3], | |
388 ] # ordering a return for dic | |
389 embedded[iseq[2]] += [combo] | |
390 elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]: | |
391 ### OVERLAP / SEPARATE ### | |
392 if (iseq[1] - oseq[0]) < 6: | |
393 combo = [ | |
394 iseq[0], | |
395 iseq[1], | |
396 oseq[2], | |
397 oseq[0], | |
398 oseq[1], | |
399 iseq[3], | |
400 ] | |
401 separate[iseq[2]] += [combo] | |
402 else: | |
403 combo = [ | |
404 iseq[0], | |
405 iseq[1], | |
406 oseq[2], | |
407 oseq[0], | |
408 oseq[1], | |
409 iseq[3], | |
410 ] | |
411 overlap[iseq[2]] += [combo] | |
412 elif iseq[1] <= oseq[0] <= iseq[1] + max_dist: | |
413 combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] | |
414 separate[iseq[2]] += [combo] | |
415 else: | |
416 continue | |
417 if iseq[3] == "-": | |
418 if oseq[3] == "-": | |
419 if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]: | |
420 ### EMBEDDED ### | |
421 combo = [ | |
422 iseq[0], | |
423 iseq[1], | |
424 oseq[2], | |
425 oseq[0], | |
426 oseq[1], | |
427 iseq[3], | |
428 ] # ordering a return for dict | |
429 embedded[iseq[2]] += [combo] | |
430 elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]: | |
431 if (oseq[1] - iseq[0]) < 6: | |
432 combo = [ | |
433 iseq[0], | |
434 iseq[1], | |
435 oseq[2], | |
436 oseq[0], | |
437 oseq[1], | |
438 iseq[3], | |
439 ] | |
440 separate[iseq[2]] += [combo] | |
441 else: | |
442 combo = [ | |
443 iseq[0], | |
444 iseq[1], | |
445 oseq[2], | |
446 oseq[0], | |
447 oseq[1], | |
448 iseq[3], | |
449 ] | |
450 overlap[iseq[2]] += [combo] | |
451 elif iseq[0] - 10 < oseq[1] < iseq[0]: | |
452 combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] | |
453 separate[iseq[2]] += [combo] | |
454 else: | |
455 continue | |
456 | |
457 embedded = {k: embedded[k] for k in embedded if embedded[k]} | |
458 overlap = {k: overlap[k] for k in overlap if overlap[k]} | |
459 separate = {k: separate[k] for k in separate if separate[k]} | |
460 | |
461 return embedded, overlap, separate | |
462 | |
463 | |
464 def check_for_usp(): | |
465 "pass" | |
466 | |
467 | |
468 ############################################### TEST RANGE ######################################################################### | |
469 #################################################################################################################################### | |
470 if __name__ == "__main__": | |
471 | |
472 #### TMD TEST | |
473 test_desc = ["one", "two", "three", "four", "five"] | |
474 test_seq = [ | |
475 "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX", | |
476 "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX", | |
477 "XXXXXXX", | |
478 "XXXXXXXXXXXKXXXXXXXXXX", | |
479 "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX", | |
480 ] | |
481 # for l in | |
482 # combo = zip(test_desc,test_seq) | |
483 pairs = zip(test_desc, test_seq) | |
484 tmd = [] | |
485 for each_pair in pairs: | |
486 # print(each_pair) | |
487 try: | |
488 tmd += find_tmd(pair=each_pair) | |
489 except (IndexError, TypeError): | |
490 continue | |
491 # try:s = each_pair[1] | |
492 # tmd += find_tmd(seq=s, tmsize=15) | |
493 # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') | |
494 # print(tmd) | |
495 # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') | |
496 | |
497 #### tuple-fasta TEST | |
498 # fasta_file = 'out_isp.fa' | |
499 # ret = tuple_fasta(fasta_file) | |
500 # print('=============') | |
501 # for i in ret: | |
502 # print(i[1]) | |
503 | |
504 #### LipoBox TEST | |
505 test_desc = ["one", "two", "three", "four", "five", "six", "seven"] | |
506 test_seq = [ | |
507 "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX", | |
508 "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX", | |
509 "XXXXXXX", | |
510 "AGGCXXXXXXXXXXXXXXXXXXXXTT", | |
511 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC", | |
512 "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC", | |
513 "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*", | |
514 ] | |
515 pairs = zip(test_desc, test_seq) | |
516 lipo = [] | |
517 for each_pair in pairs: | |
518 # print(each_pair) | |
519 # try: | |
520 try: | |
521 lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8) | |
522 except TypeError: # catches if something doesnt have the min/max requirements (something is too small) | |
523 continue | |
524 # except: | |
525 # continue | |
526 # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n') | |
527 #############################3 | |
528 # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp') | |
529 # print(g) | |
530 # write_gff3(data=g) |