comparison spaninFuncs.py @ 1:05b97a4dce94 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:51:44 +0000
parents
children
comparison
equal deleted inserted replaced
0:03670eba3480 1:05b97a4dce94
1 """
2 PREMISE
3 ### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
4 ###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
5 ###### Documentation will ATTEMPT to be thourough here
6 """
7
8 import re
9 from Bio import SeqIO
10 from Bio import Seq
11 from collections import OrderedDict
12
13 # Not written in OOP for a LITTLE bit of trying to keep the complication down in case adjustments are needed by someone else.
14 # Much of the manipulation is string based; so it should be straightforward as well as moderately quick
15 ################## GLobal Variables
16 Lys = "K"
17
18
19 def check_back_end_snorkels(seq, tmsize):
20 """
21 Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match.
22 --> seq : should be the sequence fed from the "search_region" portion of the sequence
23 --> tmsize : size of the potential TMD being investigated
24 """
25 found = []
26 if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]):
27 found = "match"
28 return found
29 elif seq[tmsize - 3] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 4]):
30 found = "match"
31 return found
32 elif seq[tmsize - 2] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 3]):
33 found = "match"
34 return found
35 elif seq[tmsize - 1] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 2]):
36 found = "match"
37 return found
38 else:
39 found = "NOTmatch"
40 return found
41
42
43 def prep_a_gff3(fa, spanin_type, org):
44 """
45 Function parses an input detailed 'fa' file and outputs a 'gff3' file
46 ---> fa = input .fa file
47 ---> output = output a returned list of data, easily portable to a gff3 next
48 ---> spanin_type = 'isp' or 'osp'
49 """
50 with org as f:
51 header = f.readline()
52 orgacc = header.split(" ")
53 orgacc = orgacc[0].split(">")[1].strip()
54 fa_zip = tuple_fasta(fa)
55 data = []
56 for a_pair in fa_zip:
57 # print(a_pair)
58 if re.search(("(\[1\])"), a_pair[0]):
59 strand = "+"
60 elif re.search(("(\[-1\])"), a_pair[0]):
61 strand = "-" # column 7
62 start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0] # column 4
63 end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1] # column 5
64 orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0) # column 1
65 if spanin_type == "isp":
66 methodtype = "CDS" # column 3
67 spanin = "isp"
68 elif spanin_type == "osp":
69 methodtype = "CDS" # column 3
70 spanin = "osp"
71 elif spanin_type == "usp":
72 methodtype = "CDS"
73 spanin = "usp"
74 else:
75 raise "need to input spanin type"
76 source = "cpt.py|putative-*.py" # column 2
77 score = "." # column 6
78 phase = "." # column 8
79 attributes = (
80 "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1]
81 ) # column 9
82 sequence = [
83 [orgacc, source, methodtype, start, end, score, strand, phase, attributes]
84 ]
85 data += sequence
86 return data
87
88
89 def write_gff3(data, output="results.gff3"):
90 """
91 Parses results from prep_a_gff3 into a gff3 file
92 ---> input : list from prep_a_gff3
93 ---> output : gff3 file
94 """
95 data = data
96 filename = output
97 with filename as f:
98 f.write("#gff-version 3\n")
99 for value in data:
100 f.write(
101 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
102 value[0],
103 value[1],
104 value[2],
105 value[3],
106 value[4],
107 value[5],
108 value[6],
109 value[7],
110 value[8],
111 )
112 )
113 f.close()
114
115
116 def find_tmd(
117 pair,
118 minimum=10,
119 maximum=30,
120 TMDmin=10,
121 TMDmax=20,
122 isp_mode=False,
123 peri_min=18,
124 peri_max=206,
125 ):
126 """
127 Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
128 ---> pair : Input of tuple with description and AA sequence (str)
129 ---> minimum : How close from the initial start codon a TMD can be within
130 ---> maximum : How far from the initial start codon a TMD can be within
131 ---> TMDmin : The minimum size that a transmembrane can be (default = 10)
132 ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20)
133 """
134 # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
135 tmd = []
136 s = str(pair[1]) # sequence being analyzed
137 # print(s) # for trouble shooting
138 if maximum > len(s):
139 maximum = len(s)
140 search_region = s[minimum - 1 : maximum + 1]
141 # print(f"this is the search region: {search_region}")
142 # print(search_region) # for trouble shooting
143
144 for tmsize in range(TMDmin, TMDmax + 1, 1):
145 # print(f"this is the current tmsize we're trying: {tmsize}")
146 # print('==============='+str(tmsize)+'================') # print for troubleshooting
147 pattern = (
148 "[PFIWLVMYCATGS]{" + str(tmsize) + "}"
149 ) # searches for these hydrophobic residues tmsize total times
150 # print(pattern)
151 # print(f"sending to regex: {search_region}")
152 if re.search(
153 ("[K]"), search_region[1:8]
154 ): # grabbing one below with search region, so I want to grab one ahead here when I query.
155 store_search = re.search(
156 ("[K]"), search_region[1:8]
157 ) # storing regex object
158 where_we_are = store_search.start() # finding where we got the hit
159 if re.search(
160 ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
161 ) and re.search(
162 ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1]
163 ): # hydrophobic neighbor
164 # try:
165 g = re.search(
166 ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
167 ).group()
168 backend = check_back_end_snorkels(search_region, tmsize)
169 if backend == "match":
170 if isp_mode:
171 g = re.search((pattern), search_region).group()
172 end_of_tmd = re.search((g), s).end() + 1
173 amt_peri = len(s) - end_of_tmd
174 if peri_min <= amt_peri <= peri_max:
175 pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
176 new_pair = (pair_desc, pair[1])
177 tmd.append(new_pair)
178 else:
179 tmd.append(pair)
180 else:
181 continue
182 # else:
183 # print("I'm continuing out of snorkel loop")
184 # print(f"{search_region}")
185 # continue
186 if re.search((pattern), search_region):
187 # print(f"found match: {}")
188 # print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE")
189 # try:
190 if isp_mode:
191 g = re.search((pattern), search_region).group()
192 end_of_tmd = re.search((g), s).end() + 1
193 amt_peri = len(s) - end_of_tmd
194 if peri_min <= amt_peri <= peri_max:
195 pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
196 new_pair = (pair_desc, pair[1])
197 tmd.append(new_pair)
198 else:
199 tmd.append(pair)
200 else:
201 continue
202
203 return tmd
204
205
206 def find_lipobox(
207 pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False
208 ):
209 """
210 Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox
211 ---> minimum - min distance from start codon to first AA of lipobox
212 ---> maximum - max distance from start codon to first AA of lipobox
213 ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy
214
215 """
216 if regex == 1:
217 pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl
218 elif regex == 2:
219 pattern = "[ACGSILMFTV][^REKD][GAS]C" # regex for Lipobox from LipoRy
220
221 candidates = []
222 s = str(pair[1])
223 # print(s) # trouble shooting
224 search_region = s[
225 minimum - 1 : maximum + 5
226 ] # properly slice the input... add 4 to catch if it hangs off at max input
227 # print(search_region) # trouble shooting
228 patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"]
229 for pattern in patterns:
230 # print(pattern) # trouble shooting
231 if re.search((pattern), search_region): # lipobox must be WITHIN the range...
232 # searches the sequence with the input RegEx AND omits if
233 g = re.search(
234 (pattern), search_region
235 ).group() # find the exact group match
236 amt_peri = len(s) - re.search((g), s).end() + 1
237 if min_after <= amt_peri <= max_after: # find the lipobox end region
238 if osp_mode:
239 pair_desc = pair[0] + ", peri_count~=" + str(amt_peri)
240 new_pair = (pair_desc, pair[1])
241 candidates.append(new_pair)
242 else:
243 candidates.append(pair)
244
245 return candidates
246
247
248 def tuple_fasta(fasta_file):
249 """
250 #### INPUT: Fasta File
251 #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence
252 ####
253 """
254 fasta = SeqIO.parse(fasta_file, "fasta")
255 descriptions = []
256 sequences = []
257 for r in fasta: # iterates and stores each description and sequence
258 description = r.description
259 sequence = str(r.seq)
260 if (
261 sequence[0] != "I"
262 ): # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I
263 descriptions.append(description)
264 sequences.append(sequence)
265 else:
266 continue
267
268 return zip(descriptions, sequences)
269
270
271 def lineWrapper(text, charactersize=60):
272
273 if len(text) <= charactersize:
274 return text
275 else:
276 return (
277 text[:charactersize]
278 + "\n"
279 + lineWrapper(text[charactersize:], charactersize)
280 )
281
282
283 def getDescriptions(fasta):
284 """
285 Takes an output FASTA file, and parses retrieves the description headers. These headers contain information needed
286 for finding locations of a potential i-spanin and o-spanin proximity to one another.
287 """
288 desc = []
289 with fasta as f:
290 for line in f:
291 if line.startswith(">"):
292 desc.append(line)
293 return desc
294
295
296 def splitStrands(text, strand="+"):
297 # positive_strands = []
298 # negative_strands = []
299 if strand == "+":
300 if re.search(("(\[1\])"), text):
301 return text
302 elif strand == "-":
303 if re.search(("(\[-1\])"), text):
304 return text
305 # return positive_strands, negative_strands
306
307
308 def parse_a_range(pair, start, end):
309 """
310 Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range
311 ---> data : fasta tuple data
312 ---> start : start range to keep
313 ---> end : end range to keep (will need to + 1)
314 """
315 matches = []
316 for each_pair in pair:
317
318 s = re.search(("[\d]+\.\."), each_pair[0]).group(0) # Start of the sequence
319 s = int(s.split("..")[0])
320 e = re.search(("\.\.[\d]+"), each_pair[0]).group(0)
321 e = int(e.split("..")[1])
322 if start - 1 <= s and e <= end + 1:
323 matches.append(each_pair)
324 else:
325 continue
326 # else:
327 # continue
328 # if matches != []:
329 return matches
330 # else:
331 # print('no candidates within selected range')
332
333
334 def grabLocs(text):
335 """
336 Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module
337 from cpt.py
338 """
339 start = re.search(("[\d]+\.\."), text).group(
340 0
341 ) # Start of the sequence ; looks for [numbers]..
342 end = re.search(("\.\.[\d]+"), text).group(
343 0
344 ) # End of the sequence ; Looks for ..[numbers]
345 orf = re.search(("(ORF)[\d]+"), text).group(
346 0
347 ) # Looks for ORF and the numbers that are after it
348 if re.search(("(\[1\])"), text): # stores strand
349 strand = "+"
350 elif re.search(("(\[-1\])"), text): # stores strand
351 strand = "-"
352 start = int(start.split("..")[0])
353 end = int(end.split("..")[1])
354 vals = [start, end, orf, strand]
355
356 return vals
357
358
359 def spaninProximity(isp, osp, max_dist=30):
360 """
361 _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
362 Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site
363 to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary (<user_input> * 3)
364 I modified this on 07.30.2020 to bypass the pick + or - strand. To
365 INPUT: list of OSP and ISP candidates
366 OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list
367 """
368
369 embedded = {}
370 overlap = {}
371 separate = {}
372 for iseq in isp:
373 embedded[iseq[2]] = []
374 overlap[iseq[2]] = []
375 separate[iseq[2]] = []
376 for oseq in osp:
377 if iseq[3] == "+":
378 if oseq[3] == "+":
379 if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
380 ### EMBEDDED ###
381 combo = [
382 iseq[0],
383 iseq[1],
384 oseq[2],
385 oseq[0],
386 oseq[1],
387 iseq[3],
388 ] # ordering a return for dic
389 embedded[iseq[2]] += [combo]
390 elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
391 ### OVERLAP / SEPARATE ###
392 if (iseq[1] - oseq[0]) < 6:
393 combo = [
394 iseq[0],
395 iseq[1],
396 oseq[2],
397 oseq[0],
398 oseq[1],
399 iseq[3],
400 ]
401 separate[iseq[2]] += [combo]
402 else:
403 combo = [
404 iseq[0],
405 iseq[1],
406 oseq[2],
407 oseq[0],
408 oseq[1],
409 iseq[3],
410 ]
411 overlap[iseq[2]] += [combo]
412 elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
413 combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]]
414 separate[iseq[2]] += [combo]
415 else:
416 continue
417 if iseq[3] == "-":
418 if oseq[3] == "-":
419 if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
420 ### EMBEDDED ###
421 combo = [
422 iseq[0],
423 iseq[1],
424 oseq[2],
425 oseq[0],
426 oseq[1],
427 iseq[3],
428 ] # ordering a return for dict
429 embedded[iseq[2]] += [combo]
430 elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
431 if (oseq[1] - iseq[0]) < 6:
432 combo = [
433 iseq[0],
434 iseq[1],
435 oseq[2],
436 oseq[0],
437 oseq[1],
438 iseq[3],
439 ]
440 separate[iseq[2]] += [combo]
441 else:
442 combo = [
443 iseq[0],
444 iseq[1],
445 oseq[2],
446 oseq[0],
447 oseq[1],
448 iseq[3],
449 ]
450 overlap[iseq[2]] += [combo]
451 elif iseq[0] - 10 < oseq[1] < iseq[0]:
452 combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]]
453 separate[iseq[2]] += [combo]
454 else:
455 continue
456
457 embedded = {k: embedded[k] for k in embedded if embedded[k]}
458 overlap = {k: overlap[k] for k in overlap if overlap[k]}
459 separate = {k: separate[k] for k in separate if separate[k]}
460
461 return embedded, overlap, separate
462
463
464 def check_for_usp():
465 "pass"
466
467
468 ############################################### TEST RANGE #########################################################################
469 ####################################################################################################################################
470 if __name__ == "__main__":
471
472 #### TMD TEST
473 test_desc = ["one", "two", "three", "four", "five"]
474 test_seq = [
475 "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
476 "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
477 "XXXXXXX",
478 "XXXXXXXXXXXKXXXXXXXXXX",
479 "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
480 ]
481 # for l in
482 # combo = zip(test_desc,test_seq)
483 pairs = zip(test_desc, test_seq)
484 tmd = []
485 for each_pair in pairs:
486 # print(each_pair)
487 try:
488 tmd += find_tmd(pair=each_pair)
489 except (IndexError, TypeError):
490 continue
491 # try:s = each_pair[1]
492 # tmd += find_tmd(seq=s, tmsize=15)
493 # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
494 # print(tmd)
495 # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
496
497 #### tuple-fasta TEST
498 # fasta_file = 'out_isp.fa'
499 # ret = tuple_fasta(fasta_file)
500 # print('=============')
501 # for i in ret:
502 # print(i[1])
503
504 #### LipoBox TEST
505 test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
506 test_seq = [
507 "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
508 "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
509 "XXXXXXX",
510 "AGGCXXXXXXXXXXXXXXXXXXXXTT",
511 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
512 "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
513 "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
514 ]
515 pairs = zip(test_desc, test_seq)
516 lipo = []
517 for each_pair in pairs:
518 # print(each_pair)
519 # try:
520 try:
521 lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8)
522 except TypeError: # catches if something doesnt have the min/max requirements (something is too small)
523 continue
524 # except:
525 # continue
526 # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
527 #############################3
528 # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
529 # print(g)
530 # write_gff3(data=g)