comparison findSpanin.py @ 3:fd70980a516b draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:42:01 +0000
parents
children 673d1776d3b9
comparison
equal deleted inserted replaced
2:1a7fef71aee3 3:fd70980a516b
1 ##### findSpanin.pl --> findSpanin.py
2 ######### Much of this code is very "blocked", in the sense that one thing happens...then a function happens on the return...then another function...etc...etc...
3
4 import argparse
5 import os
6 import re # new
7 import itertools # new
8 from collections import Counter, OrderedDict
9 from spaninFuncs import (
10 getDescriptions,
11 grabLocs,
12 spaninProximity,
13 splitStrands,
14 tuple_fasta,
15 lineWrapper,
16 )
17
18 ### Requirement Inputs
19 #### INPUT : putative_isp.fa & putative_osp.fa (in that order)
20 #### PARAMETERS :
21
22 ###############################################################################
def write_output(candidates):
    """Placeholder for a dedicated results-writing routine; not used yet."""
    pass
27
def reconfigure_dict(spanins):
    """
    Reorganize the spanin dict into per-strand coordinate lists.

    For every spanin type the hits are regrouped under
    ``"positive"``/``"negative"`` -> ``"coords"`` according to the strand
    symbol. The owning ORF name is prepended to each hit record in place
    (the input lists are mutated), so after this call the strand symbol
    sits at index 6 of every record.
    """
    reorganized = {}
    for spanin_type, orf_hits in spanins.items():
        strand_buckets = {
            "positive": {"coords": []},
            "negative": {"coords": []},
        }
        reorganized[spanin_type] = strand_buckets
        for orf_name, hits in orf_hits.items():
            for hit in hits:
                # Tag the record with the ORF it came from (in-place).
                hit.insert(0, orf_name)
                if hit[6] == "+":
                    strand_buckets["positive"]["coords"].append(hit)
                elif hit[6] == "-":
                    strand_buckets["negative"]["coords"].append(hit)
    return reorganized
71
72
def check_for_uniques(spanins):
    """
    Tally candidate pairs and unique pairs per spanin type.

    Uniqueness is keyed on the positive-strand i-spanin end site (record
    index 2) or the negative-strand start site (record index 1): if ORF1,
    ORF2 and ORF3 all end at location 4231 they count as ONE unique pair.

    Mutates ``spanins`` in place, adding per-type "uniques"/"amount" and
    per-strand "amt_positive"/"pos_amt_unique" (resp. negative) counters,
    plus file-level "total_amount"/"total_unique" summed over the
    EMBEDDED / SEPARATED / OVERLAPPED types.

    Returns (spanins, pair_dict) where pair_dict["pairs"] maps
    "location_amount" -> {location: hit count} and
    "pair_number" -> {location: sequential pair id starting at 1}.
    """
    pair_dict = {
        "pairs": {
            "location_amount": [],
            "pair_number": {},
        }
    }
    for each_spanin_type, spanin_data in spanins.items():
        spanin_data["uniques"] = 0
        spanin_data["amount"] = 0

        # Positive strand: unique key is the end site (index 2).
        # NOTE: the old ``count(x) >= 1`` filter was always true; a plain
        # set() gives the same uniques without the O(n^2) counting.
        pos_check = [hit[2] for hit in spanin_data["positive"]["coords"]]
        pair_dict["pairs"]["location_amount"].extend(pos_check)
        spanin_data["positive"]["amt_positive"] = len(pos_check)
        spanin_data["positive"]["pos_amt_unique"] = len(set(pos_check))

        # Negative strand: unique key is the start site (index 1).
        neg_check = [hit[1] for hit in spanin_data["negative"]["coords"]]
        pair_dict["pairs"]["location_amount"].extend(neg_check)
        spanin_data["negative"]["amt_negative"] = len(neg_check)
        spanin_data["negative"]["neg_amt_unique"] = len(set(neg_check))

        spanin_data["uniques"] += (
            spanin_data["positive"]["pos_amt_unique"]
            + spanin_data["negative"]["neg_amt_unique"]
        )
        spanin_data["amount"] += (
            spanin_data["positive"]["amt_positive"]
            + spanin_data["negative"]["amt_negative"]
        )

    # Collapse the location list into {location: occurrence count};
    # Counter preserves first-seen order, which fixes the pair numbering.
    pair_dict["pairs"]["location_amount"] = dict(
        Counter(pair_dict["pairs"]["location_amount"])
    )
    for pair_id, loc in enumerate(pair_dict["pairs"]["location_amount"], start=1):
        pair_dict["pairs"]["pair_number"][loc] = pair_id

    spanins["total_amount"] = (
        spanins["EMBEDDED"]["amount"]
        + spanins["SEPARATED"]["amount"]
        + spanins["OVERLAPPED"]["amount"]
    )
    spanins["total_unique"] = (
        spanins["EMBEDDED"]["uniques"]
        + spanins["SEPARATED"]["uniques"]
        + spanins["OVERLAPPED"]["uniques"]
    )
    return spanins, pair_dict
196
197
if __name__ == "__main__":

    # Common parameters for both ISP / OSP portion of script

    parser = argparse.ArgumentParser(
        description="Trim the putative protein candidates and find potential i-spanin / o-spanin pairs"
    )

    parser.add_argument(
        "putative_isp_fasta_file",
        type=argparse.FileType("r"),
        help='Putative i-spanin FASTA file, output of "generate-putative-isp"',
    )  # the "input" argument

    parser.add_argument(
        "putative_osp_fasta_file",
        type=argparse.FileType("r"),
        help='Putative o-spanin FASTA file, output of "generate-putative-osp"',
    )

    parser.add_argument(
        "--max_isp_osp_distance",
        dest="max_isp_osp_distance",
        default=10,
        type=int,
        help="max distance from end of i-spanin to start of o-spanin, measured in AAs",
    )

    parser.add_argument(
        "--embedded_txt",
        dest="embedded_txt",
        type=argparse.FileType("w"),
        default="_findSpanin_embedded_results.txt",
        help="Results of potential embedded spanins",
    )
    parser.add_argument(
        "--overlap_txt",
        dest="overlap_txt",
        type=argparse.FileType("w"),
        default="_findSpanin_overlap_results.txt",
        help="Results of potential overlapping spanins",
    )
    parser.add_argument(
        "--separate_txt",
        dest="separate_txt",
        type=argparse.FileType("w"),
        default="_findSpanin_separated_results.txt",
        help="Results of potential separated spanins",
    )

    parser.add_argument(
        "--summary_txt",
        dest="summary_txt",
        type=argparse.FileType("w"),
        default="_findSpanin_summary.txt",
        help="Results of potential spanin pairs",
    )
    parser.add_argument(
        "-v", action="version", version="0.3.0"
    )  # Is this manually updated?
    args = parser.parse_args()

    #### RE-WRITE
    # Buckets for the three relationships an i-spanin/o-spanin pair can have.
    SPANIN_TYPES = {}
    SPANIN_TYPES["EMBEDDED"] = {}
    SPANIN_TYPES["OVERLAPPED"] = {}
    SPANIN_TYPES["SEPARATED"] = {}
    # SPANIN_TYPES = {
    # 'EMBEDDED' : {},
    # 'OVERLAPPED' : {},
    # 'SEPARATED' : {},
    # }

    # Read descriptions, then reopen the handle so tuple_fasta starts from the
    # top of the file again (getDescriptions presumably consumes the handle —
    # the reopen-by-name pattern below is repeated before every tuple_fasta call).
    isp = getDescriptions(args.putative_isp_fasta_file)
    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
    isp_full = tuple_fasta(args.putative_isp_fasta_file)

    osp = getDescriptions(args.putative_osp_fasta_file)
    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
    osp_full = tuple_fasta(args.putative_osp_fasta_file)

    #### location data
    # Parse each FASTA description into location records, split by spanin kind.
    location_data = {"isp": [], "osp": []}
    spanins = [isp, osp]
    for idx, each_spanin_type in enumerate(spanins):
        for description in each_spanin_type:
            locations = grabLocs(description)
            if idx == 0:  # i-spanin
                location_data["isp"].append(locations)
            elif idx == 1:  # o-spanin
                location_data["osp"].append(locations)

    #### Check for types of spanins
    # max_dist is converted from amino acids to nucleotides (x3).
    embedded, overlap, separate = spaninProximity(
        isp=location_data["isp"],
        osp=location_data["osp"],
        max_dist=args.max_isp_osp_distance * 3,
    )

    SPANIN_TYPES["EMBEDDED"] = embedded
    SPANIN_TYPES["OVERLAPPED"] = overlap
    SPANIN_TYPES["SEPARATED"] = separate

    # for spanin_type, spanin in SPANIN_TYPES.items():
    # s = 0
    # for sequence in spanin.values():
    # s += len(sequence)
    # SPANIN_TYPES[spanin_type]['amount'] = s
    # SPANIN_TYPES[spanin_type]['unique'] = len(spanin.keys())

    # check_for_unique_spanins(SPANIN_TYPES)
    spanins = reconfigure_dict(SPANIN_TYPES)
    spanins, pair_dict = check_for_uniques(spanins)
    # print(pair_dict)

    # ---- Summary report: per-type statistics plus an overall tally. ----
    with args.summary_txt as f:
        for each_spanin_type, spanin_data in spanins.items():
            try:
                # "total_amount"/"total_unique" map to ints, not per-type dicts;
                # the filter skips them and the except is a second safety net.
                if each_spanin_type not in ["total_amount", "total_unique"]:
                    # print(each_spanin_type)
                    # print(each_spanin_type)
                    f.write(
                        "=~~~~~= "
                        + str(each_spanin_type)
                        + " Spanin Candidate Statistics =~~~~~=\n"
                    )
                    f.writelines(
                        "Total Candidate Pairs = " + str(spanin_data["amount"]) + "\n"
                    )
                    f.writelines(
                        "Total Unique Pairs = " + str(spanin_data["uniques"]) + "\n"
                    )
                    if each_spanin_type == "EMBEDDED":
                        for k, v in SPANIN_TYPES["EMBEDDED"].items():
                            # print(k)
                            f.writelines(
                                ""
                                + str(k)
                                + " ==> Amount of corresponding candidate o-spanins(s): "
                                + str(len(v))
                                + "\n"
                            )
                    if each_spanin_type == "SEPARATED":
                        for k, v in SPANIN_TYPES["SEPARATED"].items():
                            f.writelines(
                                ""
                                + str(k)
                                + " ==> Amount of corresponding candidate o-spanins(s): "
                                + str(len(v))
                                + "\n"
                            )
                    if each_spanin_type == "OVERLAPPED":
                        for k, v in SPANIN_TYPES["OVERLAPPED"].items():
                            f.writelines(
                                ""
                                + str(k)
                                + " ==> Amount of corresponding candidate o-spanins(s): "
                                + str(len(v))
                                + "\n"
                            )
            except TypeError:
                continue
        f.write("\n=~~~~~= Tally from ALL spanin types =~~~~~=\n")
        f.writelines("Total Candidates = " + str(spanins["total_amount"]) + "\n")
        f.writelines(
            "Total Unique Candidate Pairs = " + str(spanins["total_unique"]) + "\n"
        )

    # ---- EMBEDDED: collect the matching sequences, then write the report. ----
    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
    isp_full = tuple_fasta(args.putative_isp_fasta_file)

    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
    osp_full = tuple_fasta(args.putative_osp_fasta_file)

    # print(isp_full)
    isp_seqs = []
    osp_seqs = []
    for isp_tupe in isp_full:
        # print(isp_tupe)
        for pisp, posp in embedded.items():
            # print(f"ISP = searching for {pisp} in {isp_tupe[0]}")
            # \D guard: the ORF id must be followed by a non-digit so e.g.
            # "ORF1" does not also match "ORF10".
            if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]):
                # print(isp_tupe[0])
                # print(peri_count)
                # peri_count is the text after "~=" in the FASTA description.
                peri_count = str.split(isp_tupe[0], "~=")[1]
                isp_seqs.append((pisp, isp_tupe[1], peri_count))
    # print(isp_seqs)
    for osp_tupe in osp_full:
        for pisp, posp in embedded.items():
            for data in posp:
                # print(f"OSP = searching for {data[3]} in {osp_tupe[0]}, coming from this object: {data}")
                if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]):
                    peri_count = str.split(osp_tupe[0], "~=")[1]
                    osp_seqs.append((data[3], osp_tupe[1], peri_count))

    with args.embedded_txt as f:
        f.write("================ embedded spanin candidates =================\n")
        f.write(
            "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
        )
        if embedded != {}:
            # print(embedded)
            for pisp, posp in embedded.items():
                # print(f"{pisp} - {posp}")
                f.write(pisp + "\n")
                for each_posp in posp:
                    # print(posp)
                    # Columns match the header above: indices 1-6 are
                    # isp_start, isp_end, osp, osp_start, osp_end, strand.
                    f.write(
                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
                            each_posp[1],
                            each_posp[2],
                            each_posp[3],
                            each_posp[4],
                            each_posp[5],
                            each_posp[6],
                        )
                    )
                    # Pair number is keyed on end site (+ strand) or start
                    # site (- strand), mirroring check_for_uniques.
                    if each_posp[6] == "+":
                        if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
                            f.write(
                                ""
                                + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
                                + "\n"
                            )
                    elif each_posp[6] == "-":
                        if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
                            f.write(
                                ""
                                + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
                                + "\n"
                            )
        else:
            f.write("nothing found")

    # Reopen in append mode to add the sequences below the table.
    with open(args.embedded_txt.name, "a") as f:
        f.write("\n================= embedded candidate sequences ================\n")
        f.write("======================= isp ==========================\n\n")
        for isp_data in isp_seqs:
            # print(isp_data)
            f.write(
                ">isp_orf::{}-peri_count~={}\n{}\n".format(
                    isp_data[0], isp_data[2], lineWrapper(isp_data[1])
                )
            )
        f.write("\n======================= osp ========================\n\n")
        for osp_data in osp_seqs:
            f.write(
                ">osp_orf::{}-peri_count~={}\n{}\n".format(
                    osp_data[0], osp_data[2], lineWrapper(osp_data[1])
                )
            )

    # ---- OVERLAPPED: same collect-then-report pattern as EMBEDDED. ----
    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
    isp_full = tuple_fasta(args.putative_isp_fasta_file)

    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
    osp_full = tuple_fasta(args.putative_osp_fasta_file)

    isp_seqs = []
    osp_seqs = []
    for isp_tupe in isp_full:
        peri_count = str.split(isp_tupe[0], "~=")[1]
        for pisp, posp in overlap.items():
            if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]):
                peri_count = str.split(isp_tupe[0], "~=")[1]
                isp_seqs.append((pisp, isp_tupe[1], peri_count))

    for osp_tupe in osp_full:
        for pisp, posp in overlap.items():
            for data in posp:
                if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]):
                    peri_count = str.split(osp_tupe[0], "~=")[1]
                    osp_seqs.append((data[3], osp_tupe[1], peri_count))

    with args.overlap_txt as f:
        f.write("================ overlap spanin candidates =================\n")
        f.write(
            "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
        )
        if overlap != {}:
            for pisp, posp in overlap.items():
                f.write(pisp + "\n")
                for each_posp in posp:
                    f.write(
                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
                            each_posp[1],
                            each_posp[2],
                            each_posp[3],
                            each_posp[4],
                            each_posp[5],
                            each_posp[6],
                        )
                    )
                    if each_posp[6] == "+":
                        if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
                            # print('ovl ; +')
                            f.write(
                                ""
                                + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
                                + "\n"
                            )
                    elif each_posp[6] == "-":
                        if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
                            f.write(
                                ""
                                + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
                                + "\n"
                            )
        else:
            f.write("nothing found")

    with open(args.overlap_txt.name, "a") as f:
        # print(isp_seqs)
        f.write("\n================= overlap candidate sequences ================\n")
        f.write("======================= isp ==========================\n\n")
        for isp_data in isp_seqs:
            # NOTE(review): header says "-pericount~=" here but
            # "-peri_count~=" in the embedded report — likely unintended
            # inconsistency; preserved as-is.
            f.write(
                ">isp_orf::{}-pericount~={}\n{}\n".format(
                    isp_data[0], isp_data[2], lineWrapper(isp_data[1])
                )
            )
        f.write("\n======================= osp ========================\n\n")
        for osp_data in osp_seqs:
            f.write(
                ">osp_orf::{}-pericount~={}\n{}\n".format(
                    osp_data[0], osp_data[2], lineWrapper(osp_data[1])
                )
            )

    # ---- SEPARATED: same collect-then-report pattern as EMBEDDED. ----
    args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r")
    isp_full = tuple_fasta(args.putative_isp_fasta_file)
    args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r")
    osp_full = tuple_fasta(args.putative_osp_fasta_file)

    isp_seqs = []
    osp_seqs = []
    for isp_tupe in isp_full:
        for pisp, posp in separate.items():
            if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]):
                peri_count = str.split(isp_tupe[0], "~=")[1]
                isp_seqs.append((pisp, isp_tupe[1], peri_count))
    # print(isp_seqs)
    for osp_tupe in osp_full:
        for pisp, posp in separate.items():
            for data in posp:
                if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]):
                    peri_count = str.split(osp_tupe[0], "~=")[1]
                    osp_seqs.append((data[3], osp_tupe[1], peri_count))

    with args.separate_txt as f:
        f.write("================ separated spanin candidates =================\n")
        f.write(
            "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n"
        )
        if separate != {}:
            for pisp, posp in separate.items():
                f.write(pisp + "\n")
                for each_posp in posp:
                    f.write(
                        "\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
                            each_posp[1],
                            each_posp[2],
                            each_posp[3],
                            each_posp[4],
                            each_posp[5],
                            each_posp[6],
                        )
                    )
                    if each_posp[6] == "+":
                        if each_posp[2] in pair_dict["pairs"]["pair_number"].keys():
                            f.write(
                                ""
                                + str(pair_dict["pairs"]["pair_number"][each_posp[2]])
                                + "\n"
                            )
                    elif each_posp[6] == "-":
                        if each_posp[1] in pair_dict["pairs"]["pair_number"].keys():
                            f.write(
                                ""
                                + str(pair_dict["pairs"]["pair_number"][each_posp[1]])
                                + "\n"
                            )
        else:
            f.write("nothing found")

    with open(args.separate_txt.name, "a") as f:
        f.write("\n================= separated candidate sequences ================\n")
        f.write("======================= isp ==========================\n\n")
        for isp_data in isp_seqs:
            f.write(
                ">isp_orf::{}-pericount~={}\n{}\n".format(
                    isp_data[0], isp_data[2], lineWrapper(isp_data[1])
                )
            )
        f.write("\n======================= osp ========================\n\n")
        for osp_data in osp_seqs:
            f.write(
                ">osp_orf::{}-pericount~={}\n{}\n".format(
                    osp_data[0], osp_data[2], lineWrapper(osp_data[1])
                )
            )