comparison trips_create_annotation/create_annotation_sqlite.py @ 0:f24aed7a5cc7 draft

Uploaded
author jackcurragh
date Mon, 04 Apr 2022 09:30:51 +0000
parents
children d70696d3341e
comparison
equal deleted inserted replaced
-1:000000000000 0:f24aed7a5cc7
1 # Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file
2 # and produces an sqlite file which can be uploaded to Trips-Viz
3 # All co-ordinates produced are 1 based
4 # All start codon positions (including cds_start) should be at the first nucleotide of the codon
5 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon
6 import sys
7 import re
8 import sqlite3
9 from intervaltree import Interval, IntervalTree
10 import itertools
11 from sqlitedict import SqliteDict
12 import os
13
14 organism = sys.argv[1]
15 # This should be a GTF or GFF3 file
16 annotation_file = open(sys.argv[2], "r")
17 # This needs to be the transcriptomic fasta file
18 fasta_file = open(sys.argv[3], "r")
19 # This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs
20 pseudo_utr_len = int(sys.argv[4])
21 # An example of a transcript_id from the annotation file, e.g ENST000000123456
22 user_transcript_id = sys.argv[5]
23 # An example of a gene name from the annotation file
24 user_gene_name = sys.argv[6]
25 # Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1
26 TRAN_VERSION = True
27
28
29 if os.path.isfile("{}.sqlite".format(organism)):
30 print("{}.sqlite already exists".format(organism))
31 sys.exit()
32
33
34 # old_exons = SqliteDict(
35 # "/home/DATA/www/tripsviz/tripsviz/trips_annotations/mus_musculus/transcriptomic_to_genomic.sqlite"
36 # )
37
38
39 delimiters = {
40 "transcripts": {"before": [], "after": ["."], "annot_types": ["cds", "utr"]},
41 "genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]},
42 }
43
44 punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"]
45 # Find delimiters in the annotation and fasta files using the user_transcript_id
46 # and user_gene_name examples given by user.
47 for line in annotation_file:
48 if user_transcript_id in line:
49 tabsplitline = line.split("\t")
50 annot_type = tabsplitline[2]
51 if annot_type not in delimiters["transcripts"]["annot_types"]:
52 delimiters["transcripts"]["annot_types"].append(annot_type.lower())
53 splitline = line.split(user_transcript_id)
54 before_delimiter = splitline[0]
55 for item in punctuation:
56 if item in before_delimiter:
57 if len(before_delimiter.split(item)[-1]) >= 5:
58 before_delimiter = before_delimiter.split(item)[-1]
59 after_delimiter = splitline[1][:2]
60 if (
61 before_delimiter not in delimiters["transcripts"]["before"]
62 and len(before_delimiter) >= 5
63 ):
64 delimiters["transcripts"]["before"].append(before_delimiter)
65 if after_delimiter not in delimiters["transcripts"]["after"]:
66 delimiters["transcripts"]["after"].append(after_delimiter)
67 if user_gene_name in line:
68 tabsplitline = line.split("\t")
69 annot_type = tabsplitline[2]
70 print("ANNOT TYPE", annot_type)
71 if annot_type not in delimiters["genes"]["annot_types"]:
72 delimiters["genes"]["annot_types"].append(annot_type.lower())
73 splitline = line.split(user_gene_name)
74 before_delimiter = splitline[0]
75 for item in punctuation:
76 if item in before_delimiter:
77 if len(before_delimiter.split(item)[-1]) >= 5:
78 before_delimiter = before_delimiter.split(item)[-1]
79 after_delimiter = splitline[1][0]
80 if (
81 before_delimiter not in delimiters["genes"]["before"]
82 and len(before_delimiter) >= 5
83 ):
84 delimiters["genes"]["before"].append(before_delimiter)
85 if after_delimiter not in delimiters["genes"]["after"]:
86 if after_delimiter in punctuation:
87 delimiters["genes"]["after"].append(after_delimiter)
88
89 print("delimeters[genes]", delimiters["transcripts"]["annot_types"])
90
91 for line in fasta_file:
92 if user_transcript_id in line:
93 splitline = line.split(user_transcript_id)
94 before_delimiter = splitline[0]
95 for item in punctuation:
96 if item in before_delimiter:
97 if len(before_delimiter.split(item)[1]) >= 5:
98 before_delimiter = before_delimiter.split(item)[1]
99 after_delimiter = splitline[1][0]
100 if (
101 before_delimiter not in delimiters["transcripts"]["before"]
102 and len(before_delimiter) >= 5
103 ):
104 delimiters["transcripts"]["before"].append(before_delimiter)
105 if after_delimiter not in delimiters["transcripts"]["after"]:
106 delimiters["transcripts"]["after"].append(after_delimiter)
107 fasta_file.close()
108 annotation_file.close()
109
110
111 if delimiters["transcripts"]["before"] == []:
112 print(
113 "ERROR: No transcript_id with the name {} could be found in the annotation file".format(
114 user_transcript_id
115 )
116 )
117 sys.exit()
118 if delimiters["genes"]["before"] == []:
119 print(
120 "ERROR: No gene with the name {} could be found in the annotation file".format(
121 user_gene_name
122 )
123 )
124 sys.exit()
125
126 master_dict = {}
127 coding_dict = {}
128 notinfasta = open("notinfasta.csv", "w")
129
130 # Given a nucleotide sequence returns the positions of all start and stop codons.
131 def get_start_stops(transcript_sequence):
132 transcript_sequence = transcript_sequence.upper()
133 start_codons = ["ATG"]
134 stop_codons = ["TAA", "TAG", "TGA"]
135 seq_frames = {"starts": [], "stops": []}
136 for codons, positions in ((start_codons, "starts"), (stop_codons, "stops")):
137 if len(codons) > 1:
138 pat = re.compile("|".join(codons))
139 else:
140 pat = re.compile(codons[0])
141 for m in re.finditer(pat, transcript_sequence):
142 # Increment position by 1, Frame 1 starts at position 1 not 0,
143 # if it's a stop codon add another 2 so it points to the last nuc of the codon
144 if positions == "starts":
145 start = m.start() + 1
146 else:
147 start = m.start() + 3
148 seq_frames[positions].append(start)
149 return seq_frames
150
151
152 # parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons.
153 fasta_file = open(sys.argv[3], "r")
154 read_fasta = fasta_file.read()
155 split_fasta = read_fasta.split(">")
156 for entry in split_fasta[1:]:
157 newline_split = entry.split("\n")
158 tran = newline_split[0]
159 for item in delimiters["transcripts"]["after"]:
160 if item in tran:
161 tran = tran.split(item)[0]
162 tran = tran.replace("-", "_").replace("(", "").replace(")", "")
163 seq = "".join(newline_split[1:])
164 if "_PAR_Y" in tran:
165 tran += "_chrY"
166 elif "_PAR_X" in tran:
167 tran += "_chrX"
168 tran = tran.upper()
169 starts_stops = get_start_stops(seq)
170 print("tran", tran)
171 if tran not in master_dict:
172 master_dict[tran] = {
173 "utr": [],
174 "cds": [],
175 "exon": [],
176 "start_codon": [],
177 "stop_codon": [],
178 "start_list": starts_stops["starts"],
179 "stop_list": starts_stops["stops"],
180 "transcript": [],
181 "strand": "",
182 "gene_name": "",
183 "chrom": "",
184 "seq": seq,
185 "cds_start": "NULL",
186 "cds_stop": "NULL",
187 "length": len(seq),
188 "principal": 0,
189 "version": "NULL",
190 }
191
192
193 def to_ranges(iterable):
194 tup_list = []
195 iterable = sorted(set(iterable))
196 for key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]):
197 group = list(group)
198 tup_list.append((group[0][1], group[-1][1]))
199 return tup_list
200
201
202 # parse annotation file to get chromsome, exon location and CDS info for each transcript
203 def parse_gtf_file(annotation_file):
204 for line in annotation_file:
205 if line == "\n":
206 continue
207 if line[0] != "#":
208 splitline = (line.replace("\n", "")).split("\t")
209 chrom = splitline[0]
210 try:
211 annot_type = splitline[2].lower()
212 except:
213 print(
214 "ERROR tried to index to second item in splitline: ",
215 splitline,
216 line,
217 )
218 sys.exit()
219 # if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]:
220 # continue
221 if (
222 annot_type not in delimiters["transcripts"]["annot_types"]
223 and annot_type not in delimiters["genes"]["annot_types"]
224 ):
225 continue
226 if annot_type == "five_prime_utr" or annot_type == "three_prime_utr":
227 annot_type = "utr"
228 strand = splitline[6]
229 if strand == "+":
230 start = int(splitline[3])
231 end = int(splitline[4])
232 else:
233 start = int(splitline[3]) + 1
234 end = int(splitline[4]) + 1
235 desc = splitline[8]
236 tran = desc
237 gene = desc
238 for item in delimiters["transcripts"]["before"]:
239 if item in tran:
240 tran = tran.split(item)[1]
241 for item in delimiters["transcripts"]["after"]:
242 if item in tran:
243 tran = tran.split(item)[0]
244 if "." in tran and TRAN_VERSION == True:
245 # print ("raw tran",tran)
246 tran = tran.split(".")
247 version = int(tran[-1].split("_")[0])
248 tran = tran[0]
249 else:
250 version = "NULL"
251 tran = tran.replace("-", "_").replace(".", "_")
252 tran = tran.replace("(", "").replace(")", "")
253 tran = tran.replace(" ", "").replace("\t", "")
254 tran = tran.upper()
255 tran = tran.replace("GENE_", "").replace("ID_", "")
256 if "_PAR_Y" in desc:
257 # print ("adding _PAR_Y to tran")
258 tran = tran + "_PAR_Y"
259 # print ("New tran ", tran)
260 # if "PAR_Y" in line:
261 # print (line)
262 # #sys.exit()
263 # print ("tran",tran,version)
264 # if tran == "ENST00000316448":
265 # print ("annot type",annot_type)
266 # print ("appending exon to tran", start, end,line)
267
268 gene_found = False
269
270 if annot_type in delimiters["genes"]["annot_types"]:
271 for item in delimiters["genes"]["before"]:
272 if item in gene:
273 gene_found = True
274 gene = gene.split(item)[1]
275 for item in delimiters["genes"]["after"]:
276 if item in gene:
277 gene = gene.split(item)[0]
278 gene = gene.replace("'", "''")
279 gene = gene.replace("GENE_", "")
280 gene = gene.replace("ID_", "")
281 gene = gene.upper()
282 if tran in master_dict:
283 master_dict[tran]["strand"] = strand
284 if strand == "+":
285 if annot_type in master_dict[tran]:
286 master_dict[tran][annot_type].append((start, end))
287 else:
288 if annot_type in master_dict[tran]:
289 master_dict[tran][annot_type].append((start, end))
290 master_dict[tran]["chrom"] = chrom
291 master_dict[tran]["version"] = version
292 if gene_found == True:
293 master_dict[tran]["gene_name"] = gene
294 else:
295 notinfasta.write("{}\n".format(tran))
296
297
298 annotation_file = open(sys.argv[2], "r")
299 parse_gtf_file(annotation_file)
300
301
302 # remove transcripts that were in fasta file but not in annotation_file
303 notinannotation = []
304 for tran in master_dict:
305 if master_dict[tran]["chrom"] == "":
306 # print ("tran {} has no chrom :(".format(tran))
307 notinannotation.append(tran)
308 for tran in notinannotation:
309 print(tran)
310 del master_dict[tran]
311 # Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True
312 coding_genes_dict = {}
313 # parse master_dict to calculate length, cds_start/stop and exon junction positions
314 for tran in master_dict:
315 try:
316 transeq = master_dict[tran]["seq"]
317 except Exception as e:
318 print("not in fasta", tran)
319 notinfasta.write("{}\n".format(tran))
320 continue
321 exon_junctions = []
322 total_length = len(transeq)
323 three_len = 1
324 five_len = 1
325 strand = master_dict[tran]["strand"]
326 if master_dict[tran]["gene_name"] == "":
327 master_dict[tran]["gene_name"] = tran
328 gene = master_dict[tran]["gene_name"]
329 if gene not in coding_genes_dict:
330 coding_genes_dict[gene] = False
331
332 if master_dict[tran]["cds"] == []:
333 tran_type = "noncoding"
334 cds_start = "NULL"
335 cds_stop = "NULL"
336 else:
337 # get utr lengths from annotation
338 tran_type = "coding"
339 coding_genes_dict[gene] = True
340 sorted_exons = sorted(master_dict[tran]["exon"])
341 sorted_cds = sorted(master_dict[tran]["cds"])
342
343 min_cds = sorted_cds[0][0]
344 # Some annotation files do not have utr annotation types, so fix that here if thats the case
345 if master_dict[tran]["utr"] == []:
346 for exon_tup in master_dict[tran]["exon"]:
347 if exon_tup not in master_dict[tran]["cds"]:
348 # Now check if this overlaps with any of the CDS exons
349 overlap = False
350 for cds_tup in master_dict[tran]["cds"]:
351 if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]:
352 master_dict[tran]["utr"].append((cds_tup[1], exon_tup[1]))
353 overlap = True
354 if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]:
355 master_dict[tran]["utr"].append((exon_tup[0], cds_tup[0]))
356 overlap = True
357 if overlap == False:
358 master_dict[tran]["utr"].append(exon_tup)
359
360 for tup in sorted(master_dict[tran]["utr"]):
361 if tup[0] < min_cds:
362 five_len += (tup[1] - tup[0]) + 1
363 elif tup[0] > min_cds:
364 three_len += (tup[1] - tup[0]) + 1
365 else:
366 pass
367 if strand == "+":
368 if len(sorted_exons) > 1:
369 sorted_exons[0] = (
370 sorted_exons[0][0] - pseudo_utr_len,
371 sorted_exons[0][1],
372 )
373 sorted_exons[-1] = (
374 sorted_exons[-1][0],
375 sorted_exons[-1][1] + pseudo_utr_len,
376 )
377 else:
378 sorted_exons[0] = (
379 sorted_exons[0][0] - pseudo_utr_len,
380 sorted_exons[0][1] + pseudo_utr_len,
381 )
382 master_dict[tran]["exon"] = sorted_exons
383 cds_start = five_len + pseudo_utr_len
384 cds_stop = ((total_length - three_len) - pseudo_utr_len) + 4
385 elif strand == "-":
386 if len(sorted_exons) > 1:
387 sorted_exons[0] = (
388 (sorted_exons[0][0] - pseudo_utr_len),
389 sorted_exons[0][1],
390 )
391 sorted_exons[-1] = (
392 sorted_exons[-1][0],
393 (sorted_exons[-1][1] + pseudo_utr_len),
394 )
395 else:
396 sorted_exons[0] = (
397 (sorted_exons[0][0] - pseudo_utr_len),
398 (sorted_exons[0][1] + pseudo_utr_len),
399 )
400 master_dict[tran]["exon"] = sorted_exons
401 cds_start = three_len + pseudo_utr_len
402 cds_stop = ((total_length - (five_len)) - pseudo_utr_len) + 4
403 # if tran == "ENST00000381401":
404 # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len)
405 # #sys.exit()
406 else:
407 print("strand is unknown: {}".format(strand))
408 sys.exit()
409 # get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop
410 total_intronic = 0
411 try:
412 if strand == "+":
413 tx_start = min(sorted(master_dict[tran]["exon"]))[0]
414 prev_end = tx_start
415 for tup in sorted(master_dict[tran]["exon"])[:-1]:
416 total_intronic += tup[0] - prev_end
417 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
418 prev_end = tup[1]
419 elif strand == "-":
420 tx_start = max(sorted(master_dict[tran]["exon"]))[-1]
421 prev_end = tx_start
422 for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]:
423 total_intronic += (tup[0] + 1) - prev_end
424 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
425 prev_end = tup[1]
426 except:
427 if strand == "+":
428 tx_start = min(sorted(master_dict[tran]["cds"]))[0]
429 prev_end = tx_start
430 for tup in sorted(master_dict[tran]["cds"])[:-1]:
431 total_intronic += tup[0] - prev_end
432 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
433 prev_end = tup[1]
434 elif strand == "-":
435 tx_start = max(sorted(master_dict[tran]["cds"]))[-1]
436 prev_end = tx_start
437 for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]:
438 total_intronic += (tup[0] + 1) - prev_end
439 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
440 prev_end = tup[1]
441 # This can happen when a coding transcript doesn't have a properly annotated 3' trailer
442 if cds_stop != "NULL":
443 if cds_stop > total_length:
444 cds_stop = total_length
445 if strand == "+" and cds_start != "NULL":
446 master_dict[tran]["cds_start"] = cds_start
447 master_dict[tran]["cds_stop"] = cds_stop
448 elif strand == "-" and cds_start != "NULL":
449 master_dict[tran]["cds_start"] = cds_start
450 master_dict[tran]["cds_stop"] = cds_stop
451
452 master_dict[tran]["strand"] = strand
453 master_dict[tran]["tran_type"] = tran_type
454 master_dict[tran]["exon_junctions"] = exon_junctions
455
456 longest_tran_dict = {}
457 for tran in master_dict:
458 try:
459 gene = master_dict[tran]["gene_name"]
460 except:
461 continue
462 if coding_genes_dict[gene] == True:
463 if "cds_start" in master_dict[tran]:
464 if (
465 master_dict[tran]["cds_stop"] != "NULL"
466 and master_dict[tran]["cds_start"] != "NULL"
467 ):
468 cds_length = (
469 master_dict[tran]["cds_stop"] - master_dict[tran]["cds_start"]
470 )
471 if gene not in longest_tran_dict:
472 longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
473 else:
474 if cds_length > longest_tran_dict[gene]["length"]:
475 longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
476 if cds_length == longest_tran_dict[gene]["length"]:
477 if (
478 master_dict[tran]["length"]
479 > master_dict[longest_tran_dict[gene]["tran"]]["length"]
480 ):
481 longest_tran_dict[gene] = {
482 "tran": tran,
483 "length": cds_length,
484 }
485 else:
486 length = master_dict[tran]["length"]
487 if gene not in longest_tran_dict:
488 longest_tran_dict[gene] = {"tran": tran, "length": length}
489 elif length > longest_tran_dict[gene]["length"]:
490 longest_tran_dict[gene] = {"tran": tran, "length": length}
491
492
493 for gene in longest_tran_dict:
494 longest_tran = longest_tran_dict[gene]["tran"]
495 master_dict[longest_tran]["principal"] = 1
496
497 gene_sample = []
498 for key in list(master_dict)[:10]:
499 try:
500 gene_sample.append(master_dict[key]["gene_name"])
501 except:
502 pass
503 print(master_dict)
504 print("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10]))
505 print("Here is a sample of the gene names: {}".format(gene_sample))
506
507
508 # Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time.
509 def tran_to_genome(tran, start_pos, end_pos, master_dict):
510 pos_list = []
511 for i in range(start_pos, end_pos + 1):
512 pos_list.append(i)
513 genomic_pos_list = []
514 if tran in master_dict:
515 transcript_info = master_dict[tran]
516 else:
517 return ("Null", [])
518
519 chrom = transcript_info["chrom"]
520 strand = transcript_info["strand"]
521 exons = sorted(transcript_info["exon"])
522 # print ("chrom,strand,exons",chrom,strand,exons)
523 for pos in pos_list:
524 # print ("pos",pos)
525 if strand == "+":
526 exon_start = 0
527 for tup in exons:
528 # print ("tup",tup)
529 exon_start = tup[0]
530 exonlen = tup[1] - tup[0]
531 if pos > exonlen:
532 pos = (pos - exonlen) - 1
533 else:
534 break
535 # print ("appending exon_start-pos", exon_start, pos, exon_start+pos)
536 genomic_pos_list.append((exon_start + pos) - 1)
537 elif strand == "-":
538 exon_start = 0
539 for tup in exons[::-1]:
540 # print ("tup",tup)
541 exon_start = tup[1]
542 exonlen = tup[1] - tup[0]
543 # print ("exonlen",exonlen)
544 if pos > exonlen:
545 # print ("pos is greater")
546 pos = (pos - exonlen) - 1
547 # print ("new pos",pos)
548 else:
549 break
550 # print ("appending exon_start-pos", exon_start, pos, exon_start-pos)
551 genomic_pos_list.append((exon_start - pos) + 1)
552 return (chrom, genomic_pos_list)
553
554
555 orf_dict = {
556 "uorf": {},
557 "ouorf": {},
558 "cds": {},
559 "nested": {},
560 "odorf": {},
561 "dorf": {},
562 "minusone": {},
563 "readthrough": {},
564 "plusone": {},
565 "noncoding": {},
566 "extension": {},
567 "inframe_stop": {},
568 }
569
570 start_codons = ["ATG", "CTG", "GTG", "TTG", "ATC", "ATA", "ATT", "ACG", "AAG", "AGG"]
571
572 stop_codons = ["TAG", "TAA", "TGA"]
573
574
575 # Keep track of the longest transcript for each noncoding gene, append this to transcript list later
576 longest_noncoding = {}
577
578
579 tran_count = 0
580 # This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those
581 # in the transcript list)
582 genomic_cds_dict = {}
583 tree_dict = {}
584 for transcript in master_dict:
585 # print (transcript, master_dict[transcript]["tran_type"])
586 tran_count += 1
587 if "seq" not in master_dict[transcript]:
588 continue
589 chrom = master_dict[transcript]["chrom"]
590 if chrom not in genomic_cds_dict:
591 genomic_cds_dict[chrom] = []
592 if "cds_start" in master_dict[transcript]:
593 cds_start = master_dict[transcript]["cds_start"]
594 cds_stop = master_dict[transcript]["cds_stop"]
595 if cds_start != "NULL":
596 cds_pos = []
597 for i in range(cds_start, cds_stop + 1):
598 cds_pos.append(i)
599
600 for tup in master_dict[transcript]["cds"]:
601 if tup[0] != tup[1]:
602 if tup not in genomic_cds_dict[chrom]:
603 genomic_cds_dict[chrom].append(tup)
604
605 print("genomic cds dict built")
606 print(list(genomic_cds_dict))
607 for chrom in genomic_cds_dict:
608 tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom])
609
610 # print (list(tree_dict))
611
612
613 connection = sqlite3.connect("{}.sqlite".format(organism))
614 cursor = connection.cursor()
615 cursor.execute(
616 "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));"
617 )
618 cursor.execute(
619 "CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));"
620 )
621 cursor.execute(
622 "CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));"
623 )
624 cursor.execute(
625 "CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
626 )
627 cursor.execute(
628 "CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
629 )
630 cursor.execute(
631 "CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
632 )
633 cursor.execute(
634 "CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
635 )
636 cursor.execute(
637 "CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
638 )
639 cursor.execute(
640 "CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
641 )
642 cursor.execute(
643 "CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
644 )
645 cursor.execute(
646 "CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
647 )
648 cursor.execute(
649 "CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
650 )
651 cursor.execute(
652 "CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
653 )
654 cursor.execute(
655 "CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
656 )
657 cursor.execute(
658 "CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
659 )
660 connection.commit()
661
662
663 print("Finding ORFs")
664 transcript_count = 0
665 total_transcripts = len(list(master_dict))
666 for transcript in master_dict:
667 # print ("transcript",transcript)
668 # if transcript != "ENST00000316448":
669 # continue
670 transcript_count += 1
671 if transcript_count % 100 == 0:
672 print("Transcripts complete: {}/{}".format(transcript_count, total_transcripts))
673 if "seq" not in master_dict[transcript]:
674 print("transcript {} has no sequence".format(transcript))
675 continue
676 seq = master_dict[transcript]["seq"]
677 cds_start = "NULL"
678 cds_stop = "NULL"
679 transcript_len = len(seq)
680 if "cds_start" in master_dict[transcript]:
681 cds_start = master_dict[transcript]["cds_start"]
682 cds_stop = master_dict[transcript]["cds_stop"]
683
684 # Find out what regions of this transcript overlap with any other coding regions
685 coding_positions = []
686 if cds_start != "NULL":
687 # If this is a coding transcript don't bother checking the CDS
688 for i in range(cds_start, cds_stop):
689 coding_positions.append(i)
690 # check 5' leader
691 chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict)
692 for i in range(0, cds_start):
693 genomic_pos = pos_list[i]
694 overlap = tree_dict[chrom][genomic_pos]
695 if len(overlap) != 0:
696 coding_positions.append(i)
697 # check 3' trailer
698 chrom, pos_list = tran_to_genome(
699 transcript, cds_stop, transcript_len, master_dict
700 )
701 for i in range(cds_stop, transcript_len + 1):
702 # print ("i",i)
703 genomic_pos = pos_list[i - cds_stop]
704 # print ("genomic position",genomic_pos)
705 overlap = tree_dict[chrom][genomic_pos]
706 if len(overlap) != 0:
707 # print ("overlap not empty appending i",overlap)
708 coding_positions.append(i)
709 else:
710 # check entire transcript
711 chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict)
712 for i in range(0, transcript_len):
713 genomic_pos = pos_list[i]
714 overlap = tree_dict[chrom][genomic_pos]
715 if len(overlap) != 0:
716 coding_positions.append(i)
717 coding_positions_tuple = to_ranges(coding_positions)
718 coding_dict[transcript] = coding_positions_tuple
719 coding_positions = set(coding_positions)
720 # if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates
721 if cds_start != "NULL":
722 # print ("transcript", transcript)
723 # if pseudo_utr_len != 0:
724 # cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null
725 recoding_dict = {2: "minusone", 0: "readthrough", 1: "plusone"}
726 for addition in recoding_dict:
727 orftype = recoding_dict[addition]
728 for i in range(cds_stop + addition, transcript_len, 3):
729 if seq[i : i + 3] in stop_codons:
730 # orf_seq = seq[cds_stop:i+3]
731 orf_start = cds_stop
732 if orftype == "readthrough":
733 orf_start -= 2
734 if orftype == "plusone":
735 orf_start -= 1
736 orf_stop = i + 3 # +2 so it refers to the end of the stop codon
737 start_codon = None
738 length = (i + 3) - cds_stop
739 orf_pos_list = []
740 # determine how many nucleotides in this orf overlap with an annotated coding region
741 cds_cov_count = 0.0
742 for position in range(orf_start, orf_stop):
743 orf_pos_list.append(position)
744 for pos in range(orf_start, orf_stop + 1):
745 if pos in coding_positions:
746 cds_cov_count += 1
747 cds_cov = cds_cov_count / length
748 # print ("orftype, start, stop", orftype, orf_start, orf_stop)
749 cursor.execute(
750 "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format(
751 orftype,
752 transcript,
753 start_codon,
754 length,
755 orf_start,
756 orf_stop,
757 cds_cov,
758 )
759 )
760 break
761 # sys.exit()
762 for frame in [0, 1, 2]:
763 for i in range(frame, transcript_len, 3):
764 if seq[i : i + 3] in start_codons:
765 for x in range(i, transcript_len, 3):
766 if seq[x : x + 3] in stop_codons:
767 # orf_seq = seq[i:x+3]
768 orf_start = i + 1
769 orf_stop = x + 3 # +2 so it refers to the end of the stop codon
770 start_codon = seq[i : i + 3]
771 length = (x + 3) - i
772 orf_pos_list = []
773 # determine how many nucleotides in this orf overlap with an annotated coding region
774 cds_cov_count = 0.0
775 for pos in range(orf_start, orf_stop + 1):
776 if pos in coding_positions:
777 cds_cov_count += 1
778 cds_cov = float(cds_cov_count) / float(length)
779 # Now determine orf type
780 if cds_start == "NULL":
781 orftype = "noncoding"
782 else:
783 # print ("cds start is not null :{}:{}".format(cds_start,cds_stop))
784 # print ("orf start, orf stop", orf_start, orf_stop)
785 if orf_start == cds_start and orf_stop == cds_stop:
786 orftype = "cds"
787 # print ("orf type is cds")
788 elif orf_start < cds_start and orf_stop == cds_stop:
789 orftype = "extension"
790 # special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage
791 orf_stop = cds_start
792 cds_cov_count = 0.0
793 for pos in range(orf_start, orf_stop + 1):
794 if pos in coding_positions:
795 cds_cov_count += 1
796 cds_cov = float(cds_cov_count) / float(length)
797 # orf_seq = seq[orf_start:cds_start]
798 length = cds_start - orf_start
799 # print ("orf type is extension")
800 elif orf_start < cds_start and orf_stop <= cds_start:
801 orftype = "uorf"
802 # print ("orf type is uorf")
803 elif orf_start < cds_start and orf_stop > cds_start:
804 orftype = "ouorf"
805 # print ("orf type is ouorf")
806 # sys.exit()
807 elif (
808 orf_start >= cds_start
809 and orf_start <= cds_stop
810 and orf_stop <= cds_stop
811 ):
812 if orf_stop == cds_stop:
813 break
814 # print ("Tran, cds_start, cds_stop, orf_start, orf_stop, tranlen",tran, cds_start, cds_stop, orf_start, orf_stop,transcript_len)
815 if (
816 orf_stop < transcript_len
817 and orf_stop % 3 == cds_stop % 3
818 ) or (
819 cds_start != 1
820 and orf_stop % 3 == (cds_start + 2) % 3
821 ):
822 # print ("Transcript {} has an inframe stop codon".format(transcript))
823 break
824 orftype = "nested"
825 # print ("orf type is nested")
826 elif (
827 orf_start >= cds_start
828 and orf_start <= cds_stop
829 and orf_stop > cds_stop
830 ):
831 orftype = "odorf"
832 # print ("orftype is odorf")
833 elif orf_start > cds_stop and orf_stop > cds_stop:
834 orftype = "dorf"
835 # print ("orftype is dorf")
836 # if orf_stop > cds_start and orf_stop < cds_stop:
837 # if (orf_stop+1)%3 == cds_start%3:
838 # orftype = "inframe_stop"
839 # print ("inframe stop, transcript, orf_stop", transcript, orf_stop)
840 # sys.exit()
841 # if transcript not in orf_dict:
842 # orf_dict[orftype][transcript] = []
843 # #print ("some weird stop or something")
844 cursor.execute(
845 "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format(
846 orftype,
847 transcript,
848 start_codon,
849 length,
850 orf_start,
851 orf_stop,
852 cds_cov,
853 )
854 )
855 break
856 # Used to keep track of the codons at cds_start and cds_stop positions,
857 # If there is an issue with the cds co-ordinates the starts and stops counts will
858 # be much lower than the other count, start with 1 to prevent division by 0
859 nuc_dict = {"stops": {"stops": 1, "other": 0}, "starts": {"starts": 1, "other": 0}}
860
861
862 def calcgc(seq):
863 if seq == "":
864 return "NULL"
865 g_count = 0
866 c_count = 0
867 a_count = 0
868 t_count = 0
869 for char in seq:
870 if char == "A":
871 a_count += 1
872 if char == "T":
873 t_count += 1
874 if char == "G":
875 g_count += 1
876 if char == "C":
877 c_count += 1
878 gc = ((g_count + c_count) / float(len(seq))) * 100
879 return round(gc, 2)
880
881
882 for transcript in master_dict:
883 # print ("transcripts", transcript)
884 length = master_dict[transcript]["length"]
885 cds_start = master_dict[transcript]["cds_start"]
886 cds_stop = master_dict[transcript]["cds_stop"]
887 seq = master_dict[transcript]["seq"]
888 strand = master_dict[transcript]["strand"]
889 chrom = master_dict[transcript]["chrom"]
890 gene = master_dict[transcript]["gene_name"]
891 gc = calcgc(seq)
892 five_gc = "NULL"
893 cds_gc = "NULL"
894 three_gc = "NULL"
895 if cds_start != "NULL":
896 five_gc = calcgc(seq[:cds_start])
897 cds_gc = calcgc(seq[cds_start:cds_stop])
898 three_gc = calcgc(seq[cds_stop:])
899 # check that the nucleotide cds_start points to is the first of the start codon
900 # take one becase cds_start is 1 based but python indexing is 0 based
901 start_nuc = seq[cds_start - 1 : cds_start + 2]
902 # print ("start nuc",start_nuc)
903 if start_nuc == "ATG":
904 nuc_dict["starts"]["starts"] += 1
905 else:
906 nuc_dict["starts"]["other"] += 1
907 stop_nuc = seq[cds_stop - 3 : cds_stop]
908 # print ("stop_nuc",stop_nuc)
909 if stop_nuc in ["TAG", "TAA", "TGA"]:
910 nuc_dict["stops"]["stops"] += 1
911 else:
912 nuc_dict["stops"]["other"] += 1
913 tran_type = master_dict[transcript]["tran_type"]
914 if coding_genes_dict[gene] == True:
915 gene_type = 1
916 else:
917 gene_type = 0
918 # print ("tran type before",tran_type)
919 if tran_type == "coding":
920 tran_type = 1
921 else:
922 tran_type = 0
923 # print ("tran type after",tran_type)
924 start_list = str(master_dict[transcript]["start_list"]).replace(" ", "").strip("[]")
925 stop_list = str(master_dict[transcript]["stop_list"]).replace(" ", "").strip("[]")
926 exon_junctions = (
927 str(master_dict[transcript]["exon_junctions"]).replace(" ", "").strip("[]")
928 )
929 principal = master_dict[transcript]["principal"]
930 version = master_dict[transcript]["version"]
931 # print (master_dict[transcript])
932 # print (tran_type)
933 # print (gene_type)
934 # print (principal)
935 # print (version)
936 # print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version))
937 cursor.execute(
938 "INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format(
939 transcript,
940 gene,
941 length,
942 cds_start,
943 cds_stop,
944 seq,
945 strand,
946 stop_list,
947 start_list,
948 exon_junctions,
949 tran_type,
950 gene_type,
951 principal,
952 version,
953 gc,
954 five_gc,
955 cds_gc,
956 three_gc,
957 chrom,
958 )
959 )
960
961 for tup in master_dict[transcript]["exon"]:
962 cursor.execute(
963 "INSERT INTO exons VALUES('{}',{},{});".format(transcript, tup[0], tup[1])
964 )
965 if transcript in coding_dict:
966 for tup in coding_dict[transcript]:
967 cursor.execute(
968 "INSERT INTO coding_regions VALUES('{}',{},{});".format(
969 transcript, tup[0], tup[1]
970 )
971 )
972
973 connection.commit()
974 connection.close()
975
976 print("delim", delimiters)
977 if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05:
978 print(
979 "Warning: {} transcripts do not have a an AUG at the CDS start position".format(
980 nuc_dict["starts"]["other"]
981 )
982 )
983 if (nuc_dict["stops"]["other"] / nuc_dict["stops"]["stops"]) > 0.05:
984 print(
985 "Warning: {} transcripts do not have a an stop codon at the CDS stop position".format(
986 nuc_dict["stops"]["other"]
987 )
988 )
989 if len(notinannotation) > 0:
990 print(
991 "Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(
992 len(notinannotation)
993 )
994 )