Mercurial > repos > jackcurragh > trips_viz_create_annotation
comparison trips_create_annotation/create_annotation_sqlite.py @ 0:f24aed7a5cc7 draft
Uploaded
| author | jackcurragh |
|---|---|
| date | Mon, 04 Apr 2022 09:30:51 +0000 |
| parents | |
| children | d70696d3341e |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f24aed7a5cc7 |
|---|---|
| 1 # Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file | |
| 2 # and produces an sqlite file which can be uploaded to Trips-Viz | |
| 3 # All co-ordinates produced are 1 based | |
| 4 # All start codon positions (including cds_start) should be at the first nucleotide of the codon | |
| 5 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon | |
| 6 import sys | |
| 7 import re | |
| 8 import sqlite3 | |
| 9 from intervaltree import Interval, IntervalTree | |
| 10 import itertools | |
| 11 from sqlitedict import SqliteDict | |
| 12 import os | |
| 13 | |
| 14 organism = sys.argv[1] | |
| 15 # This should be a GTF or GFF3 file | |
| 16 annotation_file = open(sys.argv[2], "r") | |
| 17 # This needs to be the transcriptomic fasta file | |
| 18 fasta_file = open(sys.argv[3], "r") | |
| 19 # This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs | |
| 20 pseudo_utr_len = int(sys.argv[4]) | |
| 21 # An example of a transcript_id from the annotation file, e.g ENST000000123456 | |
| 22 user_transcript_id = sys.argv[5] | |
| 23 # An example of a gene name from the annotation file | |
| 24 user_gene_name = sys.argv[6] | |
| 25 # Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1 | |
| 26 TRAN_VERSION = True | |
| 27 | |
| 28 | |
| 29 if os.path.isfile("{}.sqlite".format(organism)): | |
| 30 print("{}.sqlite already exists".format(organism)) | |
| 31 sys.exit() | |
| 32 | |
| 33 | |
| 34 # old_exons = SqliteDict( | |
| 35 # "/home/DATA/www/tripsviz/tripsviz/trips_annotations/mus_musculus/transcriptomic_to_genomic.sqlite" | |
| 36 # ) | |
| 37 | |
| 38 | |
| 39 delimiters = { | |
| 40 "transcripts": {"before": [], "after": ["."], "annot_types": ["cds", "utr"]}, | |
| 41 "genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]}, | |
| 42 } | |
| 43 | |
| 44 punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"] | |
| 45 # Find delimiters in the annotation and fasta files using the user_transcript_id | |
| 46 # and user_gene_name examples given by user. | |
| 47 for line in annotation_file: | |
| 48 if user_transcript_id in line: | |
| 49 tabsplitline = line.split("\t") | |
| 50 annot_type = tabsplitline[2] | |
| 51 if annot_type not in delimiters["transcripts"]["annot_types"]: | |
| 52 delimiters["transcripts"]["annot_types"].append(annot_type.lower()) | |
| 53 splitline = line.split(user_transcript_id) | |
| 54 before_delimiter = splitline[0] | |
| 55 for item in punctuation: | |
| 56 if item in before_delimiter: | |
| 57 if len(before_delimiter.split(item)[-1]) >= 5: | |
| 58 before_delimiter = before_delimiter.split(item)[-1] | |
| 59 after_delimiter = splitline[1][:2] | |
| 60 if ( | |
| 61 before_delimiter not in delimiters["transcripts"]["before"] | |
| 62 and len(before_delimiter) >= 5 | |
| 63 ): | |
| 64 delimiters["transcripts"]["before"].append(before_delimiter) | |
| 65 if after_delimiter not in delimiters["transcripts"]["after"]: | |
| 66 delimiters["transcripts"]["after"].append(after_delimiter) | |
| 67 if user_gene_name in line: | |
| 68 tabsplitline = line.split("\t") | |
| 69 annot_type = tabsplitline[2] | |
| 70 print("ANNOT TYPE", annot_type) | |
| 71 if annot_type not in delimiters["genes"]["annot_types"]: | |
| 72 delimiters["genes"]["annot_types"].append(annot_type.lower()) | |
| 73 splitline = line.split(user_gene_name) | |
| 74 before_delimiter = splitline[0] | |
| 75 for item in punctuation: | |
| 76 if item in before_delimiter: | |
| 77 if len(before_delimiter.split(item)[-1]) >= 5: | |
| 78 before_delimiter = before_delimiter.split(item)[-1] | |
| 79 after_delimiter = splitline[1][0] | |
| 80 if ( | |
| 81 before_delimiter not in delimiters["genes"]["before"] | |
| 82 and len(before_delimiter) >= 5 | |
| 83 ): | |
| 84 delimiters["genes"]["before"].append(before_delimiter) | |
| 85 if after_delimiter not in delimiters["genes"]["after"]: | |
| 86 if after_delimiter in punctuation: | |
| 87 delimiters["genes"]["after"].append(after_delimiter) | |
| 88 | |
| 89 print("delimeters[genes]", delimiters["transcripts"]["annot_types"]) | |
| 90 | |
| 91 for line in fasta_file: | |
| 92 if user_transcript_id in line: | |
| 93 splitline = line.split(user_transcript_id) | |
| 94 before_delimiter = splitline[0] | |
| 95 for item in punctuation: | |
| 96 if item in before_delimiter: | |
| 97 if len(before_delimiter.split(item)[1]) >= 5: | |
| 98 before_delimiter = before_delimiter.split(item)[1] | |
| 99 after_delimiter = splitline[1][0] | |
| 100 if ( | |
| 101 before_delimiter not in delimiters["transcripts"]["before"] | |
| 102 and len(before_delimiter) >= 5 | |
| 103 ): | |
| 104 delimiters["transcripts"]["before"].append(before_delimiter) | |
| 105 if after_delimiter not in delimiters["transcripts"]["after"]: | |
| 106 delimiters["transcripts"]["after"].append(after_delimiter) | |
| 107 fasta_file.close() | |
| 108 annotation_file.close() | |
| 109 | |
| 110 | |
| 111 if delimiters["transcripts"]["before"] == []: | |
| 112 print( | |
| 113 "ERROR: No transcript_id with the name {} could be found in the annotation file".format( | |
| 114 user_transcript_id | |
| 115 ) | |
| 116 ) | |
| 117 sys.exit() | |
| 118 if delimiters["genes"]["before"] == []: | |
| 119 print( | |
| 120 "ERROR: No gene with the name {} could be found in the annotation file".format( | |
| 121 user_gene_name | |
| 122 ) | |
| 123 ) | |
| 124 sys.exit() | |
| 125 | |
| 126 master_dict = {} | |
| 127 coding_dict = {} | |
| 128 notinfasta = open("notinfasta.csv", "w") | |
| 129 | |
| 130 # Given a nucleotide sequence returns the positions of all start and stop codons. | |
| 131 def get_start_stops(transcript_sequence): | |
| 132 transcript_sequence = transcript_sequence.upper() | |
| 133 start_codons = ["ATG"] | |
| 134 stop_codons = ["TAA", "TAG", "TGA"] | |
| 135 seq_frames = {"starts": [], "stops": []} | |
| 136 for codons, positions in ((start_codons, "starts"), (stop_codons, "stops")): | |
| 137 if len(codons) > 1: | |
| 138 pat = re.compile("|".join(codons)) | |
| 139 else: | |
| 140 pat = re.compile(codons[0]) | |
| 141 for m in re.finditer(pat, transcript_sequence): | |
| 142 # Increment position by 1, Frame 1 starts at position 1 not 0, | |
| 143 # if it's a stop codon add another 2 so it points to the last nuc of the codon | |
| 144 if positions == "starts": | |
| 145 start = m.start() + 1 | |
| 146 else: | |
| 147 start = m.start() + 3 | |
| 148 seq_frames[positions].append(start) | |
| 149 return seq_frames | |
| 150 | |
| 151 | |
| 152 # parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons. | |
| 153 fasta_file = open(sys.argv[3], "r") | |
| 154 read_fasta = fasta_file.read() | |
| 155 split_fasta = read_fasta.split(">") | |
| 156 for entry in split_fasta[1:]: | |
| 157 newline_split = entry.split("\n") | |
| 158 tran = newline_split[0] | |
| 159 for item in delimiters["transcripts"]["after"]: | |
| 160 if item in tran: | |
| 161 tran = tran.split(item)[0] | |
| 162 tran = tran.replace("-", "_").replace("(", "").replace(")", "") | |
| 163 seq = "".join(newline_split[1:]) | |
| 164 if "_PAR_Y" in tran: | |
| 165 tran += "_chrY" | |
| 166 elif "_PAR_X" in tran: | |
| 167 tran += "_chrX" | |
| 168 tran = tran.upper() | |
| 169 starts_stops = get_start_stops(seq) | |
| 170 print("tran", tran) | |
| 171 if tran not in master_dict: | |
| 172 master_dict[tran] = { | |
| 173 "utr": [], | |
| 174 "cds": [], | |
| 175 "exon": [], | |
| 176 "start_codon": [], | |
| 177 "stop_codon": [], | |
| 178 "start_list": starts_stops["starts"], | |
| 179 "stop_list": starts_stops["stops"], | |
| 180 "transcript": [], | |
| 181 "strand": "", | |
| 182 "gene_name": "", | |
| 183 "chrom": "", | |
| 184 "seq": seq, | |
| 185 "cds_start": "NULL", | |
| 186 "cds_stop": "NULL", | |
| 187 "length": len(seq), | |
| 188 "principal": 0, | |
| 189 "version": "NULL", | |
| 190 } | |
| 191 | |
| 192 | |
| 193 def to_ranges(iterable): | |
| 194 tup_list = [] | |
| 195 iterable = sorted(set(iterable)) | |
| 196 for key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]): | |
| 197 group = list(group) | |
| 198 tup_list.append((group[0][1], group[-1][1])) | |
| 199 return tup_list | |
| 200 | |
| 201 | |
| 202 # parse annotation file to get chromsome, exon location and CDS info for each transcript | |
| 203 def parse_gtf_file(annotation_file): | |
| 204 for line in annotation_file: | |
| 205 if line == "\n": | |
| 206 continue | |
| 207 if line[0] != "#": | |
| 208 splitline = (line.replace("\n", "")).split("\t") | |
| 209 chrom = splitline[0] | |
| 210 try: | |
| 211 annot_type = splitline[2].lower() | |
| 212 except: | |
| 213 print( | |
| 214 "ERROR tried to index to second item in splitline: ", | |
| 215 splitline, | |
| 216 line, | |
| 217 ) | |
| 218 sys.exit() | |
| 219 # if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]: | |
| 220 # continue | |
| 221 if ( | |
| 222 annot_type not in delimiters["transcripts"]["annot_types"] | |
| 223 and annot_type not in delimiters["genes"]["annot_types"] | |
| 224 ): | |
| 225 continue | |
| 226 if annot_type == "five_prime_utr" or annot_type == "three_prime_utr": | |
| 227 annot_type = "utr" | |
| 228 strand = splitline[6] | |
| 229 if strand == "+": | |
| 230 start = int(splitline[3]) | |
| 231 end = int(splitline[4]) | |
| 232 else: | |
| 233 start = int(splitline[3]) + 1 | |
| 234 end = int(splitline[4]) + 1 | |
| 235 desc = splitline[8] | |
| 236 tran = desc | |
| 237 gene = desc | |
| 238 for item in delimiters["transcripts"]["before"]: | |
| 239 if item in tran: | |
| 240 tran = tran.split(item)[1] | |
| 241 for item in delimiters["transcripts"]["after"]: | |
| 242 if item in tran: | |
| 243 tran = tran.split(item)[0] | |
| 244 if "." in tran and TRAN_VERSION == True: | |
| 245 # print ("raw tran",tran) | |
| 246 tran = tran.split(".") | |
| 247 version = int(tran[-1].split("_")[0]) | |
| 248 tran = tran[0] | |
| 249 else: | |
| 250 version = "NULL" | |
| 251 tran = tran.replace("-", "_").replace(".", "_") | |
| 252 tran = tran.replace("(", "").replace(")", "") | |
| 253 tran = tran.replace(" ", "").replace("\t", "") | |
| 254 tran = tran.upper() | |
| 255 tran = tran.replace("GENE_", "").replace("ID_", "") | |
| 256 if "_PAR_Y" in desc: | |
| 257 # print ("adding _PAR_Y to tran") | |
| 258 tran = tran + "_PAR_Y" | |
| 259 # print ("New tran ", tran) | |
| 260 # if "PAR_Y" in line: | |
| 261 # print (line) | |
| 262 # #sys.exit() | |
| 263 # print ("tran",tran,version) | |
| 264 # if tran == "ENST00000316448": | |
| 265 # print ("annot type",annot_type) | |
| 266 # print ("appending exon to tran", start, end,line) | |
| 267 | |
| 268 gene_found = False | |
| 269 | |
| 270 if annot_type in delimiters["genes"]["annot_types"]: | |
| 271 for item in delimiters["genes"]["before"]: | |
| 272 if item in gene: | |
| 273 gene_found = True | |
| 274 gene = gene.split(item)[1] | |
| 275 for item in delimiters["genes"]["after"]: | |
| 276 if item in gene: | |
| 277 gene = gene.split(item)[0] | |
| 278 gene = gene.replace("'", "''") | |
| 279 gene = gene.replace("GENE_", "") | |
| 280 gene = gene.replace("ID_", "") | |
| 281 gene = gene.upper() | |
| 282 if tran in master_dict: | |
| 283 master_dict[tran]["strand"] = strand | |
| 284 if strand == "+": | |
| 285 if annot_type in master_dict[tran]: | |
| 286 master_dict[tran][annot_type].append((start, end)) | |
| 287 else: | |
| 288 if annot_type in master_dict[tran]: | |
| 289 master_dict[tran][annot_type].append((start, end)) | |
| 290 master_dict[tran]["chrom"] = chrom | |
| 291 master_dict[tran]["version"] = version | |
| 292 if gene_found == True: | |
| 293 master_dict[tran]["gene_name"] = gene | |
| 294 else: | |
| 295 notinfasta.write("{}\n".format(tran)) | |
| 296 | |
| 297 | |
| 298 annotation_file = open(sys.argv[2], "r") | |
| 299 parse_gtf_file(annotation_file) | |
| 300 | |
| 301 | |
| 302 # remove transcripts that were in fasta file but not in annotation_file | |
| 303 notinannotation = [] | |
| 304 for tran in master_dict: | |
| 305 if master_dict[tran]["chrom"] == "": | |
| 306 # print ("tran {} has no chrom :(".format(tran)) | |
| 307 notinannotation.append(tran) | |
| 308 for tran in notinannotation: | |
| 309 print(tran) | |
| 310 del master_dict[tran] | |
| 311 # Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True | |
| 312 coding_genes_dict = {} | |
| 313 # parse master_dict to calculate length, cds_start/stop and exon junction positions | |
| 314 for tran in master_dict: | |
| 315 try: | |
| 316 transeq = master_dict[tran]["seq"] | |
| 317 except Exception as e: | |
| 318 print("not in fasta", tran) | |
| 319 notinfasta.write("{}\n".format(tran)) | |
| 320 continue | |
| 321 exon_junctions = [] | |
| 322 total_length = len(transeq) | |
| 323 three_len = 1 | |
| 324 five_len = 1 | |
| 325 strand = master_dict[tran]["strand"] | |
| 326 if master_dict[tran]["gene_name"] == "": | |
| 327 master_dict[tran]["gene_name"] = tran | |
| 328 gene = master_dict[tran]["gene_name"] | |
| 329 if gene not in coding_genes_dict: | |
| 330 coding_genes_dict[gene] = False | |
| 331 | |
| 332 if master_dict[tran]["cds"] == []: | |
| 333 tran_type = "noncoding" | |
| 334 cds_start = "NULL" | |
| 335 cds_stop = "NULL" | |
| 336 else: | |
| 337 # get utr lengths from annotation | |
| 338 tran_type = "coding" | |
| 339 coding_genes_dict[gene] = True | |
| 340 sorted_exons = sorted(master_dict[tran]["exon"]) | |
| 341 sorted_cds = sorted(master_dict[tran]["cds"]) | |
| 342 | |
| 343 min_cds = sorted_cds[0][0] | |
| 344 # Some annotation files do not have utr annotation types, so fix that here if thats the case | |
| 345 if master_dict[tran]["utr"] == []: | |
| 346 for exon_tup in master_dict[tran]["exon"]: | |
| 347 if exon_tup not in master_dict[tran]["cds"]: | |
| 348 # Now check if this overlaps with any of the CDS exons | |
| 349 overlap = False | |
| 350 for cds_tup in master_dict[tran]["cds"]: | |
| 351 if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]: | |
| 352 master_dict[tran]["utr"].append((cds_tup[1], exon_tup[1])) | |
| 353 overlap = True | |
| 354 if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]: | |
| 355 master_dict[tran]["utr"].append((exon_tup[0], cds_tup[0])) | |
| 356 overlap = True | |
| 357 if overlap == False: | |
| 358 master_dict[tran]["utr"].append(exon_tup) | |
| 359 | |
| 360 for tup in sorted(master_dict[tran]["utr"]): | |
| 361 if tup[0] < min_cds: | |
| 362 five_len += (tup[1] - tup[0]) + 1 | |
| 363 elif tup[0] > min_cds: | |
| 364 three_len += (tup[1] - tup[0]) + 1 | |
| 365 else: | |
| 366 pass | |
| 367 if strand == "+": | |
| 368 if len(sorted_exons) > 1: | |
| 369 sorted_exons[0] = ( | |
| 370 sorted_exons[0][0] - pseudo_utr_len, | |
| 371 sorted_exons[0][1], | |
| 372 ) | |
| 373 sorted_exons[-1] = ( | |
| 374 sorted_exons[-1][0], | |
| 375 sorted_exons[-1][1] + pseudo_utr_len, | |
| 376 ) | |
| 377 else: | |
| 378 sorted_exons[0] = ( | |
| 379 sorted_exons[0][0] - pseudo_utr_len, | |
| 380 sorted_exons[0][1] + pseudo_utr_len, | |
| 381 ) | |
| 382 master_dict[tran]["exon"] = sorted_exons | |
| 383 cds_start = five_len + pseudo_utr_len | |
| 384 cds_stop = ((total_length - three_len) - pseudo_utr_len) + 4 | |
| 385 elif strand == "-": | |
| 386 if len(sorted_exons) > 1: | |
| 387 sorted_exons[0] = ( | |
| 388 (sorted_exons[0][0] - pseudo_utr_len), | |
| 389 sorted_exons[0][1], | |
| 390 ) | |
| 391 sorted_exons[-1] = ( | |
| 392 sorted_exons[-1][0], | |
| 393 (sorted_exons[-1][1] + pseudo_utr_len), | |
| 394 ) | |
| 395 else: | |
| 396 sorted_exons[0] = ( | |
| 397 (sorted_exons[0][0] - pseudo_utr_len), | |
| 398 (sorted_exons[0][1] + pseudo_utr_len), | |
| 399 ) | |
| 400 master_dict[tran]["exon"] = sorted_exons | |
| 401 cds_start = three_len + pseudo_utr_len | |
| 402 cds_stop = ((total_length - (five_len)) - pseudo_utr_len) + 4 | |
| 403 # if tran == "ENST00000381401": | |
| 404 # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len) | |
| 405 # #sys.exit() | |
| 406 else: | |
| 407 print("strand is unknown: {}".format(strand)) | |
| 408 sys.exit() | |
| 409 # get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop | |
| 410 total_intronic = 0 | |
| 411 try: | |
| 412 if strand == "+": | |
| 413 tx_start = min(sorted(master_dict[tran]["exon"]))[0] | |
| 414 prev_end = tx_start | |
| 415 for tup in sorted(master_dict[tran]["exon"])[:-1]: | |
| 416 total_intronic += tup[0] - prev_end | |
| 417 exon_junctions.append(((tup[1]) - tx_start) - total_intronic) | |
| 418 prev_end = tup[1] | |
| 419 elif strand == "-": | |
| 420 tx_start = max(sorted(master_dict[tran]["exon"]))[-1] | |
| 421 prev_end = tx_start | |
| 422 for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]: | |
| 423 total_intronic += (tup[0] + 1) - prev_end | |
| 424 exon_junctions.append(((tup[1]) - tx_start) - total_intronic) | |
| 425 prev_end = tup[1] | |
| 426 except: | |
| 427 if strand == "+": | |
| 428 tx_start = min(sorted(master_dict[tran]["cds"]))[0] | |
| 429 prev_end = tx_start | |
| 430 for tup in sorted(master_dict[tran]["cds"])[:-1]: | |
| 431 total_intronic += tup[0] - prev_end | |
| 432 exon_junctions.append(((tup[1]) - tx_start) - total_intronic) | |
| 433 prev_end = tup[1] | |
| 434 elif strand == "-": | |
| 435 tx_start = max(sorted(master_dict[tran]["cds"]))[-1] | |
| 436 prev_end = tx_start | |
| 437 for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]: | |
| 438 total_intronic += (tup[0] + 1) - prev_end | |
| 439 exon_junctions.append(((tup[1]) - tx_start) - total_intronic) | |
| 440 prev_end = tup[1] | |
| 441 # This can happen when a coding transcript doesn't have a properly annotated 3' trailer | |
| 442 if cds_stop != "NULL": | |
| 443 if cds_stop > total_length: | |
| 444 cds_stop = total_length | |
| 445 if strand == "+" and cds_start != "NULL": | |
| 446 master_dict[tran]["cds_start"] = cds_start | |
| 447 master_dict[tran]["cds_stop"] = cds_stop | |
| 448 elif strand == "-" and cds_start != "NULL": | |
| 449 master_dict[tran]["cds_start"] = cds_start | |
| 450 master_dict[tran]["cds_stop"] = cds_stop | |
| 451 | |
| 452 master_dict[tran]["strand"] = strand | |
| 453 master_dict[tran]["tran_type"] = tran_type | |
| 454 master_dict[tran]["exon_junctions"] = exon_junctions | |
| 455 | |
| 456 longest_tran_dict = {} | |
| 457 for tran in master_dict: | |
| 458 try: | |
| 459 gene = master_dict[tran]["gene_name"] | |
| 460 except: | |
| 461 continue | |
| 462 if coding_genes_dict[gene] == True: | |
| 463 if "cds_start" in master_dict[tran]: | |
| 464 if ( | |
| 465 master_dict[tran]["cds_stop"] != "NULL" | |
| 466 and master_dict[tran]["cds_start"] != "NULL" | |
| 467 ): | |
| 468 cds_length = ( | |
| 469 master_dict[tran]["cds_stop"] - master_dict[tran]["cds_start"] | |
| 470 ) | |
| 471 if gene not in longest_tran_dict: | |
| 472 longest_tran_dict[gene] = {"tran": tran, "length": cds_length} | |
| 473 else: | |
| 474 if cds_length > longest_tran_dict[gene]["length"]: | |
| 475 longest_tran_dict[gene] = {"tran": tran, "length": cds_length} | |
| 476 if cds_length == longest_tran_dict[gene]["length"]: | |
| 477 if ( | |
| 478 master_dict[tran]["length"] | |
| 479 > master_dict[longest_tran_dict[gene]["tran"]]["length"] | |
| 480 ): | |
| 481 longest_tran_dict[gene] = { | |
| 482 "tran": tran, | |
| 483 "length": cds_length, | |
| 484 } | |
| 485 else: | |
| 486 length = master_dict[tran]["length"] | |
| 487 if gene not in longest_tran_dict: | |
| 488 longest_tran_dict[gene] = {"tran": tran, "length": length} | |
| 489 elif length > longest_tran_dict[gene]["length"]: | |
| 490 longest_tran_dict[gene] = {"tran": tran, "length": length} | |
| 491 | |
| 492 | |
| 493 for gene in longest_tran_dict: | |
| 494 longest_tran = longest_tran_dict[gene]["tran"] | |
| 495 master_dict[longest_tran]["principal"] = 1 | |
| 496 | |
| 497 gene_sample = [] | |
| 498 for key in list(master_dict)[:10]: | |
| 499 try: | |
| 500 gene_sample.append(master_dict[key]["gene_name"]) | |
| 501 except: | |
| 502 pass | |
| 503 print(master_dict) | |
| 504 print("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10])) | |
| 505 print("Here is a sample of the gene names: {}".format(gene_sample)) | |
| 506 | |
| 507 | |
| 508 # Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time. | |
| 509 def tran_to_genome(tran, start_pos, end_pos, master_dict): | |
| 510 pos_list = [] | |
| 511 for i in range(start_pos, end_pos + 1): | |
| 512 pos_list.append(i) | |
| 513 genomic_pos_list = [] | |
| 514 if tran in master_dict: | |
| 515 transcript_info = master_dict[tran] | |
| 516 else: | |
| 517 return ("Null", []) | |
| 518 | |
| 519 chrom = transcript_info["chrom"] | |
| 520 strand = transcript_info["strand"] | |
| 521 exons = sorted(transcript_info["exon"]) | |
| 522 # print ("chrom,strand,exons",chrom,strand,exons) | |
| 523 for pos in pos_list: | |
| 524 # print ("pos",pos) | |
| 525 if strand == "+": | |
| 526 exon_start = 0 | |
| 527 for tup in exons: | |
| 528 # print ("tup",tup) | |
| 529 exon_start = tup[0] | |
| 530 exonlen = tup[1] - tup[0] | |
| 531 if pos > exonlen: | |
| 532 pos = (pos - exonlen) - 1 | |
| 533 else: | |
| 534 break | |
| 535 # print ("appending exon_start-pos", exon_start, pos, exon_start+pos) | |
| 536 genomic_pos_list.append((exon_start + pos) - 1) | |
| 537 elif strand == "-": | |
| 538 exon_start = 0 | |
| 539 for tup in exons[::-1]: | |
| 540 # print ("tup",tup) | |
| 541 exon_start = tup[1] | |
| 542 exonlen = tup[1] - tup[0] | |
| 543 # print ("exonlen",exonlen) | |
| 544 if pos > exonlen: | |
| 545 # print ("pos is greater") | |
| 546 pos = (pos - exonlen) - 1 | |
| 547 # print ("new pos",pos) | |
| 548 else: | |
| 549 break | |
| 550 # print ("appending exon_start-pos", exon_start, pos, exon_start-pos) | |
| 551 genomic_pos_list.append((exon_start - pos) + 1) | |
| 552 return (chrom, genomic_pos_list) | |
| 553 | |
| 554 | |
| 555 orf_dict = { | |
| 556 "uorf": {}, | |
| 557 "ouorf": {}, | |
| 558 "cds": {}, | |
| 559 "nested": {}, | |
| 560 "odorf": {}, | |
| 561 "dorf": {}, | |
| 562 "minusone": {}, | |
| 563 "readthrough": {}, | |
| 564 "plusone": {}, | |
| 565 "noncoding": {}, | |
| 566 "extension": {}, | |
| 567 "inframe_stop": {}, | |
| 568 } | |
| 569 | |
| 570 start_codons = ["ATG", "CTG", "GTG", "TTG", "ATC", "ATA", "ATT", "ACG", "AAG", "AGG"] | |
| 571 | |
| 572 stop_codons = ["TAG", "TAA", "TGA"] | |
| 573 | |
| 574 | |
| 575 # Keep track of the longest transcript for each noncoding gene, append this to transcript list later | |
| 576 longest_noncoding = {} | |
| 577 | |
| 578 | |
| 579 tran_count = 0 | |
| 580 # This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those | |
| 581 # in the transcript list) | |
| 582 genomic_cds_dict = {} | |
| 583 tree_dict = {} | |
| 584 for transcript in master_dict: | |
| 585 # print (transcript, master_dict[transcript]["tran_type"]) | |
| 586 tran_count += 1 | |
| 587 if "seq" not in master_dict[transcript]: | |
| 588 continue | |
| 589 chrom = master_dict[transcript]["chrom"] | |
| 590 if chrom not in genomic_cds_dict: | |
| 591 genomic_cds_dict[chrom] = [] | |
| 592 if "cds_start" in master_dict[transcript]: | |
| 593 cds_start = master_dict[transcript]["cds_start"] | |
| 594 cds_stop = master_dict[transcript]["cds_stop"] | |
| 595 if cds_start != "NULL": | |
| 596 cds_pos = [] | |
| 597 for i in range(cds_start, cds_stop + 1): | |
| 598 cds_pos.append(i) | |
| 599 | |
| 600 for tup in master_dict[transcript]["cds"]: | |
| 601 if tup[0] != tup[1]: | |
| 602 if tup not in genomic_cds_dict[chrom]: | |
| 603 genomic_cds_dict[chrom].append(tup) | |
| 604 | |
| 605 print("genomic cds dict built") | |
| 606 print(list(genomic_cds_dict)) | |
| 607 for chrom in genomic_cds_dict: | |
| 608 tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom]) | |
| 609 | |
| 610 # print (list(tree_dict)) | |
| 611 | |
| 612 | |
| 613 connection = sqlite3.connect("{}.sqlite".format(organism)) | |
| 614 cursor = connection.cursor() | |
| 615 cursor.execute( | |
| 616 "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));" | |
| 617 ) | |
| 618 cursor.execute( | |
| 619 "CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));" | |
| 620 ) | |
| 621 cursor.execute( | |
| 622 "CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));" | |
| 623 ) | |
| 624 cursor.execute( | |
| 625 "CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 626 ) | |
| 627 cursor.execute( | |
| 628 "CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 629 ) | |
| 630 cursor.execute( | |
| 631 "CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 632 ) | |
| 633 cursor.execute( | |
| 634 "CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 635 ) | |
| 636 cursor.execute( | |
| 637 "CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 638 ) | |
| 639 cursor.execute( | |
| 640 "CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 641 ) | |
| 642 cursor.execute( | |
| 643 "CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 644 ) | |
| 645 cursor.execute( | |
| 646 "CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 647 ) | |
| 648 cursor.execute( | |
| 649 "CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 650 ) | |
| 651 cursor.execute( | |
| 652 "CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 653 ) | |
| 654 cursor.execute( | |
| 655 "CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 656 ) | |
| 657 cursor.execute( | |
| 658 "CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | |
| 659 ) | |
| 660 connection.commit() | |
| 661 | |
| 662 | |
| 663 print("Finding ORFs") | |
| 664 transcript_count = 0 | |
| 665 total_transcripts = len(list(master_dict)) | |
| 666 for transcript in master_dict: | |
| 667 # print ("transcript",transcript) | |
| 668 # if transcript != "ENST00000316448": | |
| 669 # continue | |
| 670 transcript_count += 1 | |
| 671 if transcript_count % 100 == 0: | |
| 672 print("Transcripts complete: {}/{}".format(transcript_count, total_transcripts)) | |
| 673 if "seq" not in master_dict[transcript]: | |
| 674 print("transcript {} has no sequence".format(transcript)) | |
| 675 continue | |
| 676 seq = master_dict[transcript]["seq"] | |
| 677 cds_start = "NULL" | |
| 678 cds_stop = "NULL" | |
| 679 transcript_len = len(seq) | |
| 680 if "cds_start" in master_dict[transcript]: | |
| 681 cds_start = master_dict[transcript]["cds_start"] | |
| 682 cds_stop = master_dict[transcript]["cds_stop"] | |
| 683 | |
| 684 # Find out what regions of this transcript overlap with any other coding regions | |
| 685 coding_positions = [] | |
| 686 if cds_start != "NULL": | |
| 687 # If this is a coding transcript don't bother checking the CDS | |
| 688 for i in range(cds_start, cds_stop): | |
| 689 coding_positions.append(i) | |
| 690 # check 5' leader | |
| 691 chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict) | |
| 692 for i in range(0, cds_start): | |
| 693 genomic_pos = pos_list[i] | |
| 694 overlap = tree_dict[chrom][genomic_pos] | |
| 695 if len(overlap) != 0: | |
| 696 coding_positions.append(i) | |
| 697 # check 3' trailer | |
| 698 chrom, pos_list = tran_to_genome( | |
| 699 transcript, cds_stop, transcript_len, master_dict | |
| 700 ) | |
| 701 for i in range(cds_stop, transcript_len + 1): | |
| 702 # print ("i",i) | |
| 703 genomic_pos = pos_list[i - cds_stop] | |
| 704 # print ("genomic position",genomic_pos) | |
| 705 overlap = tree_dict[chrom][genomic_pos] | |
| 706 if len(overlap) != 0: | |
| 707 # print ("overlap not empty appending i",overlap) | |
| 708 coding_positions.append(i) | |
| 709 else: | |
| 710 # check entire transcript | |
| 711 chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict) | |
| 712 for i in range(0, transcript_len): | |
| 713 genomic_pos = pos_list[i] | |
| 714 overlap = tree_dict[chrom][genomic_pos] | |
| 715 if len(overlap) != 0: | |
| 716 coding_positions.append(i) | |
| 717 coding_positions_tuple = to_ranges(coding_positions) | |
| 718 coding_dict[transcript] = coding_positions_tuple | |
| 719 coding_positions = set(coding_positions) | |
| 720 # if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates | |
| 721 if cds_start != "NULL": | |
| 722 # print ("transcript", transcript) | |
| 723 # if pseudo_utr_len != 0: | |
| 724 # cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null | |
| 725 recoding_dict = {2: "minusone", 0: "readthrough", 1: "plusone"} | |
| 726 for addition in recoding_dict: | |
| 727 orftype = recoding_dict[addition] | |
| 728 for i in range(cds_stop + addition, transcript_len, 3): | |
| 729 if seq[i : i + 3] in stop_codons: | |
| 730 # orf_seq = seq[cds_stop:i+3] | |
| 731 orf_start = cds_stop | |
| 732 if orftype == "readthrough": | |
| 733 orf_start -= 2 | |
| 734 if orftype == "plusone": | |
| 735 orf_start -= 1 | |
| 736 orf_stop = i + 3 # +2 so it refers to the end of the stop codon | |
| 737 start_codon = None | |
| 738 length = (i + 3) - cds_stop | |
| 739 orf_pos_list = [] | |
| 740 # determine how many nucleotides in this orf overlap with an annotated coding region | |
| 741 cds_cov_count = 0.0 | |
| 742 for position in range(orf_start, orf_stop): | |
| 743 orf_pos_list.append(position) | |
| 744 for pos in range(orf_start, orf_stop + 1): | |
| 745 if pos in coding_positions: | |
| 746 cds_cov_count += 1 | |
| 747 cds_cov = cds_cov_count / length | |
| 748 # print ("orftype, start, stop", orftype, orf_start, orf_stop) | |
| 749 cursor.execute( | |
| 750 "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format( | |
| 751 orftype, | |
| 752 transcript, | |
| 753 start_codon, | |
| 754 length, | |
| 755 orf_start, | |
| 756 orf_stop, | |
| 757 cds_cov, | |
| 758 ) | |
| 759 ) | |
| 760 break | |
| 761 # sys.exit() | |
| 762 for frame in [0, 1, 2]: | |
| 763 for i in range(frame, transcript_len, 3): | |
| 764 if seq[i : i + 3] in start_codons: | |
| 765 for x in range(i, transcript_len, 3): | |
| 766 if seq[x : x + 3] in stop_codons: | |
| 767 # orf_seq = seq[i:x+3] | |
| 768 orf_start = i + 1 | |
| 769 orf_stop = x + 3 # +2 so it refers to the end of the stop codon | |
| 770 start_codon = seq[i : i + 3] | |
| 771 length = (x + 3) - i | |
| 772 orf_pos_list = [] | |
| 773 # determine how many nucleotides in this orf overlap with an annotated coding region | |
| 774 cds_cov_count = 0.0 | |
| 775 for pos in range(orf_start, orf_stop + 1): | |
| 776 if pos in coding_positions: | |
| 777 cds_cov_count += 1 | |
| 778 cds_cov = float(cds_cov_count) / float(length) | |
| 779 # Now determine orf type | |
| 780 if cds_start == "NULL": | |
| 781 orftype = "noncoding" | |
| 782 else: | |
| 783 # print ("cds start is not null :{}:{}".format(cds_start,cds_stop)) | |
| 784 # print ("orf start, orf stop", orf_start, orf_stop) | |
| 785 if orf_start == cds_start and orf_stop == cds_stop: | |
| 786 orftype = "cds" | |
| 787 # print ("orf type is cds") | |
| 788 elif orf_start < cds_start and orf_stop == cds_stop: | |
| 789 orftype = "extension" | |
| 790 # special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage | |
| 791 orf_stop = cds_start | |
| 792 cds_cov_count = 0.0 | |
| 793 for pos in range(orf_start, orf_stop + 1): | |
| 794 if pos in coding_positions: | |
| 795 cds_cov_count += 1 | |
| 796 cds_cov = float(cds_cov_count) / float(length) | |
| 797 # orf_seq = seq[orf_start:cds_start] | |
| 798 length = cds_start - orf_start | |
| 799 # print ("orf type is extension") | |
| 800 elif orf_start < cds_start and orf_stop <= cds_start: | |
| 801 orftype = "uorf" | |
| 802 # print ("orf type is uorf") | |
| 803 elif orf_start < cds_start and orf_stop > cds_start: | |
| 804 orftype = "ouorf" | |
| 805 # print ("orf type is ouorf") | |
| 806 # sys.exit() | |
| 807 elif ( | |
| 808 orf_start >= cds_start | |
| 809 and orf_start <= cds_stop | |
| 810 and orf_stop <= cds_stop | |
| 811 ): | |
| 812 if orf_stop == cds_stop: | |
| 813 break | |
| 814 # print ("Tran, cds_start, cds_stop, orf_start, orf_stop, tranlen",tran, cds_start, cds_stop, orf_start, orf_stop,transcript_len) | |
| 815 if ( | |
| 816 orf_stop < transcript_len | |
| 817 and orf_stop % 3 == cds_stop % 3 | |
| 818 ) or ( | |
| 819 cds_start != 1 | |
| 820 and orf_stop % 3 == (cds_start + 2) % 3 | |
| 821 ): | |
| 822 # print ("Transcript {} has an inframe stop codon".format(transcript)) | |
| 823 break | |
| 824 orftype = "nested" | |
| 825 # print ("orf type is nested") | |
| 826 elif ( | |
| 827 orf_start >= cds_start | |
| 828 and orf_start <= cds_stop | |
| 829 and orf_stop > cds_stop | |
| 830 ): | |
| 831 orftype = "odorf" | |
| 832 # print ("orftype is odorf") | |
| 833 elif orf_start > cds_stop and orf_stop > cds_stop: | |
| 834 orftype = "dorf" | |
| 835 # print ("orftype is dorf") | |
| 836 # if orf_stop > cds_start and orf_stop < cds_stop: | |
| 837 # if (orf_stop+1)%3 == cds_start%3: | |
| 838 # orftype = "inframe_stop" | |
| 839 # print ("inframe stop, transcript, orf_stop", transcript, orf_stop) | |
| 840 # sys.exit() | |
| 841 # if transcript not in orf_dict: | |
| 842 # orf_dict[orftype][transcript] = [] | |
| 843 # #print ("some weird stop or something") | |
| 844 cursor.execute( | |
| 845 "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format( | |
| 846 orftype, | |
| 847 transcript, | |
| 848 start_codon, | |
| 849 length, | |
| 850 orf_start, | |
| 851 orf_stop, | |
| 852 cds_cov, | |
| 853 ) | |
| 854 ) | |
| 855 break | |
| 856 # Used to keep track of the codons at cds_start and cds_stop positions, | |
| 857 # If there is an issue with the cds co-ordinates the starts and stops counts will | |
| 858 # be much lower than the other count, start with 1 to prevent division by 0 | |
| 859 nuc_dict = {"stops": {"stops": 1, "other": 0}, "starts": {"starts": 1, "other": 0}} | |
| 860 | |
| 861 | |
| 862 def calcgc(seq): | |
| 863 if seq == "": | |
| 864 return "NULL" | |
| 865 g_count = 0 | |
| 866 c_count = 0 | |
| 867 a_count = 0 | |
| 868 t_count = 0 | |
| 869 for char in seq: | |
| 870 if char == "A": | |
| 871 a_count += 1 | |
| 872 if char == "T": | |
| 873 t_count += 1 | |
| 874 if char == "G": | |
| 875 g_count += 1 | |
| 876 if char == "C": | |
| 877 c_count += 1 | |
| 878 gc = ((g_count + c_count) / float(len(seq))) * 100 | |
| 879 return round(gc, 2) | |
| 880 | |
| 881 | |
| 882 for transcript in master_dict: | |
| 883 # print ("transcripts", transcript) | |
| 884 length = master_dict[transcript]["length"] | |
| 885 cds_start = master_dict[transcript]["cds_start"] | |
| 886 cds_stop = master_dict[transcript]["cds_stop"] | |
| 887 seq = master_dict[transcript]["seq"] | |
| 888 strand = master_dict[transcript]["strand"] | |
| 889 chrom = master_dict[transcript]["chrom"] | |
| 890 gene = master_dict[transcript]["gene_name"] | |
| 891 gc = calcgc(seq) | |
| 892 five_gc = "NULL" | |
| 893 cds_gc = "NULL" | |
| 894 three_gc = "NULL" | |
| 895 if cds_start != "NULL": | |
| 896 five_gc = calcgc(seq[:cds_start]) | |
| 897 cds_gc = calcgc(seq[cds_start:cds_stop]) | |
| 898 three_gc = calcgc(seq[cds_stop:]) | |
| 899 # check that the nucleotide cds_start points to is the first of the start codon | |
| 900 # take one becase cds_start is 1 based but python indexing is 0 based | |
| 901 start_nuc = seq[cds_start - 1 : cds_start + 2] | |
| 902 # print ("start nuc",start_nuc) | |
| 903 if start_nuc == "ATG": | |
| 904 nuc_dict["starts"]["starts"] += 1 | |
| 905 else: | |
| 906 nuc_dict["starts"]["other"] += 1 | |
| 907 stop_nuc = seq[cds_stop - 3 : cds_stop] | |
| 908 # print ("stop_nuc",stop_nuc) | |
| 909 if stop_nuc in ["TAG", "TAA", "TGA"]: | |
| 910 nuc_dict["stops"]["stops"] += 1 | |
| 911 else: | |
| 912 nuc_dict["stops"]["other"] += 1 | |
| 913 tran_type = master_dict[transcript]["tran_type"] | |
| 914 if coding_genes_dict[gene] == True: | |
| 915 gene_type = 1 | |
| 916 else: | |
| 917 gene_type = 0 | |
| 918 # print ("tran type before",tran_type) | |
| 919 if tran_type == "coding": | |
| 920 tran_type = 1 | |
| 921 else: | |
| 922 tran_type = 0 | |
| 923 # print ("tran type after",tran_type) | |
| 924 start_list = str(master_dict[transcript]["start_list"]).replace(" ", "").strip("[]") | |
| 925 stop_list = str(master_dict[transcript]["stop_list"]).replace(" ", "").strip("[]") | |
| 926 exon_junctions = ( | |
| 927 str(master_dict[transcript]["exon_junctions"]).replace(" ", "").strip("[]") | |
| 928 ) | |
| 929 principal = master_dict[transcript]["principal"] | |
| 930 version = master_dict[transcript]["version"] | |
| 931 # print (master_dict[transcript]) | |
| 932 # print (tran_type) | |
| 933 # print (gene_type) | |
| 934 # print (principal) | |
| 935 # print (version) | |
| 936 # print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version)) | |
| 937 cursor.execute( | |
| 938 "INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format( | |
| 939 transcript, | |
| 940 gene, | |
| 941 length, | |
| 942 cds_start, | |
| 943 cds_stop, | |
| 944 seq, | |
| 945 strand, | |
| 946 stop_list, | |
| 947 start_list, | |
| 948 exon_junctions, | |
| 949 tran_type, | |
| 950 gene_type, | |
| 951 principal, | |
| 952 version, | |
| 953 gc, | |
| 954 five_gc, | |
| 955 cds_gc, | |
| 956 three_gc, | |
| 957 chrom, | |
| 958 ) | |
| 959 ) | |
| 960 | |
| 961 for tup in master_dict[transcript]["exon"]: | |
| 962 cursor.execute( | |
| 963 "INSERT INTO exons VALUES('{}',{},{});".format(transcript, tup[0], tup[1]) | |
| 964 ) | |
| 965 if transcript in coding_dict: | |
| 966 for tup in coding_dict[transcript]: | |
| 967 cursor.execute( | |
| 968 "INSERT INTO coding_regions VALUES('{}',{},{});".format( | |
| 969 transcript, tup[0], tup[1] | |
| 970 ) | |
| 971 ) | |
| 972 | |
| 973 connection.commit() | |
| 974 connection.close() | |
| 975 | |
| 976 print("delim", delimiters) | |
| 977 if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05: | |
| 978 print( | |
| 979 "Warning: {} transcripts do not have a an AUG at the CDS start position".format( | |
| 980 nuc_dict["starts"]["other"] | |
| 981 ) | |
| 982 ) | |
| 983 if (nuc_dict["stops"]["other"] / nuc_dict["stops"]["stops"]) > 0.05: | |
| 984 print( | |
| 985 "Warning: {} transcripts do not have a an stop codon at the CDS stop position".format( | |
| 986 nuc_dict["stops"]["other"] | |
| 987 ) | |
| 988 ) | |
| 989 if len(notinannotation) > 0: | |
| 990 print( | |
| 991 "Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format( | |
| 992 len(notinannotation) | |
| 993 ) | |
| 994 ) |
