| 0 | 1 # Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file | 
|  | 2 # and produces an sqlite file which can be uploaded to Trips-Viz | 
|  | 3 # All co-ordinates produced are 1 based | 
|  | 4 # All start codon positions (including cds_start) should be at the first nucleotide of the codon | 
|  | 5 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon | 
|  | 6 import sys | 
|  | 7 import re | 
|  | 8 import sqlite3 | 
| 3 | 9 import subprocess | 
|  | 10 | 
| 0 | 11 from intervaltree import Interval, IntervalTree | 
|  | 12 import itertools | 
|  | 13 from sqlitedict import SqliteDict | 
|  | 14 import os | 
|  | 15 | 
|  | 16 organism = sys.argv[1] | 
|  | 17 # This should be a GTF or GFF3 file | 
|  | 18 annotation_file = open(sys.argv[2], "r") | 
|  | 19 # This needs to be the transcriptomic fasta file | 
|  | 20 fasta_file = open(sys.argv[3], "r") | 
|  | 21 # This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs | 
|  | 22 pseudo_utr_len = int(sys.argv[4]) | 
|  | 23 # An example of a transcript_id from the annotation file, e.g ENST000000123456 | 
|  | 24 user_transcript_id = sys.argv[5] | 
|  | 25 # An example of a gene name from the annotation file | 
|  | 26 user_gene_name = sys.argv[6] | 
|  | 27 # Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1 | 
|  | 28 TRAN_VERSION = True | 
| 3 | 29 output = sys.argv[7] | 
| 0 | 30 | 
|  | 31 | 
|  | 32 if os.path.isfile("{}.sqlite".format(organism)): | 
|  | 33     print("{}.sqlite already exists".format(organism)) | 
|  | 34     sys.exit() | 
|  | 35 | 
|  | 36 | 
|  | 37 # old_exons = SqliteDict( | 
| 2 | 38 #     "/home/data2/www/tripsviz/tripsviz/trips_annotations/mus_musculus/transcriptomic_to_genomic.sqlite" | 
| 0 | 39 # ) | 
|  | 40 | 
|  | 41 | 
|  | 42 delimiters = { | 
|  | 43     "transcripts": {"before": [], "after": ["."], "annot_types": ["cds", "utr"]}, | 
|  | 44     "genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]}, | 
|  | 45 } | 
|  | 46 | 
|  | 47 punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"] | 
|  | 48 # Find delimiters in the annotation and fasta files using the user_transcript_id | 
|  | 49 # and user_gene_name examples given by user. | 
|  | 50 for line in annotation_file: | 
|  | 51     if user_transcript_id in line: | 
|  | 52         tabsplitline = line.split("\t") | 
|  | 53         annot_type = tabsplitline[2] | 
|  | 54         if annot_type not in delimiters["transcripts"]["annot_types"]: | 
|  | 55             delimiters["transcripts"]["annot_types"].append(annot_type.lower()) | 
|  | 56         splitline = line.split(user_transcript_id) | 
|  | 57         before_delimiter = splitline[0] | 
|  | 58         for item in punctuation: | 
|  | 59             if item in before_delimiter: | 
|  | 60                 if len(before_delimiter.split(item)[-1]) >= 5: | 
|  | 61                     before_delimiter = before_delimiter.split(item)[-1] | 
|  | 62         after_delimiter = splitline[1][:2] | 
|  | 63         if ( | 
|  | 64             before_delimiter not in delimiters["transcripts"]["before"] | 
|  | 65             and len(before_delimiter) >= 5 | 
|  | 66         ): | 
|  | 67             delimiters["transcripts"]["before"].append(before_delimiter) | 
|  | 68         if after_delimiter not in delimiters["transcripts"]["after"]: | 
|  | 69             delimiters["transcripts"]["after"].append(after_delimiter) | 
|  | 70     if user_gene_name in line: | 
|  | 71         tabsplitline = line.split("\t") | 
|  | 72         annot_type = tabsplitline[2] | 
|  | 73         print("ANNOT TYPE", annot_type) | 
|  | 74         if annot_type not in delimiters["genes"]["annot_types"]: | 
|  | 75             delimiters["genes"]["annot_types"].append(annot_type.lower()) | 
|  | 76         splitline = line.split(user_gene_name) | 
|  | 77         before_delimiter = splitline[0] | 
|  | 78         for item in punctuation: | 
|  | 79             if item in before_delimiter: | 
|  | 80                 if len(before_delimiter.split(item)[-1]) >= 5: | 
|  | 81                     before_delimiter = before_delimiter.split(item)[-1] | 
|  | 82         after_delimiter = splitline[1][0] | 
|  | 83         if ( | 
|  | 84             before_delimiter not in delimiters["genes"]["before"] | 
|  | 85             and len(before_delimiter) >= 5 | 
|  | 86         ): | 
|  | 87             delimiters["genes"]["before"].append(before_delimiter) | 
|  | 88         if after_delimiter not in delimiters["genes"]["after"]: | 
|  | 89             if after_delimiter in punctuation: | 
|  | 90                 delimiters["genes"]["after"].append(after_delimiter) | 
|  | 91 | 
|  | 92 print("delimeters[genes]", delimiters["transcripts"]["annot_types"]) | 
|  | 93 | 
|  | 94 for line in fasta_file: | 
|  | 95     if user_transcript_id in line: | 
|  | 96         splitline = line.split(user_transcript_id) | 
|  | 97         before_delimiter = splitline[0] | 
|  | 98         for item in punctuation: | 
|  | 99             if item in before_delimiter: | 
|  | 100                 if len(before_delimiter.split(item)[1]) >= 5: | 
|  | 101                     before_delimiter = before_delimiter.split(item)[1] | 
|  | 102         after_delimiter = splitline[1][0] | 
|  | 103         if ( | 
|  | 104             before_delimiter not in delimiters["transcripts"]["before"] | 
|  | 105             and len(before_delimiter) >= 5 | 
|  | 106         ): | 
|  | 107             delimiters["transcripts"]["before"].append(before_delimiter) | 
|  | 108         if after_delimiter not in delimiters["transcripts"]["after"]: | 
|  | 109             delimiters["transcripts"]["after"].append(after_delimiter) | 
|  | 110 fasta_file.close() | 
|  | 111 annotation_file.close() | 
|  | 112 | 
|  | 113 | 
|  | 114 if delimiters["transcripts"]["before"] == []: | 
|  | 115     print( | 
|  | 116         "ERROR: No transcript_id with the name {} could be found in the annotation file".format( | 
|  | 117             user_transcript_id | 
|  | 118         ) | 
|  | 119     ) | 
|  | 120     sys.exit() | 
|  | 121 if delimiters["genes"]["before"] == []: | 
|  | 122     print( | 
|  | 123         "ERROR: No gene with the name {} could be found in the annotation file".format( | 
|  | 124             user_gene_name | 
|  | 125         ) | 
|  | 126     ) | 
|  | 127     sys.exit() | 
|  | 128 | 
|  | 129 master_dict = {} | 
|  | 130 coding_dict = {} | 
|  | 131 notinfasta = open("notinfasta.csv", "w") | 
|  | 132 | 
|  | 133 # Given a nucleotide sequence returns the positions of all start and stop codons. | 
|  | 134 def get_start_stops(transcript_sequence): | 
|  | 135     transcript_sequence = transcript_sequence.upper() | 
|  | 136     start_codons = ["ATG"] | 
|  | 137     stop_codons = ["TAA", "TAG", "TGA"] | 
|  | 138     seq_frames = {"starts": [], "stops": []} | 
|  | 139     for codons, positions in ((start_codons, "starts"), (stop_codons, "stops")): | 
|  | 140         if len(codons) > 1: | 
|  | 141             pat = re.compile("|".join(codons)) | 
|  | 142         else: | 
|  | 143             pat = re.compile(codons[0]) | 
|  | 144         for m in re.finditer(pat, transcript_sequence): | 
|  | 145             # Increment position by 1, Frame 1 starts at position 1 not 0, | 
|  | 146             # if it's a stop codon add another 2 so it points to the last nuc of the codon | 
|  | 147             if positions == "starts": | 
|  | 148                 start = m.start() + 1 | 
|  | 149             else: | 
|  | 150                 start = m.start() + 3 | 
|  | 151             seq_frames[positions].append(start) | 
|  | 152     return seq_frames | 
|  | 153 | 
|  | 154 | 
|  | 155 # parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons. | 
|  | 156 fasta_file = open(sys.argv[3], "r") | 
|  | 157 read_fasta = fasta_file.read() | 
|  | 158 split_fasta = read_fasta.split(">") | 
|  | 159 for entry in split_fasta[1:]: | 
|  | 160     newline_split = entry.split("\n") | 
|  | 161     tran = newline_split[0] | 
|  | 162     for item in delimiters["transcripts"]["after"]: | 
|  | 163         if item in tran: | 
|  | 164             tran = tran.split(item)[0] | 
|  | 165     tran = tran.replace("-", "_").replace("(", "").replace(")", "") | 
|  | 166     seq = "".join(newline_split[1:]) | 
|  | 167     if "_PAR_Y" in tran: | 
|  | 168         tran += "_chrY" | 
|  | 169     elif "_PAR_X" in tran: | 
|  | 170         tran += "_chrX" | 
|  | 171     tran = tran.upper() | 
|  | 172     starts_stops = get_start_stops(seq) | 
|  | 173     print("tran", tran) | 
|  | 174     if tran not in master_dict: | 
|  | 175         master_dict[tran] = { | 
|  | 176             "utr": [], | 
|  | 177             "cds": [], | 
|  | 178             "exon": [], | 
|  | 179             "start_codon": [], | 
|  | 180             "stop_codon": [], | 
|  | 181             "start_list": starts_stops["starts"], | 
|  | 182             "stop_list": starts_stops["stops"], | 
|  | 183             "transcript": [], | 
|  | 184             "strand": "", | 
|  | 185             "gene_name": "", | 
|  | 186             "chrom": "", | 
|  | 187             "seq": seq, | 
|  | 188             "cds_start": "NULL", | 
|  | 189             "cds_stop": "NULL", | 
|  | 190             "length": len(seq), | 
|  | 191             "principal": 0, | 
|  | 192             "version": "NULL", | 
|  | 193         } | 
|  | 194 | 
|  | 195 | 
|  | 196 def to_ranges(iterable): | 
|  | 197     tup_list = [] | 
|  | 198     iterable = sorted(set(iterable)) | 
|  | 199     for key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]): | 
|  | 200         group = list(group) | 
|  | 201         tup_list.append((group[0][1], group[-1][1])) | 
|  | 202     return tup_list | 
|  | 203 | 
|  | 204 | 
|  | 205 # parse annotation file to get chromsome, exon location and CDS info for each transcript | 
|  | 206 def parse_gtf_file(annotation_file): | 
|  | 207     for line in annotation_file: | 
|  | 208         if line == "\n": | 
|  | 209             continue | 
|  | 210         if line[0] != "#": | 
|  | 211             splitline = (line.replace("\n", "")).split("\t") | 
|  | 212             chrom = splitline[0] | 
|  | 213             try: | 
|  | 214                 annot_type = splitline[2].lower() | 
|  | 215             except: | 
|  | 216                 print( | 
|  | 217                     "ERROR tried to index to second item in splitline: ", | 
|  | 218                     splitline, | 
|  | 219                     line, | 
|  | 220                 ) | 
|  | 221                 sys.exit() | 
|  | 222             # if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]: | 
|  | 223             # 	continue | 
|  | 224             if ( | 
|  | 225                 annot_type not in delimiters["transcripts"]["annot_types"] | 
|  | 226                 and annot_type not in delimiters["genes"]["annot_types"] | 
|  | 227             ): | 
|  | 228                 continue | 
|  | 229             if annot_type == "five_prime_utr" or annot_type == "three_prime_utr": | 
|  | 230                 annot_type = "utr" | 
|  | 231             strand = splitline[6] | 
|  | 232             if strand == "+": | 
|  | 233                 start = int(splitline[3]) | 
|  | 234                 end = int(splitline[4]) | 
|  | 235             else: | 
|  | 236                 start = int(splitline[3]) + 1 | 
|  | 237                 end = int(splitline[4]) + 1 | 
|  | 238             desc = splitline[8] | 
|  | 239             tran = desc | 
|  | 240             gene = desc | 
|  | 241             for item in delimiters["transcripts"]["before"]: | 
|  | 242                 if item in tran: | 
|  | 243                     tran = tran.split(item)[1] | 
|  | 244             for item in delimiters["transcripts"]["after"]: | 
|  | 245                 if item in tran: | 
|  | 246                     tran = tran.split(item)[0] | 
|  | 247             if "." in tran and TRAN_VERSION == True: | 
|  | 248                 # print ("raw tran",tran) | 
|  | 249                 tran = tran.split(".") | 
|  | 250                 version = int(tran[-1].split("_")[0]) | 
|  | 251                 tran = tran[0] | 
|  | 252             else: | 
|  | 253                 version = "NULL" | 
|  | 254             tran = tran.replace("-", "_").replace(".", "_") | 
|  | 255             tran = tran.replace("(", "").replace(")", "") | 
|  | 256             tran = tran.replace(" ", "").replace("\t", "") | 
|  | 257             tran = tran.upper() | 
|  | 258             tran = tran.replace("GENE_", "").replace("ID_", "") | 
|  | 259             if "_PAR_Y" in desc: | 
|  | 260                 # print ("adding _PAR_Y to tran") | 
|  | 261                 tran = tran + "_PAR_Y" | 
|  | 262                 # print ("New tran ", tran) | 
|  | 263             # if "PAR_Y" in line: | 
|  | 264             # 	print (line) | 
|  | 265             # 	#sys.exit() | 
|  | 266             # print ("tran",tran,version) | 
|  | 267             # if tran == "ENST00000316448": | 
|  | 268             # 	print ("annot type",annot_type) | 
|  | 269             # 	print ("appending exon to tran", start, end,line) | 
|  | 270 | 
|  | 271             gene_found = False | 
|  | 272 | 
|  | 273             if annot_type in delimiters["genes"]["annot_types"]: | 
|  | 274                 for item in delimiters["genes"]["before"]: | 
|  | 275                     if item in gene: | 
|  | 276                         gene_found = True | 
|  | 277                         gene = gene.split(item)[1] | 
|  | 278                 for item in delimiters["genes"]["after"]: | 
|  | 279                     if item in gene: | 
|  | 280                         gene = gene.split(item)[0] | 
|  | 281                 gene = gene.replace("'", "''") | 
|  | 282                 gene = gene.replace("GENE_", "") | 
|  | 283                 gene = gene.replace("ID_", "") | 
|  | 284                 gene = gene.upper() | 
|  | 285             if tran in master_dict: | 
|  | 286                 master_dict[tran]["strand"] = strand | 
|  | 287                 if strand == "+": | 
|  | 288                     if annot_type in master_dict[tran]: | 
|  | 289                         master_dict[tran][annot_type].append((start, end)) | 
|  | 290                 else: | 
|  | 291                     if annot_type in master_dict[tran]: | 
|  | 292                         master_dict[tran][annot_type].append((start, end)) | 
|  | 293                 master_dict[tran]["chrom"] = chrom | 
|  | 294                 master_dict[tran]["version"] = version | 
|  | 295                 if gene_found == True: | 
|  | 296                     master_dict[tran]["gene_name"] = gene | 
|  | 297             else: | 
|  | 298                 notinfasta.write("{}\n".format(tran)) | 
|  | 299 | 
|  | 300 | 
|  | 301 annotation_file = open(sys.argv[2], "r") | 
|  | 302 parse_gtf_file(annotation_file) | 
|  | 303 | 
|  | 304 | 
|  | 305 # remove transcripts that were in fasta file but not in annotation_file | 
|  | 306 notinannotation = [] | 
|  | 307 for tran in master_dict: | 
|  | 308     if master_dict[tran]["chrom"] == "": | 
|  | 309         # print ("tran {} has no chrom :(".format(tran)) | 
|  | 310         notinannotation.append(tran) | 
|  | 311 for tran in notinannotation: | 
|  | 312     print(tran) | 
|  | 313     del master_dict[tran] | 
|  | 314 # Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True | 
|  | 315 coding_genes_dict = {} | 
|  | 316 # parse master_dict to calculate length, cds_start/stop and exon junction positions | 
|  | 317 for tran in master_dict: | 
|  | 318     try: | 
|  | 319         transeq = master_dict[tran]["seq"] | 
|  | 320     except Exception as e: | 
|  | 321         print("not in fasta", tran) | 
|  | 322         notinfasta.write("{}\n".format(tran)) | 
|  | 323         continue | 
|  | 324     exon_junctions = [] | 
|  | 325     total_length = len(transeq) | 
|  | 326     three_len = 1 | 
|  | 327     five_len = 1 | 
|  | 328     strand = master_dict[tran]["strand"] | 
|  | 329     if master_dict[tran]["gene_name"] == "": | 
|  | 330         master_dict[tran]["gene_name"] = tran | 
|  | 331     gene = master_dict[tran]["gene_name"] | 
|  | 332     if gene not in coding_genes_dict: | 
|  | 333         coding_genes_dict[gene] = False | 
|  | 334 | 
|  | 335     if master_dict[tran]["cds"] == []: | 
|  | 336         tran_type = "noncoding" | 
|  | 337         cds_start = "NULL" | 
|  | 338         cds_stop = "NULL" | 
|  | 339     else: | 
|  | 340         # get utr lengths from annotation | 
|  | 341         tran_type = "coding" | 
|  | 342         coding_genes_dict[gene] = True | 
|  | 343         sorted_exons = sorted(master_dict[tran]["exon"]) | 
|  | 344         sorted_cds = sorted(master_dict[tran]["cds"]) | 
|  | 345 | 
|  | 346         min_cds = sorted_cds[0][0] | 
|  | 347         # Some annotation files do not have utr annotation types, so fix that here if thats the case | 
|  | 348         if master_dict[tran]["utr"] == []: | 
|  | 349             for exon_tup in master_dict[tran]["exon"]: | 
|  | 350                 if exon_tup not in master_dict[tran]["cds"]: | 
|  | 351                     # Now check if this overlaps with any of the CDS exons | 
|  | 352                     overlap = False | 
|  | 353                     for cds_tup in master_dict[tran]["cds"]: | 
|  | 354                         if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]: | 
|  | 355                             master_dict[tran]["utr"].append((cds_tup[1], exon_tup[1])) | 
|  | 356                             overlap = True | 
|  | 357                         if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]: | 
|  | 358                             master_dict[tran]["utr"].append((exon_tup[0], cds_tup[0])) | 
|  | 359                             overlap = True | 
|  | 360                     if overlap == False: | 
|  | 361                         master_dict[tran]["utr"].append(exon_tup) | 
|  | 362 | 
|  | 363         for tup in sorted(master_dict[tran]["utr"]): | 
|  | 364             if tup[0] < min_cds: | 
|  | 365                 five_len += (tup[1] - tup[0]) + 1 | 
|  | 366             elif tup[0] > min_cds: | 
|  | 367                 three_len += (tup[1] - tup[0]) + 1 | 
|  | 368             else: | 
|  | 369                 pass | 
|  | 370         if strand == "+": | 
|  | 371             if len(sorted_exons) > 1: | 
|  | 372                 sorted_exons[0] = ( | 
|  | 373                     sorted_exons[0][0] - pseudo_utr_len, | 
|  | 374                     sorted_exons[0][1], | 
|  | 375                 ) | 
|  | 376                 sorted_exons[-1] = ( | 
|  | 377                     sorted_exons[-1][0], | 
|  | 378                     sorted_exons[-1][1] + pseudo_utr_len, | 
|  | 379                 ) | 
|  | 380             else: | 
|  | 381                 sorted_exons[0] = ( | 
|  | 382                     sorted_exons[0][0] - pseudo_utr_len, | 
|  | 383                     sorted_exons[0][1] + pseudo_utr_len, | 
|  | 384                 ) | 
|  | 385             master_dict[tran]["exon"] = sorted_exons | 
|  | 386             cds_start = five_len + pseudo_utr_len | 
|  | 387             cds_stop = ((total_length - three_len) - pseudo_utr_len) + 4 | 
|  | 388         elif strand == "-": | 
|  | 389             if len(sorted_exons) > 1: | 
|  | 390                 sorted_exons[0] = ( | 
|  | 391                     (sorted_exons[0][0] - pseudo_utr_len), | 
|  | 392                     sorted_exons[0][1], | 
|  | 393                 ) | 
|  | 394                 sorted_exons[-1] = ( | 
|  | 395                     sorted_exons[-1][0], | 
|  | 396                     (sorted_exons[-1][1] + pseudo_utr_len), | 
|  | 397                 ) | 
|  | 398             else: | 
|  | 399                 sorted_exons[0] = ( | 
|  | 400                     (sorted_exons[0][0] - pseudo_utr_len), | 
|  | 401                     (sorted_exons[0][1] + pseudo_utr_len), | 
|  | 402                 ) | 
|  | 403             master_dict[tran]["exon"] = sorted_exons | 
|  | 404             cds_start = three_len + pseudo_utr_len | 
|  | 405             cds_stop = ((total_length - (five_len)) - pseudo_utr_len) + 4 | 
|  | 406             # if tran == "ENST00000381401": | 
|  | 407             # 	print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len) | 
|  | 408             # 	#sys.exit() | 
|  | 409         else: | 
|  | 410             print("strand is unknown: {}".format(strand)) | 
|  | 411             sys.exit() | 
|  | 412     # get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop | 
|  | 413     total_intronic = 0 | 
|  | 414     try: | 
|  | 415         if strand == "+": | 
|  | 416             tx_start = min(sorted(master_dict[tran]["exon"]))[0] | 
|  | 417             prev_end = tx_start | 
|  | 418             for tup in sorted(master_dict[tran]["exon"])[:-1]: | 
|  | 419                 total_intronic += tup[0] - prev_end | 
|  | 420                 exon_junctions.append(((tup[1]) - tx_start) - total_intronic) | 
|  | 421                 prev_end = tup[1] | 
|  | 422         elif strand == "-": | 
|  | 423             tx_start = max(sorted(master_dict[tran]["exon"]))[-1] | 
|  | 424             prev_end = tx_start | 
|  | 425             for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]: | 
|  | 426                 total_intronic += (tup[0] + 1) - prev_end | 
|  | 427                 exon_junctions.append(((tup[1]) - tx_start) - total_intronic) | 
|  | 428                 prev_end = tup[1] | 
|  | 429     except: | 
|  | 430         if strand == "+": | 
|  | 431             tx_start = min(sorted(master_dict[tran]["cds"]))[0] | 
|  | 432             prev_end = tx_start | 
|  | 433             for tup in sorted(master_dict[tran]["cds"])[:-1]: | 
|  | 434                 total_intronic += tup[0] - prev_end | 
|  | 435                 exon_junctions.append(((tup[1]) - tx_start) - total_intronic) | 
|  | 436                 prev_end = tup[1] | 
|  | 437         elif strand == "-": | 
|  | 438             tx_start = max(sorted(master_dict[tran]["cds"]))[-1] | 
|  | 439             prev_end = tx_start | 
|  | 440             for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]: | 
|  | 441                 total_intronic += (tup[0] + 1) - prev_end | 
|  | 442                 exon_junctions.append(((tup[1]) - tx_start) - total_intronic) | 
|  | 443                 prev_end = tup[1] | 
|  | 444     # This can happen when a coding transcript doesn't have a properly annotated 3' trailer | 
|  | 445     if cds_stop != "NULL": | 
|  | 446         if cds_stop > total_length: | 
|  | 447             cds_stop = total_length | 
|  | 448     if strand == "+" and cds_start != "NULL": | 
|  | 449         master_dict[tran]["cds_start"] = cds_start | 
|  | 450         master_dict[tran]["cds_stop"] = cds_stop | 
|  | 451     elif strand == "-" and cds_start != "NULL": | 
|  | 452         master_dict[tran]["cds_start"] = cds_start | 
|  | 453         master_dict[tran]["cds_stop"] = cds_stop | 
|  | 454 | 
|  | 455     master_dict[tran]["strand"] = strand | 
|  | 456     master_dict[tran]["tran_type"] = tran_type | 
|  | 457     master_dict[tran]["exon_junctions"] = exon_junctions | 
|  | 458 | 
|  | 459 longest_tran_dict = {} | 
|  | 460 for tran in master_dict: | 
|  | 461     try: | 
|  | 462         gene = master_dict[tran]["gene_name"] | 
|  | 463     except: | 
|  | 464         continue | 
|  | 465     if coding_genes_dict[gene] == True: | 
|  | 466         if "cds_start" in master_dict[tran]: | 
|  | 467             if ( | 
|  | 468                 master_dict[tran]["cds_stop"] != "NULL" | 
|  | 469                 and master_dict[tran]["cds_start"] != "NULL" | 
|  | 470             ): | 
|  | 471                 cds_length = ( | 
|  | 472                     master_dict[tran]["cds_stop"] - master_dict[tran]["cds_start"] | 
|  | 473                 ) | 
|  | 474                 if gene not in longest_tran_dict: | 
|  | 475                     longest_tran_dict[gene] = {"tran": tran, "length": cds_length} | 
|  | 476                 else: | 
|  | 477                     if cds_length > longest_tran_dict[gene]["length"]: | 
|  | 478                         longest_tran_dict[gene] = {"tran": tran, "length": cds_length} | 
|  | 479                     if cds_length == longest_tran_dict[gene]["length"]: | 
|  | 480                         if ( | 
|  | 481                             master_dict[tran]["length"] | 
|  | 482                             > master_dict[longest_tran_dict[gene]["tran"]]["length"] | 
|  | 483                         ): | 
|  | 484                             longest_tran_dict[gene] = { | 
|  | 485                                 "tran": tran, | 
|  | 486                                 "length": cds_length, | 
|  | 487                             } | 
|  | 488     else: | 
|  | 489         length = master_dict[tran]["length"] | 
|  | 490         if gene not in longest_tran_dict: | 
|  | 491             longest_tran_dict[gene] = {"tran": tran, "length": length} | 
|  | 492         elif length > longest_tran_dict[gene]["length"]: | 
|  | 493             longest_tran_dict[gene] = {"tran": tran, "length": length} | 
|  | 494 | 
|  | 495 | 
|  | 496 for gene in longest_tran_dict: | 
|  | 497     longest_tran = longest_tran_dict[gene]["tran"] | 
|  | 498     master_dict[longest_tran]["principal"] = 1 | 
|  | 499 | 
|  | 500 gene_sample = [] | 
|  | 501 for key in list(master_dict)[:10]: | 
|  | 502     try: | 
|  | 503         gene_sample.append(master_dict[key]["gene_name"]) | 
|  | 504     except: | 
|  | 505         pass | 
|  | 506 print(master_dict) | 
|  | 507 print("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10])) | 
|  | 508 print("Here is a sample of the gene names: {}".format(gene_sample)) | 
|  | 509 | 
|  | 510 | 
|  | 511 # Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time. | 
|  | 512 def tran_to_genome(tran, start_pos, end_pos, master_dict): | 
|  | 513     pos_list = [] | 
|  | 514     for i in range(start_pos, end_pos + 1): | 
|  | 515         pos_list.append(i) | 
|  | 516     genomic_pos_list = [] | 
|  | 517     if tran in master_dict: | 
|  | 518         transcript_info = master_dict[tran] | 
|  | 519     else: | 
|  | 520         return ("Null", []) | 
|  | 521 | 
|  | 522     chrom = transcript_info["chrom"] | 
|  | 523     strand = transcript_info["strand"] | 
|  | 524     exons = sorted(transcript_info["exon"]) | 
|  | 525     # print ("chrom,strand,exons",chrom,strand,exons) | 
|  | 526     for pos in pos_list: | 
|  | 527         # print ("pos",pos) | 
|  | 528         if strand == "+": | 
|  | 529             exon_start = 0 | 
|  | 530             for tup in exons: | 
|  | 531                 # print ("tup",tup) | 
|  | 532                 exon_start = tup[0] | 
|  | 533                 exonlen = tup[1] - tup[0] | 
|  | 534                 if pos > exonlen: | 
|  | 535                     pos = (pos - exonlen) - 1 | 
|  | 536                 else: | 
|  | 537                     break | 
|  | 538             # print ("appending exon_start-pos", exon_start, pos, exon_start+pos) | 
|  | 539             genomic_pos_list.append((exon_start + pos) - 1) | 
|  | 540         elif strand == "-": | 
|  | 541             exon_start = 0 | 
|  | 542             for tup in exons[::-1]: | 
|  | 543                 # print ("tup",tup) | 
|  | 544                 exon_start = tup[1] | 
|  | 545                 exonlen = tup[1] - tup[0] | 
|  | 546                 # print ("exonlen",exonlen) | 
|  | 547                 if pos > exonlen: | 
|  | 548                     # print ("pos is greater") | 
|  | 549                     pos = (pos - exonlen) - 1 | 
|  | 550                     # print ("new pos",pos) | 
|  | 551                 else: | 
|  | 552                     break | 
|  | 553             # print ("appending exon_start-pos", exon_start, pos, exon_start-pos) | 
|  | 554             genomic_pos_list.append((exon_start - pos) + 1) | 
|  | 555     return (chrom, genomic_pos_list) | 
|  | 556 | 
|  | 557 | 
|  | 558 orf_dict = { | 
|  | 559     "uorf": {}, | 
|  | 560     "ouorf": {}, | 
|  | 561     "cds": {}, | 
|  | 562     "nested": {}, | 
|  | 563     "odorf": {}, | 
|  | 564     "dorf": {}, | 
|  | 565     "minusone": {}, | 
|  | 566     "readthrough": {}, | 
|  | 567     "plusone": {}, | 
|  | 568     "noncoding": {}, | 
|  | 569     "extension": {}, | 
|  | 570     "inframe_stop": {}, | 
|  | 571 } | 
|  | 572 | 
|  | 573 start_codons = ["ATG", "CTG", "GTG", "TTG", "ATC", "ATA", "ATT", "ACG", "AAG", "AGG"] | 
|  | 574 | 
|  | 575 stop_codons = ["TAG", "TAA", "TGA"] | 
|  | 576 | 
|  | 577 | 
|  | 578 # Keep track of the longest transcript for each noncoding gene, append this to transcript list later | 
|  | 579 longest_noncoding = {} | 
|  | 580 | 
|  | 581 | 
|  | 582 tran_count = 0 | 
|  | 583 # This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those | 
|  | 584 # in the transcript list) | 
|  | 585 genomic_cds_dict = {} | 
|  | 586 tree_dict = {} | 
|  | 587 for transcript in master_dict: | 
|  | 588     # print (transcript, master_dict[transcript]["tran_type"]) | 
|  | 589     tran_count += 1 | 
|  | 590     if "seq" not in master_dict[transcript]: | 
|  | 591         continue | 
|  | 592     chrom = master_dict[transcript]["chrom"] | 
|  | 593     if chrom not in genomic_cds_dict: | 
|  | 594         genomic_cds_dict[chrom] = [] | 
|  | 595     if "cds_start" in master_dict[transcript]: | 
|  | 596         cds_start = master_dict[transcript]["cds_start"] | 
|  | 597         cds_stop = master_dict[transcript]["cds_stop"] | 
|  | 598         if cds_start != "NULL": | 
|  | 599             cds_pos = [] | 
|  | 600             for i in range(cds_start, cds_stop + 1): | 
|  | 601                 cds_pos.append(i) | 
|  | 602 | 
|  | 603             for tup in master_dict[transcript]["cds"]: | 
|  | 604                 if tup[0] != tup[1]: | 
|  | 605                     if tup not in genomic_cds_dict[chrom]: | 
|  | 606                         genomic_cds_dict[chrom].append(tup) | 
|  | 607 | 
|  | 608 print("genomic cds dict built") | 
|  | 609 print(list(genomic_cds_dict)) | 
|  | 610 for chrom in genomic_cds_dict: | 
|  | 611     tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom]) | 
|  | 612 | 
|  | 613 # print (list(tree_dict)) | 
|  | 614 | 
|  | 615 | 
| 3 | 616 connection = sqlite3.connect(output) | 
| 0 | 617 cursor = connection.cursor() | 
|  | 618 cursor.execute( | 
|  | 619     "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));" | 
|  | 620 ) | 
|  | 621 cursor.execute( | 
|  | 622     "CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));" | 
|  | 623 ) | 
|  | 624 cursor.execute( | 
|  | 625     "CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));" | 
|  | 626 ) | 
|  | 627 cursor.execute( | 
|  | 628     "CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 629 ) | 
|  | 630 cursor.execute( | 
|  | 631     "CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 632 ) | 
|  | 633 cursor.execute( | 
|  | 634     "CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 635 ) | 
|  | 636 cursor.execute( | 
|  | 637     "CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 638 ) | 
|  | 639 cursor.execute( | 
|  | 640     "CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 641 ) | 
|  | 642 cursor.execute( | 
|  | 643     "CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 644 ) | 
|  | 645 cursor.execute( | 
|  | 646     "CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 647 ) | 
|  | 648 cursor.execute( | 
|  | 649     "CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 650 ) | 
|  | 651 cursor.execute( | 
|  | 652     "CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 653 ) | 
|  | 654 cursor.execute( | 
|  | 655     "CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 656 ) | 
|  | 657 cursor.execute( | 
|  | 658     "CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 659 ) | 
|  | 660 cursor.execute( | 
|  | 661     "CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" | 
|  | 662 ) | 
|  | 663 connection.commit() | 
|  | 664 | 
|  | 665 | 
|  | 666 print("Finding ORFs") | 
|  | 667 transcript_count = 0 | 
|  | 668 total_transcripts = len(list(master_dict)) | 
|  | 669 for transcript in master_dict: | 
|  | 670     # print ("transcript",transcript) | 
|  | 671     # if transcript != "ENST00000316448": | 
|  | 672     # 	continue | 
|  | 673     transcript_count += 1 | 
|  | 674     if transcript_count % 100 == 0: | 
|  | 675         print("Transcripts complete: {}/{}".format(transcript_count, total_transcripts)) | 
|  | 676     if "seq" not in master_dict[transcript]: | 
|  | 677         print("transcript {} has no sequence".format(transcript)) | 
|  | 678         continue | 
|  | 679     seq = master_dict[transcript]["seq"] | 
|  | 680     cds_start = "NULL" | 
|  | 681     cds_stop = "NULL" | 
|  | 682     transcript_len = len(seq) | 
|  | 683     if "cds_start" in master_dict[transcript]: | 
|  | 684         cds_start = master_dict[transcript]["cds_start"] | 
|  | 685         cds_stop = master_dict[transcript]["cds_stop"] | 
|  | 686 | 
|  | 687     # Find out what regions of this transcript overlap with any other coding regions | 
|  | 688     coding_positions = [] | 
|  | 689     if cds_start != "NULL": | 
|  | 690         # If this is a coding transcript don't bother checking the CDS | 
|  | 691         for i in range(cds_start, cds_stop): | 
|  | 692             coding_positions.append(i) | 
|  | 693         # check 5' leader | 
|  | 694         chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict) | 
|  | 695         for i in range(0, cds_start): | 
|  | 696             genomic_pos = pos_list[i] | 
|  | 697             overlap = tree_dict[chrom][genomic_pos] | 
|  | 698             if len(overlap) != 0: | 
|  | 699                 coding_positions.append(i) | 
|  | 700         # check 3' trailer | 
|  | 701         chrom, pos_list = tran_to_genome( | 
|  | 702             transcript, cds_stop, transcript_len, master_dict | 
|  | 703         ) | 
|  | 704         for i in range(cds_stop, transcript_len + 1): | 
|  | 705             # print ("i",i) | 
|  | 706             genomic_pos = pos_list[i - cds_stop] | 
|  | 707             # print ("genomic position",genomic_pos) | 
|  | 708             overlap = tree_dict[chrom][genomic_pos] | 
|  | 709             if len(overlap) != 0: | 
|  | 710                 # print ("overlap not empty appending i",overlap) | 
|  | 711                 coding_positions.append(i) | 
|  | 712     else: | 
|  | 713         # check entire transcript | 
|  | 714         chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict) | 
|  | 715         for i in range(0, transcript_len): | 
|  | 716             genomic_pos = pos_list[i] | 
|  | 717             overlap = tree_dict[chrom][genomic_pos] | 
|  | 718             if len(overlap) != 0: | 
|  | 719                 coding_positions.append(i) | 
|  | 720     coding_positions_tuple = to_ranges(coding_positions) | 
|  | 721     coding_dict[transcript] = coding_positions_tuple | 
|  | 722     coding_positions = set(coding_positions) | 
|  | 723     # if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates | 
|  | 724     if cds_start != "NULL": | 
|  | 725         # print ("transcript", transcript) | 
|  | 726         # if pseudo_utr_len != 0: | 
|  | 727         # 	cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null | 
|  | 728         recoding_dict = {2: "minusone", 0: "readthrough", 1: "plusone"} | 
|  | 729         for addition in recoding_dict: | 
|  | 730             orftype = recoding_dict[addition] | 
|  | 731             for i in range(cds_stop + addition, transcript_len, 3): | 
|  | 732                 if seq[i : i + 3] in stop_codons: | 
|  | 733                     # orf_seq = seq[cds_stop:i+3] | 
|  | 734                     orf_start = cds_stop | 
|  | 735                     if orftype == "readthrough": | 
|  | 736                         orf_start -= 2 | 
|  | 737                     if orftype == "plusone": | 
|  | 738                         orf_start -= 1 | 
|  | 739                     orf_stop = i + 3  # +2 so it refers to the end of the stop codon | 
|  | 740                     start_codon = None | 
|  | 741                     length = (i + 3) - cds_stop | 
|  | 742                     orf_pos_list = [] | 
|  | 743                     # determine how many nucleotides in this orf overlap with an annotated coding region | 
|  | 744                     cds_cov_count = 0.0 | 
|  | 745                     for position in range(orf_start, orf_stop): | 
|  | 746                         orf_pos_list.append(position) | 
|  | 747                     for pos in range(orf_start, orf_stop + 1): | 
|  | 748                         if pos in coding_positions: | 
|  | 749                             cds_cov_count += 1 | 
|  | 750                     cds_cov = cds_cov_count / length | 
|  | 751                     # print ("orftype, start, stop", orftype, orf_start, orf_stop) | 
|  | 752                     cursor.execute( | 
|  | 753                         "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format( | 
|  | 754                             orftype, | 
|  | 755                             transcript, | 
|  | 756                             start_codon, | 
|  | 757                             length, | 
|  | 758                             orf_start, | 
|  | 759                             orf_stop, | 
|  | 760                             cds_cov, | 
|  | 761                         ) | 
|  | 762                     ) | 
|  | 763                     break | 
|  | 764         # sys.exit() | 
|  | 765     for frame in [0, 1, 2]: | 
|  | 766         for i in range(frame, transcript_len, 3): | 
|  | 767             if seq[i : i + 3] in start_codons: | 
|  | 768                 for x in range(i, transcript_len, 3): | 
|  | 769                     if seq[x : x + 3] in stop_codons: | 
|  | 770                         # orf_seq = seq[i:x+3] | 
|  | 771                         orf_start = i + 1 | 
|  | 772                         orf_stop = x + 3  # +2 so it refers to the end of the stop codon | 
|  | 773                         start_codon = seq[i : i + 3] | 
|  | 774                         length = (x + 3) - i | 
|  | 775                         orf_pos_list = [] | 
|  | 776                         # determine how many nucleotides in this orf overlap with an annotated coding region | 
|  | 777                         cds_cov_count = 0.0 | 
|  | 778                         for pos in range(orf_start, orf_stop + 1): | 
|  | 779                             if pos in coding_positions: | 
|  | 780                                 cds_cov_count += 1 | 
|  | 781                         cds_cov = float(cds_cov_count) / float(length) | 
|  | 782                         # Now determine orf type | 
|  | 783                         if cds_start == "NULL": | 
|  | 784                             orftype = "noncoding" | 
|  | 785                         else: | 
|  | 786                             # print ("cds start is not null :{}:{}".format(cds_start,cds_stop)) | 
|  | 787                             # print ("orf start, orf stop", orf_start, orf_stop) | 
|  | 788                             if orf_start == cds_start and orf_stop == cds_stop: | 
|  | 789                                 orftype = "cds" | 
|  | 790                                 # print ("orf type is cds") | 
|  | 791                             elif orf_start < cds_start and orf_stop == cds_stop: | 
|  | 792                                 orftype = "extension" | 
|  | 793                                 # special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage | 
|  | 794                                 orf_stop = cds_start | 
|  | 795                                 cds_cov_count = 0.0 | 
|  | 796                                 for pos in range(orf_start, orf_stop + 1): | 
|  | 797                                     if pos in coding_positions: | 
|  | 798                                         cds_cov_count += 1 | 
|  | 799                                 cds_cov = float(cds_cov_count) / float(length) | 
|  | 800                                 # orf_seq = seq[orf_start:cds_start] | 
|  | 801                                 length = cds_start - orf_start | 
|  | 802                                 # print ("orf type is extension") | 
|  | 803                             elif orf_start < cds_start and orf_stop <= cds_start: | 
|  | 804                                 orftype = "uorf" | 
|  | 805                                 # print ("orf type is uorf") | 
|  | 806                             elif orf_start < cds_start and orf_stop > cds_start: | 
|  | 807                                 orftype = "ouorf" | 
|  | 808                                 # print ("orf type is ouorf") | 
|  | 809                                 # sys.exit() | 
|  | 810                             elif ( | 
|  | 811                                 orf_start >= cds_start | 
|  | 812                                 and orf_start <= cds_stop | 
|  | 813                                 and orf_stop <= cds_stop | 
|  | 814                             ): | 
|  | 815                                 if orf_stop == cds_stop: | 
|  | 816                                     break | 
|  | 817                                 # print ("Tran, cds_start, cds_stop, orf_start, orf_stop, tranlen",tran, cds_start, cds_stop, orf_start, orf_stop,transcript_len) | 
|  | 818                                 if ( | 
|  | 819                                     orf_stop < transcript_len | 
|  | 820                                     and orf_stop % 3 == cds_stop % 3 | 
|  | 821                                 ) or ( | 
|  | 822                                     cds_start != 1 | 
|  | 823                                     and orf_stop % 3 == (cds_start + 2) % 3 | 
|  | 824                                 ): | 
|  | 825                                     # print ("Transcript {} has an inframe stop codon".format(transcript)) | 
|  | 826                                     break | 
|  | 827                                 orftype = "nested" | 
|  | 828                                 # print ("orf type is nested") | 
|  | 829                             elif ( | 
|  | 830                                 orf_start >= cds_start | 
|  | 831                                 and orf_start <= cds_stop | 
|  | 832                                 and orf_stop > cds_stop | 
|  | 833                             ): | 
|  | 834                                 orftype = "odorf" | 
|  | 835                                 # print ("orftype is odorf") | 
|  | 836                             elif orf_start > cds_stop and orf_stop > cds_stop: | 
|  | 837                                 orftype = "dorf" | 
|  | 838                                 # print ("orftype is dorf") | 
|  | 839                             # if orf_stop > cds_start and orf_stop < cds_stop: | 
|  | 840                             # 	if (orf_stop+1)%3 == cds_start%3: | 
|  | 841                             # 		orftype = "inframe_stop" | 
|  | 842                             # 		print ("inframe stop, transcript, orf_stop", transcript, orf_stop) | 
|  | 843                             # 		sys.exit() | 
|  | 844                             # 		if transcript not in orf_dict: | 
|  | 845                             # 			orf_dict[orftype][transcript] = [] | 
|  | 846                             # 	#print ("some weird stop or something") | 
|  | 847                         cursor.execute( | 
|  | 848                             "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format( | 
|  | 849                                 orftype, | 
|  | 850                                 transcript, | 
|  | 851                                 start_codon, | 
|  | 852                                 length, | 
|  | 853                                 orf_start, | 
|  | 854                                 orf_stop, | 
|  | 855                                 cds_cov, | 
|  | 856                             ) | 
|  | 857                         ) | 
|  | 858                         break | 
|  | 859 # Used to keep track of the codons at cds_start and cds_stop positions, | 
|  | 860 # If there is an issue with the cds co-ordinates the starts and stops counts will | 
|  | 861 # be much lower than the other count, start with 1 to prevent division by 0 | 
|  | 862 nuc_dict = {"stops": {"stops": 1, "other": 0}, "starts": {"starts": 1, "other": 0}} | 
|  | 863 | 
|  | 864 | 
|  | 865 def calcgc(seq): | 
|  | 866     if seq == "": | 
|  | 867         return "NULL" | 
|  | 868     g_count = 0 | 
|  | 869     c_count = 0 | 
|  | 870     a_count = 0 | 
|  | 871     t_count = 0 | 
|  | 872     for char in seq: | 
|  | 873         if char == "A": | 
|  | 874             a_count += 1 | 
|  | 875         if char == "T": | 
|  | 876             t_count += 1 | 
|  | 877         if char == "G": | 
|  | 878             g_count += 1 | 
|  | 879         if char == "C": | 
|  | 880             c_count += 1 | 
|  | 881         gc = ((g_count + c_count) / float(len(seq))) * 100 | 
|  | 882     return round(gc, 2) | 
|  | 883 | 
|  | 884 | 
|  | 885 for transcript in master_dict: | 
|  | 886     # print ("transcripts", transcript) | 
|  | 887     length = master_dict[transcript]["length"] | 
|  | 888     cds_start = master_dict[transcript]["cds_start"] | 
|  | 889     cds_stop = master_dict[transcript]["cds_stop"] | 
|  | 890     seq = master_dict[transcript]["seq"] | 
|  | 891     strand = master_dict[transcript]["strand"] | 
|  | 892     chrom = master_dict[transcript]["chrom"] | 
|  | 893     gene = master_dict[transcript]["gene_name"] | 
|  | 894     gc = calcgc(seq) | 
|  | 895     five_gc = "NULL" | 
|  | 896     cds_gc = "NULL" | 
|  | 897     three_gc = "NULL" | 
|  | 898     if cds_start != "NULL": | 
|  | 899         five_gc = calcgc(seq[:cds_start]) | 
|  | 900         cds_gc = calcgc(seq[cds_start:cds_stop]) | 
|  | 901         three_gc = calcgc(seq[cds_stop:]) | 
|  | 902         # check that the nucleotide cds_start points to is the first of the start codon | 
|  | 903         # take one becase cds_start is 1 based but python indexing is 0 based | 
|  | 904         start_nuc = seq[cds_start - 1 : cds_start + 2] | 
|  | 905         # print ("start nuc",start_nuc) | 
|  | 906         if start_nuc == "ATG": | 
|  | 907             nuc_dict["starts"]["starts"] += 1 | 
|  | 908         else: | 
|  | 909             nuc_dict["starts"]["other"] += 1 | 
|  | 910         stop_nuc = seq[cds_stop - 3 : cds_stop] | 
|  | 911         # print ("stop_nuc",stop_nuc) | 
|  | 912         if stop_nuc in ["TAG", "TAA", "TGA"]: | 
|  | 913             nuc_dict["stops"]["stops"] += 1 | 
|  | 914         else: | 
|  | 915             nuc_dict["stops"]["other"] += 1 | 
|  | 916     tran_type = master_dict[transcript]["tran_type"] | 
|  | 917     if coding_genes_dict[gene] == True: | 
|  | 918         gene_type = 1 | 
|  | 919     else: | 
|  | 920         gene_type = 0 | 
|  | 921     # print ("tran type before",tran_type) | 
|  | 922     if tran_type == "coding": | 
|  | 923         tran_type = 1 | 
|  | 924     else: | 
|  | 925         tran_type = 0 | 
|  | 926     # print ("tran type after",tran_type) | 
|  | 927     start_list = str(master_dict[transcript]["start_list"]).replace(" ", "").strip("[]") | 
|  | 928     stop_list = str(master_dict[transcript]["stop_list"]).replace(" ", "").strip("[]") | 
|  | 929     exon_junctions = ( | 
|  | 930         str(master_dict[transcript]["exon_junctions"]).replace(" ", "").strip("[]") | 
|  | 931     ) | 
|  | 932     principal = master_dict[transcript]["principal"] | 
|  | 933     version = master_dict[transcript]["version"] | 
|  | 934     # print (master_dict[transcript]) | 
|  | 935     # print (tran_type) | 
|  | 936     # print (gene_type) | 
|  | 937     # print (principal) | 
|  | 938     # print (version) | 
|  | 939     # print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version)) | 
|  | 940     cursor.execute( | 
|  | 941         "INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format( | 
|  | 942             transcript, | 
|  | 943             gene, | 
|  | 944             length, | 
|  | 945             cds_start, | 
|  | 946             cds_stop, | 
|  | 947             seq, | 
|  | 948             strand, | 
|  | 949             stop_list, | 
|  | 950             start_list, | 
|  | 951             exon_junctions, | 
|  | 952             tran_type, | 
|  | 953             gene_type, | 
|  | 954             principal, | 
|  | 955             version, | 
|  | 956             gc, | 
|  | 957             five_gc, | 
|  | 958             cds_gc, | 
|  | 959             three_gc, | 
|  | 960             chrom, | 
|  | 961         ) | 
|  | 962     ) | 
|  | 963 | 
|  | 964     for tup in master_dict[transcript]["exon"]: | 
|  | 965         cursor.execute( | 
|  | 966             "INSERT INTO exons VALUES('{}',{},{});".format(transcript, tup[0], tup[1]) | 
|  | 967         ) | 
|  | 968     if transcript in coding_dict: | 
|  | 969         for tup in coding_dict[transcript]: | 
|  | 970             cursor.execute( | 
|  | 971                 "INSERT INTO coding_regions VALUES('{}',{},{});".format( | 
|  | 972                     transcript, tup[0], tup[1] | 
|  | 973                 ) | 
|  | 974             ) | 
| 3 | 975 # print(cursor.execute( | 
|  | 976 #     ".tables" | 
|  | 977 #     )) | 
| 0 | 978 | 
|  | 979 print("delim", delimiters) | 
|  | 980 if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05: | 
|  | 981     print( | 
|  | 982         "Warning: {} transcripts do not have a an AUG at the CDS start position".format( | 
|  | 983             nuc_dict["starts"]["other"] | 
|  | 984         ) | 
|  | 985     ) | 
|  | 986 if (nuc_dict["stops"]["other"] / nuc_dict["stops"]["stops"]) > 0.05: | 
|  | 987     print( | 
|  | 988         "Warning: {} transcripts do not have a an stop codon at the CDS stop position".format( | 
|  | 989             nuc_dict["stops"]["other"] | 
|  | 990         ) | 
|  | 991     ) | 
|  | 992 if len(notinannotation) > 0: | 
|  | 993     print( | 
|  | 994         "Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format( | 
|  | 995             len(notinannotation) | 
|  | 996         ) | 
|  | 997     ) | 
| 3 | 998 | 
|  | 999 | 
|  | 1000 | 
|  | 1001 connection.commit() | 
|  | 1002 connection.close() | 
|  | 1003 | 
|  | 1004 |