annotate trips_bam_to_sqlite/bam_to_sqlite.py @ 30:a511e084e4e7 draft

Uploaded
author jackcurragh
date Thu, 03 Nov 2022 13:22:00 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
30
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
1 import sys
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
2 import pysam
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
3 import operator
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
4 import os
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
5 import time
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
6 import sqlite3
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
7 from sqlitedict import SqliteDict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
8
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
def tran_to_genome(tran, pos, transcriptome_info_dict):
    """Map a position on a transcript to a ``(chrom, genomic_pos)`` tuple.

    Args:
        tran: Key of the transcript in ``transcriptome_info_dict``.
        pos: Position along the transcript (integer).
            NOTE(review): the +/-1 arithmetic is kept exactly as it was;
            confirm which coordinate base (0 or 1) callers pass in.
        transcriptome_info_dict: Mapping of transcript id to a dict that has
            at least the keys ``"chrom"``, ``"strand"`` and ``"exons"``, where
            ``"exons"`` is an iterable of ``(start, stop)`` pairs.

    Returns:
        A tuple ``(chrom, genomic_pos)``.

    Raises:
        KeyError: If the transcript or one of the required keys is missing.
        ValueError: If the strand is neither ``"+"`` nor ``"-"``. Previously
            this case failed with an UnboundLocalError.
    """
    traninfo = transcriptome_info_dict[tran]
    chrom = traninfo["chrom"]
    strand = traninfo["strand"]
    exons = sorted(traninfo["exons"])

    if strand == "+":
        # Walk exons in ascending order, anchoring on each exon's start.
        ordered_exons = exons
        anchor_index = 0
        direction = 1
    elif strand == "-":
        # Walk exons in descending order, anchoring on each exon's stop.
        ordered_exons = exons[::-1]
        anchor_index = 1
        direction = -1
    else:
        raise ValueError(
            "Unsupported strand {!r} for transcript {!r}".format(strand, tran)
        )

    # With no exons the anchor stays at 0, as in the original implementation.
    anchor = 0
    for exon in ordered_exons:
        anchor = exon[anchor_index]
        exonlen = exon[1] - exon[0]
        if pos <= exonlen:
            break
        # The position lies beyond this exon: consume the exon and move on.
        pos = (pos - exonlen) - 1

    # "+" gives (anchor + pos) - 1, "-" gives (anchor - pos) + 1.
    return (chrom, anchor + direction * (pos - 1))
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
37
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
38
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
39 # Takes a dictionary with a read name as key and a list of lists as value; each sub-list consists of a transcript, the position the read aligns to in that transcript, and the read's tags
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
40 # This function works out how many distinct genomic positions the alignments correspond to; if there is exactly one the read is counted as unambiguous, otherwise as ambiguous, and the relevant counts are added to master_read_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
def _clean_tran_name(raw_name):
    """Return a transcript name with '-' turned into '_' and parentheses removed."""
    return raw_name.replace("-", "_").replace("(", "").replace(")", "")


def _tag_value(read_tags, tag_name, default):
    """Return the value of the last (name, value) tag called tag_name, else default."""
    value = default
    for tag in read_tags:
        if tag[0] == tag_name:
            value = tag[1]
    return value


def _bump_position_count(counts_by_readlen, readlen, pos):
    """Add one to counts_by_readlen[readlen][pos], creating levels as needed."""
    per_position = counts_by_readlen.setdefault(readlen, {})
    per_position[pos] = per_position.get(pos, 0) + 1


def processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,readseq, unambig_read_length_dict):
    """Record every alignment of one read in the running count dictionaries.

    Each alignment is converted to a genomic position with ``tran_to_genome``.
    If all alignments share a single genomic position the read is treated as
    unambiguous, otherwise as ambiguous.

    Args:
        process_chunk: Dict whose first key is the read name and whose value
            is a list of ``[transcript, position, read_tags]`` entries, where
            ``read_tags`` is an iterable of ``(tag_name, value)`` pairs.
        master_read_dict: Updated in place. Per transcript it holds the
            sub-dicts ``"ambig"``, ``"unambig"``, ``"mismatches"`` and
            ``"seq"``. Counts are stored as ``[readlen][pos]``; ``"seq"`` is
            stored as ``[transcript_pos][read_base]``.
        transcriptome_info_dict: Transcript annotation. It is passed to
            ``tran_to_genome`` and is also read for ``"cds_start"`` and
            ``"cds_stop"``.
        master_dict: Updated in place. ``"unambiguous_coding_count"`` or
            ``"unambiguous_non_coding_count"`` is incremented once for an
            unambiguous read.
        readseq: Sequence of the read. Its length is used as the read length.
        unambig_read_length_dict: Updated in place. Maps read length to the
            number of unambiguous reads seen.

    Returns:
        1 if the read was ambiguous, 0 otherwise.
    """
    readlen = len(readseq)
    # The chunk is keyed by a single read name.
    read = next(iter(process_chunk))

    # Normalise each alignment once instead of once per pass.
    alignments = [
        (_clean_tran_name(entry[0]), int(entry[1]), entry)
        for entry in process_chunk[read]
    ]

    # Distinct genomic positions this read aligns to.
    genomic_positions = {
        tran_to_genome(tran, pos, transcriptome_info_dict)
        for tran, pos, _ in alignments
    }

    if len(genomic_positions) != 1:
        # Ambiguous read: only per-transcript position counts are kept.
        # Mismatches are counted for unambiguous reads only, so the MD tag
        # is not parsed here (its result used to be computed and discarded).
        for tran, pos, _ in alignments:
            tran_counts = master_read_dict.setdefault(
                tran, {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}})
            _bump_position_count(tran_counts["ambig"], readlen, pos)
        return 1

    unambig_read_length_dict[readlen] = unambig_read_length_dict.get(readlen, 0) + 1

    # Assume non-coding until one alignment falls inside a CDS.
    coding = False
    for tran, pos, entry in alignments:
        tran_counts = master_read_dict.setdefault(
            tran, {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}})
        read_tags = entry[2]

        if int(_tag_value(read_tags, "NM", 0)) > 0:
            md_tag = _tag_value(read_tags, "MD", "")
            # The position and read-length modifiers are not applied.
            _, _, mismatches = get_mismatch_pos(
                md_tag, pos, readlen, master_read_dict, tran, readseq)
            for offset, base in mismatches.items():
                # Ignore mismatches at the first position of the read
                # (due to non-templated addition).
                if offset == 0:
                    continue
                base_counts = tran_counts["seq"].setdefault(pos + offset, {})
                base_counts[base] = base_counts.get(base, 0) + 1

        try:
            cds_start = transcriptome_info_dict[tran]["cds_start"]
            cds_stop = transcriptome_info_dict[tran]["cds_stop"]
            if cds_start <= pos <= cds_stop:
                coding = True
        except (KeyError, TypeError):
            # Missing or non-comparable CDS bounds (for example None) leave
            # the alignment as non-coding, as before. Other errors propagate.
            pass

        _bump_position_count(tran_counts["unambig"], readlen, pos)

    # NOTE(review): counted once per read, following the intent of the
    # "assume this read aligns to a noncoding position" comment. The original
    # indentation is not recoverable from this copy of the file, so confirm.
    if coding:
        master_dict["unambiguous_coding_count"] += 1
    else:
        master_dict["unambiguous_non_coding_count"] += 1

    return 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
161
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
162
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
def get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq):
    """Find mismatched read bases from a SAM ``MD`` tag.

    The MD string is a sequence of match-run lengths, mismatched reference
    bases, and deletions introduced by ``^``.

    Fixes over the previous parser:
      * Match runs of any number of digits are supported. Previously only
        one- and two-digit runs were parsed correctly.
      * Reference bases that follow ``^`` are deleted from the read. They are
        no longer reported as mismatches and do not consume read positions.
      * A reference ``N`` (or any base other than A/T/G/C) is still not
        reported as a mismatch, but it now consumes one read position so
        later offsets stay aligned.

    NOTE(review): offsets come from the MD string alone. MD does not describe
    inserted or soft-clipped read bases, so offsets may be shifted for such
    alignments. Confirm against the aligner settings in use.

    Args:
        md_tag: Value of the MD tag. An empty string gives no mismatches.
        pos: Unused; kept for interface compatibility.
        readlen: Read length. Bounds the scan for leading and trailing
            mismatches.
        master_read_dict: Unused; kept for interface compatibility.
        tran: Unused; kept for interface compatibility.
        readseq: Read sequence, indexed to fetch the mismatched read base.

    Returns:
        A tuple ``(pos_modifier, readlen_modifier, mismatches)``:
        ``pos_modifier`` is the number of consecutive mismatches at the start
        of the read. ``readlen_modifier`` is that number plus the number of
        consecutive mismatches at the end of the read (index 0 is not
        examined from the end). ``mismatches`` maps a 0-based read offset to
        the read base at that offset.

    Raises:
        IndexError: If the MD tag points past the end of ``readseq``.
    """
    counted_bases = {"A", "T", "G", "C"}
    mismatches = {}
    read_offset = 0      # index in readseq of the next base MD describes
    run_digits = ""      # digits of the match run being accumulated
    in_deletion = False  # True while reading the bases that follow "^"

    for char in md_tag:
        if char.isdigit():
            run_digits += char
            in_deletion = False  # a number always terminates a deletion
            continue
        if run_digits:
            read_offset += int(run_digits)
            run_digits = ""
        if char == "^":
            in_deletion = True
        elif in_deletion:
            # Deleted reference base: it is not present in the read.
            continue
        else:
            if char in counted_bases:
                mismatches[read_offset] = readseq[read_offset]
            read_offset += 1

    # Consecutive mismatches at the 5' end of the read.
    leading = 0
    for index in range(readlen):
        if index not in mismatches:
            break
        leading += 1

    # Consecutive mismatches at the 3' end of the read (index 0 excluded).
    trailing = 0
    for index in range(readlen - 1, 0, -1):
        if index not in mismatches:
            break
        trailing += 1

    pos_modifier = leading
    readlen_modifier = leading + trailing
    return (pos_modifier, readlen_modifier, mismatches)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
207
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
208
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
209
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
210 def process_bam(bam_filepath, transcriptome_info_dict_path,outputfile):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
211 desc = "NULL"
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
212 start_time = time.time()
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
213 study_dict ={}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
214 nuc_count_dict = {"mapped":{},"unmapped":{}}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
215 dinuc_count_dict = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
216 threeprime_nuc_count_dict = {"mapped":{},"unmapped":{}}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
217 read_length_dict = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
218 unambig_read_length_dict = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
219 unmapped_dict = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
220 master_dict = {"unambiguous_non_coding_count":0,"unambiguous_coding_count":0,"current_dir":os.getcwd()}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
221
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
222 transcriptome_info_dict = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
223 connection = sqlite3.connect(transcriptome_info_dict_path)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
224 cursor = connection.cursor()
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
225 cursor.execute("SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
226 result = cursor.fetchall()
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
227 for row in result:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
228 transcriptome_info_dict[str(row[0])] = {"cds_start":row[1],"cds_stop":row[2],"length":row[3],"strand":row[4],"chrom":row[5],"exons":[],"tran_type":row[6]}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
229 #print list(transcriptome_info_dict)[:10]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
230
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
231 cursor.execute("SELECT * from exons;")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
232 result = cursor.fetchall()
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
233 for row in result:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
234 transcriptome_info_dict[str(row[0])]["exons"].append((row[1],row[2]))
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
235
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
236 #it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
237 multimappers = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
238 unmapped_reads = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
239 unambiguous_coding_count = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
240 unambiguous_non_coding_count = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
241 trip_periodicity_reads = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
242
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
243 final_offsets = {"fiveprime":{"offsets":{}, "read_scores":{}}, "threeprime":{"offsets":{}, "read_scores":{}}}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
244 master_read_dict = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
245 prev_seq = ""
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
246 process_chunk = {"read_name":[["placeholder_tran","1","28"]]}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
247 mapped_reads = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
248 ambiguously_mapped_reads = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
249 master_trip_dict = {"fiveprime":{}, "threeprime":{}}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
250 master_offset_dict = {"fiveprime":{}, "threeprime":{}}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
251 master_metagene_stop_dict = {"fiveprime":{}, "threeprime":{}}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
252
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
253
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
254 os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam 2> /dev/null')
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
255
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
256 pysam.set_verbosity(0)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
257 infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
258 header = infile.header["HD"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
259
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
260 unsorted = False
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
261 if "SO" in header:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
262 print("Sorting order: "+header["SO"])
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
263 if header["SO"] != "queryname":
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
264 print("Sorting order is not queryname")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
265 unsorted = True
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
266 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
267 unsorted = True
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
268 if unsorted == True:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
269 print ("ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
270 print (header,bam_filepath)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
271 sys.exit()
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
272 total_bam_lines = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
273 all_ref_ids = infile.references
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
274
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
275 for read in infile.fetch(until_eof=True):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
276 total_bam_lines += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
277 if not read.is_unmapped:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
278 ref = read.reference_id
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
279 tran = (all_ref_ids[ref]).split(".")[0]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
280 mapped_reads += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
281 if mapped_reads%1000000 == 0:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
282 print ("{} reads parsed at {}".format(mapped_reads,(time.time()-start_time)))
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
283 pos = read.reference_start
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
284 readname = read.query_name
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
285 read_tags = read.tags
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
286 if readname == list(process_chunk)[0]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
287 process_chunk[readname].append([tran,pos,read_tags])
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
288 #if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
289 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
290 if list(process_chunk)[0] != "read_name":
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
291
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
292 #At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
293 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
294 seq = read.seq
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
295 readlen = len(seq)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
296
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
297 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
298 if readlen not in read_length_dict:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
299 read_length_dict[readlen] = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
300 read_length_dict[readlen] += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
301
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
302 if readlen not in nuc_count_dict["mapped"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
303 nuc_count_dict["mapped"][readlen] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
304 if readlen not in threeprime_nuc_count_dict["mapped"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
305 threeprime_nuc_count_dict["mapped"][readlen] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
306 if readlen not in dinuc_count_dict:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
307 dinuc_count_dict[readlen] = {"AA":0, "TA":0, "GA":0, "CA":0,
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
308 "AT":0, "TT":0, "GT":0, "CT":0,
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
309 "AG":0, "TG":0, "GG":0, "CG":0,
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
310 "AC":0, "TC":0, "GC":0, "CC":0}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
311
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
312 for i in range(0,len(seq)):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
313 if i not in nuc_count_dict["mapped"][readlen]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
314 nuc_count_dict["mapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
315 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
316
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
317 for i in range(0,len(seq)):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
318 try:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
319 dinuc_count_dict[readlen][seq[i:i+2]] += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
320 except:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
321 pass
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
322
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
323 for i in range(len(seq),0,-1):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
324 dist = i-len(seq)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
325 if dist not in threeprime_nuc_count_dict["mapped"][readlen]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
326 threeprime_nuc_count_dict["mapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
327 threeprime_nuc_count_dict["mapped"][readlen][dist][seq[dist]] += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
328 ambiguously_mapped_reads += processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,prev_seq, unambig_read_length_dict)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
329 process_chunk = {readname:[[tran, pos, read_tags]]}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
330 prev_seq = read.seq
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
331 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
332 unmapped_reads += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
333
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
334 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is.
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
335 seq = read.seq
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
336 readlen = len(seq)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
337 if seq in unmapped_dict:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
338 unmapped_dict[seq] += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
339 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
340 unmapped_dict[seq] = 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
341
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
342 # Populate the nuc_count_dict with this unmapped read
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
343 if readlen not in nuc_count_dict["unmapped"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
344 nuc_count_dict["unmapped"][readlen] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
345 for i in range(0,len(seq)):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
346 if i not in nuc_count_dict["unmapped"][readlen]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
347 nuc_count_dict["unmapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
348 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
349
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
350 if readlen not in threeprime_nuc_count_dict["unmapped"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
351 threeprime_nuc_count_dict["unmapped"][readlen] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
352
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
353 for i in range(len(seq),0,-1):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
354 dist = i-len(seq)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
355 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
356 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
357 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
358
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
359 #add stats about mapped/unmapped reads to file dict which will be used for the final report
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
360 master_dict["total_bam_lines"] = total_bam_lines
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
361 master_dict["mapped_reads"] = mapped_reads
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
362 master_dict["unmapped_reads"] = unmapped_reads
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
363 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
364
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
365 if "read_name" in master_read_dict:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
366 del master_read_dict["read_name"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
367 print ("BAM file processed")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
368 print ("Creating metagenes, triplet periodicity plots, etc.")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
369
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
370 for tran in master_read_dict:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
371 try:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
372 cds_start = int(0 if transcriptome_info_dict[tran]["cds_start"] is None else transcriptome_info_dict[tran]["cds_start"])
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
373 cds_stop = int(0 if transcriptome_info_dict[tran]["cds_stop"] is None else transcriptome_info_dict[tran]["cds_stop"])
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
374 # print(tran, type(cds_start))
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
375 except:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
376 print("Exception: ", tran)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
377 continue
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
378
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
379 tranlen = transcriptome_info_dict[tran]["length"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
380 #Use this to discard transcripts with no 5' leader or 3' trailer
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
381 if cds_start > 1 and cds_stop < tranlen and transcriptome_info_dict[tran]["tran_type"] == 1:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
382 for primetype in ["fiveprime", "threeprime"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
383 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
384 for readlength in master_read_dict[tran]["unambig"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
385 #print "readlength", readlength
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
386 # for each fiveprime position for this readlength within this transcript
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
387 for raw_pos in master_read_dict[tran]["unambig"][readlength]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
388 #print "raw pos", raw_pos
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
389 trip_periodicity_reads += 1
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
390 if primetype == "fiveprime":
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
391 # get the five prime position minus the cds start position
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
392 real_pos = raw_pos-cds_start
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
393 rel_stop_pos = raw_pos-cds_stop
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
394 elif primetype == "threeprime":
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
395 real_pos = (raw_pos+readlength)-cds_start
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
396 rel_stop_pos = (raw_pos+readlength)-cds_stop
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
397 #get the readcount at the raw position
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
398 readcount = master_read_dict[tran]["unambig"][readlength][raw_pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
399 #print "readcount", readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
400 frame = (real_pos%3)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
401 if real_pos >= cds_start and real_pos <= cds_stop:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
402 if readlength in master_trip_dict[primetype]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
403 master_trip_dict[primetype][readlength][str(frame)] += readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
404 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
405 master_trip_dict[primetype][readlength]= {"0":0.0,"1":0.0,"2":0.0}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
406 master_trip_dict[primetype][readlength][str(frame)] += readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
407 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
408 if real_pos > (-600) and real_pos < (601):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
409 if readlength in master_offset_dict[primetype]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
410 if real_pos in master_offset_dict[primetype][readlength]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
411 #print "real pos in offset dict"
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
412 master_offset_dict[primetype][readlength][real_pos] += readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
413 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
414 #print "real pos not in offset dict"
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
415 master_offset_dict[primetype][readlength][real_pos] = readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
416 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
417 #initialise with zero to avoid missing neighbours below
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
418 #print "initialising with zeros"
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
419 master_offset_dict[primetype][readlength]= {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
420 for i in range(-600,601):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
421 master_offset_dict[primetype][readlength][i] = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
422 master_offset_dict[primetype][readlength][real_pos] += readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
423
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
424 # now populate the stop metagene dict with the positions relative to cds_stop (rel_stop_pos), these will be used for the stop metagene counts
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
425 if rel_stop_pos > (-600) and rel_stop_pos < (601):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
426 if readlength in master_metagene_stop_dict[primetype]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
427 if rel_stop_pos in master_metagene_stop_dict[primetype][readlength]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
428 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
429 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
430 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] = readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
431 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
432 #initialise with zero to avoid missing neighbours below
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
433 master_metagene_stop_dict[primetype][readlength] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
434 for i in range(-600,601):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
435 master_metagene_stop_dict[primetype][readlength][i] = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
436 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
437
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
438 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
439 # create a 'score' for each readlength by putting the max frame count over the second highest frame count
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
440 for primetype in ["fiveprime", "threeprime"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
441 for subreadlength in master_trip_dict[primetype]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
442 maxcount = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
443 secondmaxcount = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
444 for frame in master_trip_dict[primetype][subreadlength]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
445 if master_trip_dict[primetype][subreadlength][frame] > maxcount:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
446 maxcount = master_trip_dict[primetype][subreadlength][frame]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
447 for frame in master_trip_dict[primetype][subreadlength]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
448 if master_trip_dict[primetype][subreadlength][frame] > secondmaxcount and master_trip_dict[primetype][subreadlength][frame] != maxcount:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
449 secondmaxcount = master_trip_dict[primetype][subreadlength][frame]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
450 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
451 master_trip_dict[primetype][subreadlength]["score"] = float(secondmaxcount)/float(maxcount)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
452 #This part is to determine what offsets to give each read length
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
453 print ("Calculating offsets")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
454 for primetype in ["fiveprime", "threeprime"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
455 for readlen in master_offset_dict[primetype]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
456 accepted_len = False
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
457 max_relative_pos = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
458 max_relative_count = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
459 for relative_pos in master_offset_dict[primetype][readlen]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
460 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
461 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen-10):
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
462 continue
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
463 if master_offset_dict[primetype][readlen][relative_pos] > max_relative_count:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
464 max_relative_pos = relative_pos
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
465 max_relative_count = master_offset_dict[primetype][readlen][relative_pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
466 #print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
467 if primetype == "fiveprime":
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
468 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
469 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos-2)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
470 elif primetype == "threeprime":
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
471 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
472 final_offsets[primetype]["offsets"][readlen] = (max_relative_pos*(-1))+2
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
473 #If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
474 if readlen in master_trip_dict[primetype]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
475 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[primetype][readlen]["score"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
476 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
477 final_offsets[primetype]["read_scores"][readlen] = 0.0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
478
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
479
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
480 master_read_dict["unmapped_reads"] = unmapped_reads
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
481 master_read_dict["offsets"] = final_offsets
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
482 master_read_dict["trip_periodicity"] = master_trip_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
483 master_read_dict["desc"] = "Null"
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
484 master_read_dict["mapped_reads"] = mapped_reads
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
485 master_read_dict["nuc_counts"] = nuc_count_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
486 master_read_dict["dinuc_counts"] = dinuc_count_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
487 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
488 master_read_dict["metagene_counts"] = master_offset_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
489 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
490 master_read_dict["read_lengths"] = read_length_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
491 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
492 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
493 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
494 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
495 master_read_dict["frequent_unmapped_reads"] = (sorted(unmapped_dict.items(), key=operator.itemgetter(1)))[-2000:]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
496 master_read_dict["cutadapt_removed"] = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
497 master_read_dict["rrna_removed"] = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
498 #If no reads are removed by minus m there won't be an entry in the log file, so initialise with 0 first and change if there is a line
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
499 master_read_dict["removed_minus_m"] = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
500 master_dict["removed_minus_m"] = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
501 # We work out the total counts for 5', cds, 3' for differential translation here, would be better to do this in processor but need the offsets
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
502 master_read_dict["unambiguous_all_totals"] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
503 master_read_dict["unambiguous_fiveprime_totals"] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
504 master_read_dict["unambiguous_cds_totals"] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
505 master_read_dict["unambiguous_threeprime_totals"] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
506
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
507 master_read_dict["ambiguous_all_totals"] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
508 master_read_dict["ambiguous_fiveprime_totals"] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
509 master_read_dict["ambiguous_cds_totals"] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
510 master_read_dict["ambiguous_threeprime_totals"] = {}
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
511 print ("calculating transcript counts")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
512 for tran in master_read_dict:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
513 if tran in transcriptome_info_dict:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
514 five_total = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
515 cds_total = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
516 three_total = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
517
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
518 ambig_five_total = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
519 ambig_cds_total = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
520 ambig_three_total = 0
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
521
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
522 cds_start = transcriptome_info_dict[tran]["cds_start"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
523 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
524
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
525 for readlen in master_read_dict[tran]["unambig"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
526 if readlen in final_offsets["fiveprime"]["offsets"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
527 offset = final_offsets["fiveprime"]["offsets"][readlen]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
528 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
529 offset = 15
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
530 for pos in master_read_dict[tran]["unambig"][readlen]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
531 real_pos = pos+offset
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
532 if cds_start is None or cds_stop is None:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
533 three_total += master_read_dict[tran]["unambig"][readlen][pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
534 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
535 if real_pos <cds_start:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
536 five_total += master_read_dict[tran]["unambig"][readlen][pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
537 elif real_pos >=cds_start and real_pos <= cds_stop:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
538 cds_total += master_read_dict[tran]["unambig"][readlen][pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
539 elif real_pos > cds_stop:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
540 three_total += master_read_dict[tran]["unambig"][readlen][pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
541 master_read_dict["unambiguous_all_totals"][tran] = five_total+cds_total+three_total
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
542 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
543 master_read_dict["unambiguous_cds_totals"][tran] = cds_total
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
544 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
545
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
546 for readlen in master_read_dict[tran]["ambig"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
547 if readlen in final_offsets["fiveprime"]["offsets"]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
548 offset = final_offsets["fiveprime"]["offsets"][readlen]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
549 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
550 offset = 15
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
551 for pos in master_read_dict[tran]["ambig"][readlen]:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
552 if cds_start is None or cds_stop is None:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
553 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
554 else:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
555 real_pos = pos+offset
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
556 if real_pos < cds_start:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
557 ambig_five_total += master_read_dict[tran]["ambig"][readlen][pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
558 elif real_pos >=cds_start and real_pos <= cds_stop:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
559 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
560 elif real_pos > cds_stop:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
561 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
562
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
563 master_read_dict["ambiguous_all_totals"][tran] = five_total+cds_total+three_total+ambig_five_total+ambig_cds_total+ambig_three_total
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
564 master_read_dict["ambiguous_fiveprime_totals"][tran] = five_total+ambig_five_total
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
565 master_read_dict["ambiguous_cds_totals"][tran] = cds_total+ambig_cds_total
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
566 master_read_dict["ambiguous_threeprime_totals"][tran] = three_total+ambig_three_total
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
567
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
568 print ("Writing out to sqlite file")
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
569 sqlite_db = SqliteDict(outputfile,autocommit=False)
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
570 for key in master_read_dict:
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
571 sqlite_db[key] = master_read_dict[key]
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
572 sqlite_db["description"] = desc
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
573 sqlite_db.commit()
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
574 sqlite_db.close()
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
575
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
576
a511e084e4e7 Uploaded
jackcurragh
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point.
    #
    # Positional arguments (all four are read below, so all four are required):
    #   1. path to the BAM file
    #   2. path to the organism annotation sqlite file
    #   3. free-text description of the file
    #   4. path of the sqlite file to write
    #
    # The previous guard only rejected two or fewer entries in sys.argv, so a
    # call with the description or output path missing slipped through and
    # failed with an IndexError on sys.argv[3] / sys.argv[4].
    if len(sys.argv) < 5:
        print ("Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description> <path_to_output.sqlite>")
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    bam_filepath = sys.argv[1]
    annotation_sqlite_filepath = sys.argv[2]
    # NOTE(review): `desc` is not passed to process_bam(); the function appears
    # to read it as a module-level name when storing the "description" entry,
    # so it must stay bound at module scope before the call.
    desc = sys.argv[3]
    outputfile = sys.argv[4]
    process_bam(bam_filepath,annotation_sqlite_filepath,outputfile)