comparison pep_pointer.py @ 3:a6282baa8c6f draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit 494bc6dd87b9a6e2af40cb32aa5d2ee6e9bfebfc
author galaxyp
date Mon, 20 Jun 2022 13:59:52 +0000
parents 073a2965e3b2
children
comparison
equal deleted inserted replaced
2:073a2965e3b2 3:a6282baa8c6f
1 1
2 # 2 #
3 # Author: Praveen Kumar 3 # Author: Praveen Kumar
4 # Updated: April 6th, 2018 4 # Updated: April 6th, 2018 (updated to python3: May 2022)
5 # 5 #
6 # 6 #
7 # 7 #
8 8
9 import re 9 import re
10 10
11 11
12 def main(): 12 def main():
13 import sys 13 import sys
14 if len(sys.argv) == 4: 14 if len(sys.argv) == 4:
15 inputFile = sys.argv 15 inputFile = sys.argv
16 infh = open(inputFile[1], "r") 16 infh = open(inputFile[1], "r")
17 # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r") 17 # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r")
18 18
19 gtf = {} 19 gtf = {}
20 gtf_transcript = {} 20 gtf_transcript = {}
21 gtf_gene = {} 21 gtf_gene = {}
22 for each in infh.readlines(): 22 for each in infh.readlines():
23 a = each.split("\t") 23 a = each.split("\t")
36 end = a[4].strip() 36 end = a[4].strip()
37 elif int(a[4].strip()) < int(a[3].strip()): 37 elif int(a[4].strip()) < int(a[3].strip()):
38 start = a[4].strip() 38 start = a[4].strip()
39 end = a[3].strip() 39 end = a[3].strip()
40 else: 40 else:
41 print "Please check the start end coordinates in the GTF file" 41 print("Please check the start end coordinates in the GTF file")
42 else: 42 else:
43 print "Please check the strand information in the GTF file. It should be '+' or '-'." 43 print("Please check the strand information in the GTF file. It should be '+' or '-'.")
44 if not gtf.has_key(strand): 44 if strand not in gtf:
45 gtf[strand] = {} 45 gtf[strand] = {}
46 if not gtf[strand].has_key(type): 46 if type not in gtf[strand]:
47 gtf[strand][type] = [] 47 gtf[strand][type] = []
48 b = re.search("gene_id \"(.+?)\";", a[8].strip()) 48 b = re.search("gene_id \"(.+?)\";", a[8].strip())
49 gene = b.group(1) 49 gene = b.group(1)
50 if type == "gene": 50 if type == "gene":
51 transcript = "" 51 transcript = ""
52 else: 52 else:
53 b = re.search("transcript_id \"(.+?)\";", a[8].strip()) 53 b = re.search("transcript_id \"(.+?)\";", a[8].strip())
54 transcript = b.group(1) 54 transcript = b.group(1)
55 data = (chr, start, end, gene, transcript, strand, type) 55 data = (chr, start, end, gene, transcript, strand, type)
56 gtf[strand][type].append(data) 56 gtf[strand][type].append(data)
57 57
58 if type == "exon": 58 if type == "exon":
59 if gtf_transcript.has_key(chr+"#"+strand): 59 if chr + "#" + strand in gtf_transcript:
60 if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene): 60 if transcript + "#" + gene in gtf_transcript[chr + "#" + strand]:
61 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) 61 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start))
62 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) 62 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end))
63 else: 63 else:
64 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] 64 gtf_transcript[chr + "#" + strand][transcript + "#" + gene] = [[], []]
65 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) 65 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start))
66 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) 66 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end))
67 else: 67 else:
68 gtf_transcript[chr+"#"+strand] = {} 68 gtf_transcript[chr + "#" + strand] = {}
69 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] 69 gtf_transcript[chr + "#" + strand][transcript + "#" + gene] = [[], []]
70 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) 70 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start))
71 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) 71 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end))
72 72
73 if type == "gene": 73 if type == "gene":
74 if gtf_gene.has_key(chr+"#"+strand): 74 if chr + "#" + strand in gtf_gene:
75 gtf_gene[chr+"#"+strand][0].append(int(start)) 75 gtf_gene[chr + "#" + strand][0].append(int(start))
76 gtf_gene[chr+"#"+strand][1].append(int(end)) 76 gtf_gene[chr + "#" + strand][1].append(int(end))
77 gtf_gene[chr+"#"+strand][2].append(gene) 77 gtf_gene[chr + "#" + strand][2].append(gene)
78 else: 78 else:
79 gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]] 79 gtf_gene[chr + "#" + strand] = [[0], [0], ["no_gene"]]
80 gtf_gene[chr+"#"+strand][0].append(int(start)) 80 gtf_gene[chr + "#" + strand][0].append(int(start))
81 gtf_gene[chr+"#"+strand][1].append(int(end)) 81 gtf_gene[chr + "#" + strand][1].append(int(end))
82 gtf_gene[chr+"#"+strand][2].append(gene) 82 gtf_gene[chr + "#" + strand][2].append(gene)
83 83
84
85
86 # "Starting Reading Intron . . ." 84 # "Starting Reading Intron . . ."
87 85
88 gtf["+"]["intron"] = [] 86 gtf["+"]["intron"] = []
89 gtf["-"]["intron"] = [] 87 gtf["-"]["intron"] = []
90 for chr_strand in gtf_transcript.keys(): 88 for chr_strand in gtf_transcript.keys():
91 chr = chr_strand.split("#")[0] 89 chr = chr_strand.split("#")[0]
92 strand = chr_strand.split("#")[1] 90 strand = chr_strand.split("#")[1]
93 91
94 for transcript_gene in gtf_transcript[chr_strand].keys(): 92 for transcript_gene in gtf_transcript[chr_strand].keys():
95 start_list = gtf_transcript[chr_strand][transcript_gene][0] 93 start_list = gtf_transcript[chr_strand][transcript_gene][0]
96 end_list = gtf_transcript[chr_strand][transcript_gene][1] 94 end_list = gtf_transcript[chr_strand][transcript_gene][1]
97 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] 95 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])]
98 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] 96 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])]
99 if sorted_start_index == sorted_end_index: 97 if sorted_start_index == sorted_end_index:
100 sorted_start = sorted(start_list) 98 sorted_start = sorted(start_list)
101 sorted_end = [end_list[i] for i in sorted_start_index] 99 sorted_end = [end_list[i] for i in sorted_start_index]
102 for x in range(len(sorted_start))[1:]: 100 for x in range(len(sorted_start))[1:]:
103 intron_start = sorted_end[x-1]+1 101 intron_start = sorted_end[x - 1] + 1
104 intron_end = sorted_start[x]-1 102 intron_end = sorted_start[x] - 1
105 transcript = transcript_gene.split("#")[0] 103 transcript = transcript_gene.split("#")[0]
106 gene = transcript_gene.split("#")[1] 104 gene = transcript_gene.split("#")[1]
107 data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron") 105 data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron")
108 gtf[strand]["intron"].append(data) 106 gtf[strand]["intron"].append(data)
109 107
110
111 # "Starting Reading Intergenic . . ." 108 # "Starting Reading Intergenic . . ."
112 109
113 gtf["+"]["intergenic"] = [] 110 gtf["+"]["intergenic"] = []
114 gtf["-"]["intergenic"] = [] 111 gtf["-"]["intergenic"] = []
115 for chr_strand in gtf_gene.keys(): 112 for chr_strand in gtf_gene.keys():
116 chr = chr_strand.split("#")[0] 113 chr = chr_strand.split("#")[0]
117 strand = chr_strand.split("#")[1] 114 strand = chr_strand.split("#")[1]
118 start_list = gtf_gene[chr_strand][0] 115 start_list = gtf_gene[chr_strand][0]
119 end_list = gtf_gene[chr_strand][1] 116 end_list = gtf_gene[chr_strand][1]
120 gene_list = gtf_gene[chr_strand][2] 117 gene_list = gtf_gene[chr_strand][2]
121 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] 118 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])]
122 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] 119 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])]
123 120
124 sorted_start = sorted(start_list) 121 sorted_start = sorted(start_list)
125 sorted_end = [end_list[i] for i in sorted_start_index] 122 sorted_end = [end_list[i] for i in sorted_start_index]
126 sorted_gene = [gene_list[i] for i in sorted_start_index] 123 sorted_gene = [gene_list[i] for i in sorted_start_index]
127 for x in range(len(sorted_start))[1:]: 124 for x in range(len(sorted_start))[1:]:
128 intergene_start = sorted_end[x-1]+1 125 intergene_start = sorted_end[x - 1] + 1
129 intergene_end = sorted_start[x]-1 126 intergene_end = sorted_start[x] - 1
130 if intergene_start < intergene_end: 127 if intergene_start < intergene_end:
131 intergene_1 = sorted_gene[x-1] 128 intergene_1 = sorted_gene[x - 1]
132 intergene_2 = sorted_gene[x] 129 intergene_2 = sorted_gene[x]
133 gene = intergene_1 + "-#-" + intergene_2 130 gene = intergene_1 + "-#-" + intergene_2
134 data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic") 131 data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic")
135 gtf[strand]["intergenic"].append(data) 132 gtf[strand]["intergenic"].append(data)
136 133
137 import sqlite3 134 import sqlite3
138 # conn = sqlite3.connect('gtf_database.db') 135 # conn = sqlite3.connect('gtf_database.db')
139 conn = sqlite3.connect(":memory:") 136 conn = sqlite3.connect(":memory:")
140 c = conn.cursor() 137 c = conn.cursor()
141 # c.execute("DROP TABLE IF EXISTS gtf_data;") 138 # c.execute("DROP TABLE IF EXISTS gtf_data;")
142 # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") 139 # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)")
143 c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") 140 c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)")
144 141
145 for strand in gtf.keys(): 142 for strand in gtf.keys():
146 if strand == "+": 143 if strand not in ["+", "-"]:
147 st = "positive" 144 print("Please check the strand information in the GTF file. It should be '+' or '-'.")
148 elif strand == "-": 145
149 st = "negative"
150 else:
151 print "Please check the strand information in the GTF file. It should be '+' or '-'."
152
153 for type in gtf[strand].keys(): 146 for type in gtf[strand].keys():
154 data = gtf[strand][type] 147 data = gtf[strand][type]
155 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) 148 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data)
156 149
157 conn.commit() 150 conn.commit()
158 151
159 infh = open(inputFile[2], "r") 152 infh = open(inputFile[2], "r")
160 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") 153 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r")
161 data = infh.readlines() 154 data = infh.readlines()
162 # output file 155 # output file
163 outfh = open(inputFile[3], 'w') 156 outfh = open(inputFile[3], 'w')
164 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") 157 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w")
165 158
166 for each in data: 159 for each in data:
167 a = each.strip().split("\t") 160 a = each.strip().split("\t")
168 chr = a[0].strip() 161 chr = a[0].strip()
169 pep_start = str(int(a[1].strip())+1) 162 pep_start = str(int(a[1].strip()) + 1)
170 pep_end = a[2].strip() 163 pep_end = a[2].strip()
171 strand = a[5].strip() 164 strand = a[5].strip()
172 each = "\t".join(a[:6]) 165 each = "\t".join(a[:6])
173 if (len(a) == 12 and int(a[9]) == 1) or (len(a) == 6): 166 if (len(a) == 12 and int(a[9]) == 1) or (len(a) == 6):
174 c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 167 c.execute("select * from gtf_data where type = 'CDS' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
175 rows = c.fetchall() 168 rows = c.fetchall()
176 if len(rows) > 0: 169 if len(rows) > 0:
177 outfh.write(each.strip() + "\tCDS\n") 170 outfh.write(each.strip() + "\tCDS\n")
178 else: 171 else:
179 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 172 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
180 rows = c.fetchall() 173 rows = c.fetchall()
181 if len(rows) > 0: 174 if len(rows) > 0:
182 outfh.write(each.strip() + "\tfive_prime_utr\n") 175 outfh.write(each.strip() + "\tfive_prime_utr\n")
183 else: 176 else:
184 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 177 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
185 rows = c.fetchall() 178 rows = c.fetchall()
186 if len(rows) > 0: 179 if len(rows) > 0:
187 outfh.write(each.strip() + "\tthree_prime_utr\n") 180 outfh.write(each.strip() + "\tthree_prime_utr\n")
188 else: 181 else:
189 c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 182 c.execute("select * from gtf_data where type = 'exon' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
190 rows = c.fetchall() 183 rows = c.fetchall()
191 if len(rows) > 0: 184 if len(rows) > 0:
192 outfh.write(each.strip() + "\texon\n") 185 outfh.write(each.strip() + "\texon\n")
193 else: 186 else:
194 c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 187 c.execute("select * from gtf_data where type = 'intron' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
195 rows = c.fetchall() 188 rows = c.fetchall()
196 if len(rows) > 0: 189 if len(rows) > 0:
197 outfh.write(each.strip() + "\tintron\n") 190 outfh.write(each.strip() + "\tintron\n")
198 else: 191 else:
199 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 192 c.execute("select * from gtf_data where type = 'gene' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
200 rows = c.fetchall() 193 rows = c.fetchall()
201 if len(rows) > 0: 194 if len(rows) > 0:
202 outfh.write(each.strip() + "\tgene\n") 195 outfh.write(each.strip() + "\tgene\n")
203 else: 196 else:
204 c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 197 c.execute("select * from gtf_data where type = 'intergenic' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
205 rows = c.fetchall() 198 rows = c.fetchall()
206 if len(rows) > 0: 199 if len(rows) > 0:
207 outfh.write(each.strip() + "\tintergene\n") 200 outfh.write(each.strip() + "\tintergene\n")
208 else: 201 else:
209 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") 202 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n")
210 elif (len(a) == 12 and int(a[9]) == 2): 203 elif (len(a) == 12 and int(a[9]) == 2):
211 outfh.write(each.strip() + "\tSpliceJunction\n") 204 outfh.write(each.strip() + "\tSpliceJunction\n")
212 else: 205 else:
213 outfh.write(each.strip() + "\tPlease check\n") 206 outfh.write(each.strip() + "\tPlease check\n")
214 207
215 conn.close() 208 conn.close()
216 outfh.close() 209 outfh.close()
217 else: 210 else:
218 print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" 211 print("USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>")
219 return None 212 return None
213
220 214
221 if __name__ == "__main__": 215 if __name__ == "__main__":
222 main() 216 main()
223
224
225
226
227