annotate peptideGenomicCoordinate.py @ 1:561b62936e2c draft

planemo upload
author pravs
date Mon, 09 Apr 2018 16:53:05 -0400
parents
children 1b61f7aafb61
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
561b62936e2c planemo upload
pravs
parents:
diff changeset
1
561b62936e2c planemo upload
pravs
parents:
diff changeset
2 #
561b62936e2c planemo upload
pravs
parents:
diff changeset
3 # Author: Praveen Kumar
561b62936e2c planemo upload
pravs
parents:
diff changeset
4 # University of Minnesota
561b62936e2c planemo upload
pravs
parents:
diff changeset
5 #
561b62936e2c planemo upload
pravs
parents:
diff changeset
6 # Get peptide's genomic coordinate from the protein's genomic mapping sqlite file (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec)
561b62936e2c planemo upload
pravs
parents:
diff changeset
7 #
561b62936e2c planemo upload
pravs
parents:
diff changeset
8 # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed>
561b62936e2c planemo upload
pravs
parents:
diff changeset
9 #
561b62936e2c planemo upload
pravs
parents:
diff changeset
10 #
561b62936e2c planemo upload
pravs
parents:
diff changeset
11
561b62936e2c planemo upload
pravs
parents:
diff changeset
12 def main():
561b62936e2c planemo upload
pravs
parents:
diff changeset
13 import sys
561b62936e2c planemo upload
pravs
parents:
diff changeset
14 import sqlite3
561b62936e2c planemo upload
pravs
parents:
diff changeset
15 conn = sqlite3.connect(sys.argv[2])
561b62936e2c planemo upload
pravs
parents:
diff changeset
16 c = conn.cursor()
561b62936e2c planemo upload
pravs
parents:
diff changeset
17 c.execute("DROP table if exists novel")
561b62936e2c planemo upload
pravs
parents:
diff changeset
18 conn.commit()
561b62936e2c planemo upload
pravs
parents:
diff changeset
19 c.execute("CREATE TABLE novel(peptide text)")
561b62936e2c planemo upload
pravs
parents:
diff changeset
20 pepfile = open(sys.argv[1],"r")
561b62936e2c planemo upload
pravs
parents:
diff changeset
21
561b62936e2c planemo upload
pravs
parents:
diff changeset
22 for seq in pepfile.readlines():
561b62936e2c planemo upload
pravs
parents:
diff changeset
23 seq = seq.strip()
561b62936e2c planemo upload
pravs
parents:
diff changeset
24 c.execute('INSERT INTO novel VALUES ("'+seq+'")')
561b62936e2c planemo upload
pravs
parents:
diff changeset
25 conn.commit()
561b62936e2c planemo upload
pravs
parents:
diff changeset
26
561b62936e2c planemo upload
pravs
parents:
diff changeset
27 c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id")
561b62936e2c planemo upload
pravs
parents:
diff changeset
28 rows = c.fetchall()
561b62936e2c planemo upload
pravs
parents:
diff changeset
29
561b62936e2c planemo upload
pravs
parents:
diff changeset
30 conn1 = sqlite3.connect(sys.argv[3])
561b62936e2c planemo upload
pravs
parents:
diff changeset
31 c1 = conn1.cursor()
561b62936e2c planemo upload
pravs
parents:
diff changeset
32
561b62936e2c planemo upload
pravs
parents:
diff changeset
33 outfh = open(sys.argv[4], "w")
561b62936e2c planemo upload
pravs
parents:
diff changeset
34
561b62936e2c planemo upload
pravs
parents:
diff changeset
35 master_dict = {}
561b62936e2c planemo upload
pravs
parents:
diff changeset
36 for each in rows:
561b62936e2c planemo upload
pravs
parents:
diff changeset
37 peptide = each[0]
561b62936e2c planemo upload
pravs
parents:
diff changeset
38 acc = each[1]
561b62936e2c planemo upload
pravs
parents:
diff changeset
39 acc_seq = each[2]
561b62936e2c planemo upload
pravs
parents:
diff changeset
40
561b62936e2c planemo upload
pravs
parents:
diff changeset
41 c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'")
561b62936e2c planemo upload
pravs
parents:
diff changeset
42 coordinates = c1.fetchall()
561b62936e2c planemo upload
pravs
parents:
diff changeset
43
561b62936e2c planemo upload
pravs
parents:
diff changeset
44 if len(coordinates) != 0:
561b62936e2c planemo upload
pravs
parents:
diff changeset
45 pep_start = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
46 pep_end = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
47 flag = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
48 splice_flag = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
49 spliced_peptide = []
561b62936e2c planemo upload
pravs
parents:
diff changeset
50 for each_entry in coordinates:
561b62936e2c planemo upload
pravs
parents:
diff changeset
51 chromosome = each_entry[0]
561b62936e2c planemo upload
pravs
parents:
diff changeset
52 start = int(each_entry[1])
561b62936e2c planemo upload
pravs
parents:
diff changeset
53 end = int(each_entry[2])
561b62936e2c planemo upload
pravs
parents:
diff changeset
54 strand = each_entry[4]
561b62936e2c planemo upload
pravs
parents:
diff changeset
55 cds_start = int(each_entry[5])
561b62936e2c planemo upload
pravs
parents:
diff changeset
56 cds_end = int(each_entry[6])
561b62936e2c planemo upload
pravs
parents:
diff changeset
57 pep_pos_start = (acc_seq.find(peptide)*3)
561b62936e2c planemo upload
pravs
parents:
diff changeset
58 pep_pos_end = pep_pos_start + (len(peptide)*3)
561b62936e2c planemo upload
pravs
parents:
diff changeset
59 if (pep_pos_start >= cds_start) and (pep_pos_end <= cds_end):
561b62936e2c planemo upload
pravs
parents:
diff changeset
60 if strand == "+":
561b62936e2c planemo upload
pravs
parents:
diff changeset
61 pep_start = start + pep_pos_start - cds_start
561b62936e2c planemo upload
pravs
parents:
diff changeset
62 pep_end = start + pep_pos_end - cds_start
561b62936e2c planemo upload
pravs
parents:
diff changeset
63 pep_thick_start = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
64 pep_thick_end = len(peptide)
561b62936e2c planemo upload
pravs
parents:
diff changeset
65 flag == 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
66 else:
561b62936e2c planemo upload
pravs
parents:
diff changeset
67 pep_end = end - pep_pos_start + cds_start
561b62936e2c planemo upload
pravs
parents:
diff changeset
68 pep_start = end - pep_pos_end + cds_start
561b62936e2c planemo upload
pravs
parents:
diff changeset
69 pep_thick_start = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
70 pep_thick_end = len(peptide)
561b62936e2c planemo upload
pravs
parents:
diff changeset
71 flag == 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
72 spliced_peptide = []
561b62936e2c planemo upload
pravs
parents:
diff changeset
73 splice_flag = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
74 else:
561b62936e2c planemo upload
pravs
parents:
diff changeset
75 if flag == 0:
561b62936e2c planemo upload
pravs
parents:
diff changeset
76 if strand == "+":
561b62936e2c planemo upload
pravs
parents:
diff changeset
77 if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end > cds_end):
561b62936e2c planemo upload
pravs
parents:
diff changeset
78 pep_start = start + pep_pos_start - cds_start
561b62936e2c planemo upload
pravs
parents:
diff changeset
79 pep_end = end
561b62936e2c planemo upload
pravs
parents:
diff changeset
80 pep_thick_start = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
81 pep_thick_end = (pep_end-pep_start)
561b62936e2c planemo upload
pravs
parents:
diff changeset
82 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
561b62936e2c planemo upload
pravs
parents:
diff changeset
83 splice_flag = splice_flag + 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
84 if splice_flag == 2:
561b62936e2c planemo upload
pravs
parents:
diff changeset
85 flag = 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
86 elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start < cds_start):
561b62936e2c planemo upload
pravs
parents:
diff changeset
87 pep_start = start
561b62936e2c planemo upload
pravs
parents:
diff changeset
88 pep_end = start + pep_pos_end - cds_start
561b62936e2c planemo upload
pravs
parents:
diff changeset
89 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
561b62936e2c planemo upload
pravs
parents:
diff changeset
90 pep_thick_end = (len(peptide)*3)
561b62936e2c planemo upload
pravs
parents:
diff changeset
91 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
561b62936e2c planemo upload
pravs
parents:
diff changeset
92 splice_flag = splice_flag + 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
93 if splice_flag == 2:
561b62936e2c planemo upload
pravs
parents:
diff changeset
94 flag = 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
95 else:
561b62936e2c planemo upload
pravs
parents:
diff changeset
96 pass
561b62936e2c planemo upload
pravs
parents:
diff changeset
97 else:
561b62936e2c planemo upload
pravs
parents:
diff changeset
98 if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end >= cds_end):
561b62936e2c planemo upload
pravs
parents:
diff changeset
99 pep_start = start
561b62936e2c planemo upload
pravs
parents:
diff changeset
100 pep_end = end - pep_pos_start - cds_start
561b62936e2c planemo upload
pravs
parents:
diff changeset
101 pep_thick_start = 0
561b62936e2c planemo upload
pravs
parents:
diff changeset
102 pep_thick_end = (pep_end-pep_start)
561b62936e2c planemo upload
pravs
parents:
diff changeset
103 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
561b62936e2c planemo upload
pravs
parents:
diff changeset
104 splice_flag = splice_flag + 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
105 if splice_flag == 2:
561b62936e2c planemo upload
pravs
parents:
diff changeset
106 flag = 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
107 elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start <= cds_start):
561b62936e2c planemo upload
pravs
parents:
diff changeset
108 pep_start = end - pep_pos_end + cds_start
561b62936e2c planemo upload
pravs
parents:
diff changeset
109 pep_end = end
561b62936e2c planemo upload
pravs
parents:
diff changeset
110 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
561b62936e2c planemo upload
pravs
parents:
diff changeset
111 pep_thick_end = (len(peptide)*3)
561b62936e2c planemo upload
pravs
parents:
diff changeset
112 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
561b62936e2c planemo upload
pravs
parents:
diff changeset
113 splice_flag = splice_flag + 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
114 if splice_flag == 2:
561b62936e2c planemo upload
pravs
parents:
diff changeset
115 flag = 1
561b62936e2c planemo upload
pravs
parents:
diff changeset
116 else:
561b62936e2c planemo upload
pravs
parents:
diff changeset
117 pass
561b62936e2c planemo upload
pravs
parents:
diff changeset
118 if len(spliced_peptide) == 0:
561b62936e2c planemo upload
pravs
parents:
diff changeset
119 if strand == "+":
561b62936e2c planemo upload
pravs
parents:
diff changeset
120 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
561b62936e2c planemo upload
pravs
parents:
diff changeset
121 else:
561b62936e2c planemo upload
pravs
parents:
diff changeset
122 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
561b62936e2c planemo upload
pravs
parents:
diff changeset
123 outfh.write("\t".join(bed_line)+"\n")
561b62936e2c planemo upload
pravs
parents:
diff changeset
124 else:
561b62936e2c planemo upload
pravs
parents:
diff changeset
125 if strand == "+":
561b62936e2c planemo upload
pravs
parents:
diff changeset
126 pep_entry = spliced_peptide
561b62936e2c planemo upload
pravs
parents:
diff changeset
127 pep_start = min([pep_entry[0][0], pep_entry[1][0]])
561b62936e2c planemo upload
pravs
parents:
diff changeset
128 pep_end = max([pep_entry[0][1], pep_entry[1][1]])
561b62936e2c planemo upload
pravs
parents:
diff changeset
129 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
561b62936e2c planemo upload
pravs
parents:
diff changeset
130 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
561b62936e2c planemo upload
pravs
parents:
diff changeset
131 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
561b62936e2c planemo upload
pravs
parents:
diff changeset
132 outfh.write("\t".join(bed_line)+"\n")
561b62936e2c planemo upload
pravs
parents:
diff changeset
133 else:
561b62936e2c planemo upload
pravs
parents:
diff changeset
134 pep_entry = spliced_peptide
561b62936e2c planemo upload
pravs
parents:
diff changeset
135 pep_start = min([pep_entry[0][0], pep_entry[1][0]])
561b62936e2c planemo upload
pravs
parents:
diff changeset
136 pep_end = max([pep_entry[0][1], pep_entry[1][1]])
561b62936e2c planemo upload
pravs
parents:
diff changeset
137 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
561b62936e2c planemo upload
pravs
parents:
diff changeset
138 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
561b62936e2c planemo upload
pravs
parents:
diff changeset
139 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
561b62936e2c planemo upload
pravs
parents:
diff changeset
140 outfh.write("\t".join(bed_line)+"\n")
561b62936e2c planemo upload
pravs
parents:
diff changeset
141 c.execute("DROP table novel")
561b62936e2c planemo upload
pravs
parents:
diff changeset
142 conn.commit()
561b62936e2c planemo upload
pravs
parents:
diff changeset
143 conn.close()
561b62936e2c planemo upload
pravs
parents:
diff changeset
144 conn1.close()
561b62936e2c planemo upload
pravs
parents:
diff changeset
145 outfh.close()
561b62936e2c planemo upload
pravs
parents:
diff changeset
146 pepfile.close()
561b62936e2c planemo upload
pravs
parents:
diff changeset
147
561b62936e2c planemo upload
pravs
parents:
diff changeset
148 return None
561b62936e2c planemo upload
pravs
parents:
diff changeset
149 if __name__ == "__main__":
561b62936e2c planemo upload
pravs
parents:
diff changeset
150 main()