comparison peptide_genomic_coordinate.py @ 1:cb0378d2d487 draft default tip

"planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
author galaxyp
date Sun, 14 Mar 2021 03:01:11 +0000
parents 5f49ffce52cb
children
comparison
equal deleted inserted replaced
0:5f49ffce52cb 1:cb0378d2d487
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # 2 #
3 # Author: Praveen Kumar 3 # Author: Praveen Kumar
4 # University of Minnesota 4 # University of Minnesota
5 # 5 #
6 # Get peptide's genomic coordinate from the protein's genomic mapping sqlite file (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec) 6 # Get peptide's genomic coordinate from the protein's genomic mapping sqlite file
7 # 7 # (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec)
8 #
8 # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed> 9 # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed>
9 # 10 #
11 import argparse
12 import sqlite3
10 import sys 13 import sys
11 import sqlite3 14
15
16 pep_stmt = """\
17 SELECT dBSequence_ref, start, end, peptide_ref \
18 FROM peptide_evidence e JOIN peptides p on e.peptide_ref = p.id \
19 WHERE isDecoy = 'false' AND p.sequence = ?\
20 """
21
22 map_stmt = """
23 SELECT name, chrom, start, end, strand, cds_start, cds_end \
24 FROM feature_cds_map \
25 WHERE name = ? \
26 AND cds_end >= ? AND cds_start <= ? \
27 ORDER BY cds_start\
28 """
12 29
13 30
14 def main(): 31 def main():
15 conn = sqlite3.connect(sys.argv[2]) 32 parser = argparse.ArgumentParser(description='BED file for peptides')
16 c = conn.cursor() 33 parser.add_argument('peptides_file',
17 c.execute("DROP table if exists novel") 34 metavar='peptides.tabular',
18 conn.commit() 35 type=argparse.FileType('r'),
19 c.execute("CREATE TABLE novel(peptide text)") 36 help='List of peptides, one per line')
20 pepfile = open(sys.argv[1],"r") 37 parser.add_argument('mz_to_sqlite',
21 38 metavar='mz.sqlite',
22 pep_seq = [] 39 help='mz_to_sqlite sqlite database')
40 parser.add_argument('genome_mapping',
41 metavar='genome_mapping.sqlite',
42 help='genome_mapping sqlite database')
43 parser.add_argument('bed_file',
44 metavar='peptides.bed',
45 type=argparse.FileType('w'),
46 help='BED file of peptide genomic locations')
47 parser.add_argument('-a', '--accession',
48 action='store_true',
49 help='Append the accession to the peptide for BED name')
50 parser.add_argument('-d', '--debug',
51 action='store_true',
52 help='Debug')
53 args = parser.parse_args()
54
55 pconn = sqlite3.connect(args.mz_to_sqlite)
56 pc = pconn.cursor()
57 mconn = sqlite3.connect(args.genome_mapping)
58 mc = mconn.cursor()
59 outfh = args.bed_file
60 pepfile = args.peptides_file
23 for seq in pepfile.readlines(): 61 for seq in pepfile.readlines():
24 seq = seq.strip() 62 seq = seq.strip()
25 pep_seq.append(tuple([seq])) 63 pc.execute(pep_stmt, (seq,))
26 64 pep_refs = pc.fetchall()
27 c.executemany("insert into novel(peptide) values(?)", pep_seq) 65 for pep_ref in pep_refs:
28 conn.commit() 66 (acc, pep_start, pep_end, pep_seq) = pep_ref
29 67 cds_start = (pep_start - 1) * 3
30 c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id") 68 cds_end = pep_end * 3
31 rows = c.fetchall() 69 if args.debug:
32 70 print('%s\t%s\t%s\t%d\t%d' % (acc, pep_start, pep_end, cds_start, cds_end), file=sys.stdout)
33 conn1 = sqlite3.connect(sys.argv[3]) 71 mc.execute(map_stmt, (acc, cds_start, cds_end))
34 c1 = conn1.cursor() 72 exons = mc.fetchall()
35 73 if args.debug:
36 outfh = open(sys.argv[4], "w") 74 print('\n'.join([str(e) for e in exons]), file=sys.stdout)
37 75 if exons:
38 master_dict = {} 76 chrom = exons[0][1]
39 for each in rows: 77 strand = exons[0][4]
40 peptide = each[0] 78 if strand == '+':
41 acc = each[1] 79 start = exons[0][2] + cds_start
42 acc_seq = each[2] 80 end = exons[-1][2] + cds_end - exons[-1][5]
43 81 blk_start = []
44 c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'") 82 blk_size = []
45 coordinates = c1.fetchall() 83 for exon in exons:
46 84 offset = cds_start if cds_start > exon[5] else 0
47 if len(coordinates) != 0: 85 bstart = exon[2] + offset
48 pep_start = 0 86 bsize = min(cds_end, exon[6]) - max(cds_start, exon[5])
49 pep_end = 0 87 if args.debug:
50 flag = 0 88 print('bstart %d\tbsize %d\t %d' % (bstart, bsize, offset), file=sys.stdout)
51 splice_flag = 0 89 blk_start.append(bstart - start)
52 spliced_peptide = [] 90 blk_size.append(bsize)
53 for each_entry in coordinates:
54 chromosome = each_entry[0]
55 start = int(each_entry[1])
56 end = int(each_entry[2])
57 strand = each_entry[4]
58 cds_start = int(each_entry[5])
59 cds_end = int(each_entry[6])
60 pep_pos_start = (acc_seq.find(peptide)*3)
61 pep_pos_end = pep_pos_start + (len(peptide)*3)
62 if pep_pos_start >= cds_start and pep_pos_end <= cds_end:
63 if strand == "+":
64 pep_start = start + pep_pos_start - cds_start
65 pep_end = start + pep_pos_end - cds_start
66 pep_thick_start = 0
67 pep_thick_end = len(peptide)
68 flag == 1
69 else:
70 pep_end = end - pep_pos_start + cds_start
71 pep_start = end - pep_pos_end + cds_start
72 pep_thick_start = 0
73 pep_thick_end = len(peptide)
74 flag == 1
75 spliced_peptide = []
76 splice_flag = 0
77 else: 91 else:
78 if flag == 0: 92 start = exons[-1][2] + exons[-1][6] - cds_end
79 if strand == "+": 93 end = exons[0][3] - cds_start + exons[0][5]
80 if pep_pos_start >= cds_start and pep_pos_start <= cds_end and pep_pos_end > cds_end: 94 blk_start = []
81 pep_start = start + pep_pos_start - cds_start 95 blk_size = []
82 pep_end = end 96 for exon in reversed(exons):
83 pep_thick_start = 0 97 bstart = exon[2] + exon[6] - min(exon[6], cds_end)
84 pep_thick_end = (pep_end-pep_start) 98 bsize = min(cds_end, exon[6]) - max(cds_start, exon[5])
85 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) 99 # bend = exon[3] - (exon[5] - max(exon[5], cds_start))
86 splice_flag = splice_flag + 1 100 bend = exon[3] - min(cds_start - exon[5], cds_start)
87 if splice_flag == 2: 101 bend = exon[3] - bsize
88 flag = 1 102 if args.debug:
89 elif pep_pos_end >= cds_start and pep_pos_end <= cds_end and pep_pos_start < cds_start: 103 print('bstart %d\tbsize %d\tbend %d' % (bstart, bsize, bend), file=sys.stdout)
90 pep_start = start 104 blk_start.append(bstart - start)
91 pep_end = start + pep_pos_end - cds_start 105 blk_size.append(bsize)
92 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start) 106 bed_line = [str(chrom), str(start), str(end),
93 pep_thick_end = (len(peptide)*3) 107 '_'.join([seq, acc]) if args.accession else seq,
94 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) 108 '255', strand,
95 splice_flag = splice_flag + 1 109 str(start), str(end),
96 if splice_flag == 2: 110 '0,0,0',
97 flag = 1 111 str(len(blk_start)),
98 else: 112 ','.join([str(b) for b in blk_size]),
99 pass 113 ','.join([str(b) for b in blk_start])]
100 else: 114 if args.debug:
101 if pep_pos_start >= cds_start and pep_pos_start <= cds_end and pep_pos_end >= cds_end: 115 print('\t'.join(bed_line), file=sys.stdout)
102 pep_start = start 116 outfh.write('\t'.join(bed_line) + '\n')
103 pep_end = end - pep_pos_start - cds_start 117 pconn.close()
104 pep_thick_start = 0 118 mconn.close()
105 pep_thick_end = (pep_end-pep_start)
106 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
107 splice_flag = splice_flag + 1
108 if splice_flag == 2:
109 flag = 1
110 elif pep_pos_end >= cds_start and pep_pos_end <= cds_end and pep_pos_start <= cds_start:
111 pep_start = end - pep_pos_end + cds_start
112 pep_end = end
113 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
114 pep_thick_end = (len(peptide)*3)
115 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
116 splice_flag = splice_flag + 1
117 if splice_flag == 2:
118 flag = 1
119 else:
120 pass
121
122 if len(spliced_peptide) == 0:
123 if strand == "+":
124 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
125 else:
126 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
127 outfh.write("\t".join(bed_line)+"\n")
128 else:
129 if strand == "+":
130 pep_entry = spliced_peptide
131 pep_start = min([pep_entry[0][0], pep_entry[1][0]])
132 pep_end = max([pep_entry[0][1], pep_entry[1][1]])
133 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
134 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
135 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
136 outfh.write("\t".join(bed_line)+"\n")
137 else:
138 pep_entry = spliced_peptide
139 pep_start = min([pep_entry[0][0], pep_entry[1][0]])
140 pep_end = max([pep_entry[0][1], pep_entry[1][1]])
141 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
142 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
143 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
144 outfh.write("\t".join(bed_line)+"\n")
145 c.execute("DROP table novel")
146 conn.commit()
147 conn.close()
148 conn1.close()
149 outfh.close() 119 outfh.close()
150 pepfile.close() 120 pepfile.close()
151 121
152 return None 122
153 if __name__ == "__main__": 123 if __name__ == '__main__':
154 main() 124 main()