Mercurial > repos > galaxyp > peptide_genomic_coordinate
comparison peptide_genomic_coordinate.py @ 1:cb0378d2d487 draft default tip
"planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
author | galaxyp |
---|---|
date | Sun, 14 Mar 2021 03:01:11 +0000 |
parents | 5f49ffce52cb |
children |
comparison
equal
deleted
inserted
replaced
0:5f49ffce52cb | 1:cb0378d2d487 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # | 2 # |
3 # Author: Praveen Kumar | 3 # Author: Praveen Kumar |
4 # University of Minnesota | 4 # University of Minnesota |
5 # | 5 # |
6 # Get peptide's genomic coordinate from the protein's genomic mapping sqlite file (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec) | 6 # Get peptide's genomic coordinate from the protein's genomic mapping sqlite file |
7 # | 7 # (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec) |
8 # | |
8 # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed> | 9 # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed> |
9 # | 10 # |
11 import argparse | |
12 import sqlite3 | |
10 import sys | 13 import sys |
11 import sqlite3 | 14 |
15 | |
16 pep_stmt = """\ | |
17 SELECT dBSequence_ref, start, end, peptide_ref \ | |
18 FROM peptide_evidence e JOIN peptides p on e.peptide_ref = p.id \ | |
19 WHERE isDecoy = 'false' AND p.sequence = ?\ | |
20 """ | |
21 | |
22 map_stmt = """ | |
23 SELECT name, chrom, start, end, strand, cds_start, cds_end \ | |
24 FROM feature_cds_map \ | |
25 WHERE name = ? \ | |
26 AND cds_end >= ? AND cds_start <= ? \ | |
27 ORDER BY cds_start\ | |
28 """ | |
12 | 29 |
13 | 30 |
14 def main(): | 31 def main(): |
15 conn = sqlite3.connect(sys.argv[2]) | 32 parser = argparse.ArgumentParser(description='BED file for peptides') |
16 c = conn.cursor() | 33 parser.add_argument('peptides_file', |
17 c.execute("DROP table if exists novel") | 34 metavar='peptides.tabular', |
18 conn.commit() | 35 type=argparse.FileType('r'), |
19 c.execute("CREATE TABLE novel(peptide text)") | 36 help='List of peptides, one per line') |
20 pepfile = open(sys.argv[1],"r") | 37 parser.add_argument('mz_to_sqlite', |
21 | 38 metavar='mz.sqlite', |
22 pep_seq = [] | 39 help='mz_to_sqlite sqlite database') |
40 parser.add_argument('genome_mapping', | |
41 metavar='genome_mapping.sqlite', | |
42 help='genome_mapping sqlite database') | |
43 parser.add_argument('bed_file', | |
44 metavar='peptides.bed', | |
45 type=argparse.FileType('w'), | |
46 help='BED file of peptide genomic locations') | |
47 parser.add_argument('-a', '--accession', | |
48 action='store_true', | |
49 help='Append the accession to the peptide for BED name') | |
50 parser.add_argument('-d', '--debug', | |
51 action='store_true', | |
52 help='Debug') | |
53 args = parser.parse_args() | |
54 | |
55 pconn = sqlite3.connect(args.mz_to_sqlite) | |
56 pc = pconn.cursor() | |
57 mconn = sqlite3.connect(args.genome_mapping) | |
58 mc = mconn.cursor() | |
59 outfh = args.bed_file | |
60 pepfile = args.peptides_file | |
23 for seq in pepfile.readlines(): | 61 for seq in pepfile.readlines(): |
24 seq = seq.strip() | 62 seq = seq.strip() |
25 pep_seq.append(tuple([seq])) | 63 pc.execute(pep_stmt, (seq,)) |
26 | 64 pep_refs = pc.fetchall() |
27 c.executemany("insert into novel(peptide) values(?)", pep_seq) | 65 for pep_ref in pep_refs: |
28 conn.commit() | 66 (acc, pep_start, pep_end, pep_seq) = pep_ref |
29 | 67 cds_start = (pep_start - 1) * 3 |
30 c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id") | 68 cds_end = pep_end * 3 |
31 rows = c.fetchall() | 69 if args.debug: |
32 | 70 print('%s\t%s\t%s\t%d\t%d' % (acc, pep_start, pep_end, cds_start, cds_end), file=sys.stdout) |
33 conn1 = sqlite3.connect(sys.argv[3]) | 71 mc.execute(map_stmt, (acc, cds_start, cds_end)) |
34 c1 = conn1.cursor() | 72 exons = mc.fetchall() |
35 | 73 if args.debug: |
36 outfh = open(sys.argv[4], "w") | 74 print('\n'.join([str(e) for e in exons]), file=sys.stdout) |
37 | 75 if exons: |
38 master_dict = {} | 76 chrom = exons[0][1] |
39 for each in rows: | 77 strand = exons[0][4] |
40 peptide = each[0] | 78 if strand == '+': |
41 acc = each[1] | 79 start = exons[0][2] + cds_start |
42 acc_seq = each[2] | 80 end = exons[-1][2] + cds_end - exons[-1][5] |
43 | 81 blk_start = [] |
44 c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'") | 82 blk_size = [] |
45 coordinates = c1.fetchall() | 83 for exon in exons: |
46 | 84 offset = cds_start if cds_start > exon[5] else 0 |
47 if len(coordinates) != 0: | 85 bstart = exon[2] + offset |
48 pep_start = 0 | 86 bsize = min(cds_end, exon[6]) - max(cds_start, exon[5]) |
49 pep_end = 0 | 87 if args.debug: |
50 flag = 0 | 88 print('bstart %d\tbsize %d\t %d' % (bstart, bsize, offset), file=sys.stdout) |
51 splice_flag = 0 | 89 blk_start.append(bstart - start) |
52 spliced_peptide = [] | 90 blk_size.append(bsize) |
53 for each_entry in coordinates: | |
54 chromosome = each_entry[0] | |
55 start = int(each_entry[1]) | |
56 end = int(each_entry[2]) | |
57 strand = each_entry[4] | |
58 cds_start = int(each_entry[5]) | |
59 cds_end = int(each_entry[6]) | |
60 pep_pos_start = (acc_seq.find(peptide)*3) | |
61 pep_pos_end = pep_pos_start + (len(peptide)*3) | |
62 if pep_pos_start >= cds_start and pep_pos_end <= cds_end: | |
63 if strand == "+": | |
64 pep_start = start + pep_pos_start - cds_start | |
65 pep_end = start + pep_pos_end - cds_start | |
66 pep_thick_start = 0 | |
67 pep_thick_end = len(peptide) | |
68 flag == 1 | |
69 else: | |
70 pep_end = end - pep_pos_start + cds_start | |
71 pep_start = end - pep_pos_end + cds_start | |
72 pep_thick_start = 0 | |
73 pep_thick_end = len(peptide) | |
74 flag == 1 | |
75 spliced_peptide = [] | |
76 splice_flag = 0 | |
77 else: | 91 else: |
78 if flag == 0: | 92 start = exons[-1][2] + exons[-1][6] - cds_end |
79 if strand == "+": | 93 end = exons[0][3] - cds_start + exons[0][5] |
80 if pep_pos_start >= cds_start and pep_pos_start <= cds_end and pep_pos_end > cds_end: | 94 blk_start = [] |
81 pep_start = start + pep_pos_start - cds_start | 95 blk_size = [] |
82 pep_end = end | 96 for exon in reversed(exons): |
83 pep_thick_start = 0 | 97 bstart = exon[2] + exon[6] - min(exon[6], cds_end) |
84 pep_thick_end = (pep_end-pep_start) | 98 bsize = min(cds_end, exon[6]) - max(cds_start, exon[5]) |
85 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) | 99 # bend = exon[3] - (exon[5] - max(exon[5], cds_start)) |
86 splice_flag = splice_flag + 1 | 100 bend = exon[3] - min(cds_start - exon[5], cds_start) |
87 if splice_flag == 2: | 101 bend = exon[3] - bsize |
88 flag = 1 | 102 if args.debug: |
89 elif pep_pos_end >= cds_start and pep_pos_end <= cds_end and pep_pos_start < cds_start: | 103 print('bstart %d\tbsize %d\tbend %d' % (bstart, bsize, bend), file=sys.stdout) |
90 pep_start = start | 104 blk_start.append(bstart - start) |
91 pep_end = start + pep_pos_end - cds_start | 105 blk_size.append(bsize) |
92 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start) | 106 bed_line = [str(chrom), str(start), str(end), |
93 pep_thick_end = (len(peptide)*3) | 107 '_'.join([seq, acc]) if args.accession else seq, |
94 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) | 108 '255', strand, |
95 splice_flag = splice_flag + 1 | 109 str(start), str(end), |
96 if splice_flag == 2: | 110 '0,0,0', |
97 flag = 1 | 111 str(len(blk_start)), |
98 else: | 112 ','.join([str(b) for b in blk_size]), |
99 pass | 113 ','.join([str(b) for b in blk_start])] |
100 else: | 114 if args.debug: |
101 if pep_pos_start >= cds_start and pep_pos_start <= cds_end and pep_pos_end >= cds_end: | 115 print('\t'.join(bed_line), file=sys.stdout) |
102 pep_start = start | 116 outfh.write('\t'.join(bed_line) + '\n') |
103 pep_end = end - pep_pos_start - cds_start | 117 pconn.close() |
104 pep_thick_start = 0 | 118 mconn.close() |
105 pep_thick_end = (pep_end-pep_start) | |
106 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) | |
107 splice_flag = splice_flag + 1 | |
108 if splice_flag == 2: | |
109 flag = 1 | |
110 elif pep_pos_end >= cds_start and pep_pos_end <= cds_end and pep_pos_start <= cds_start: | |
111 pep_start = end - pep_pos_end + cds_start | |
112 pep_end = end | |
113 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start) | |
114 pep_thick_end = (len(peptide)*3) | |
115 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) | |
116 splice_flag = splice_flag + 1 | |
117 if splice_flag == 2: | |
118 flag = 1 | |
119 else: | |
120 pass | |
121 | |
122 if len(spliced_peptide) == 0: | |
123 if strand == "+": | |
124 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"] | |
125 else: | |
126 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"] | |
127 outfh.write("\t".join(bed_line)+"\n") | |
128 else: | |
129 if strand == "+": | |
130 pep_entry = spliced_peptide | |
131 pep_start = min([pep_entry[0][0], pep_entry[1][0]]) | |
132 pep_end = max([pep_entry[0][1], pep_entry[1][1]]) | |
133 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))] | |
134 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))] | |
135 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)] | |
136 outfh.write("\t".join(bed_line)+"\n") | |
137 else: | |
138 pep_entry = spliced_peptide | |
139 pep_start = min([pep_entry[0][0], pep_entry[1][0]]) | |
140 pep_end = max([pep_entry[0][1], pep_entry[1][1]]) | |
141 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))] | |
142 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))] | |
143 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)] | |
144 outfh.write("\t".join(bed_line)+"\n") | |
145 c.execute("DROP table novel") | |
146 conn.commit() | |
147 conn.close() | |
148 conn1.close() | |
149 outfh.close() | 119 outfh.close() |
150 pepfile.close() | 120 pepfile.close() |
151 | 121 |
152 return None | 122 |
153 if __name__ == "__main__": | 123 if __name__ == '__main__': |
154 main() | 124 main() |