Mercurial > repos > galaxyp > peptide_genomic_coordinate
diff peptide_genomic_coordinate.py @ 1:cb0378d2d487 draft default tip
"planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
author | galaxyp |
---|---|
date | Sun, 14 Mar 2021 03:01:11 +0000 |
parents | 5f49ffce52cb |
children |
line wrap: on
line diff
--- a/peptide_genomic_coordinate.py Wed Apr 03 04:04:18 2019 -0400 +++ b/peptide_genomic_coordinate.py Sun Mar 14 03:01:11 2021 +0000 @@ -1,154 +1,124 @@ #!/usr/bin/env python -# +# # Author: Praveen Kumar # University of Minnesota # -# Get peptide's genomic coordinate from the protein's genomic mapping sqlite file (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec) -# +# Get peptide's genomic coordinate from the protein's genomic mapping sqlite file +# (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec) +# # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed> -# +# +import argparse +import sqlite3 import sys -import sqlite3 + + +pep_stmt = """\ +SELECT dBSequence_ref, start, end, peptide_ref \ +FROM peptide_evidence e JOIN peptides p on e.peptide_ref = p.id \ +WHERE isDecoy = 'false' AND p.sequence = ?\ +""" + +map_stmt = """ +SELECT name, chrom, start, end, strand, cds_start, cds_end \ +FROM feature_cds_map \ +WHERE name = ? \ +AND cds_end >= ? AND cds_start <= ? \ +ORDER BY cds_start\ +""" def main(): - conn = sqlite3.connect(sys.argv[2]) - c = conn.cursor() - c.execute("DROP table if exists novel") - conn.commit() - c.execute("CREATE TABLE novel(peptide text)") - pepfile = open(sys.argv[1],"r") - - pep_seq = [] + parser = argparse.ArgumentParser(description='BED file for peptides') + parser.add_argument('peptides_file', + metavar='peptides.tabular', + type=argparse.FileType('r'), + help='List of peptides, one per line') + parser.add_argument('mz_to_sqlite', + metavar='mz.sqlite', + help='mz_to_sqlite sqlite database') + parser.add_argument('genome_mapping', + metavar='genome_mapping.sqlite', + help='genome_mapping sqlite database') + parser.add_argument('bed_file', + metavar='peptides.bed', + type=argparse.FileType('w'), + help='BED file of peptide genomic locations') + parser.add_argument('-a', '--accession', + action='store_true', + help='Append the accession to the peptide for BED name') + parser.add_argument('-d', '--debug', + action='store_true', + help='Debug') + args = parser.parse_args() + + pconn = sqlite3.connect(args.mz_to_sqlite) + pc = pconn.cursor() + mconn = sqlite3.connect(args.genome_mapping) + mc = mconn.cursor() + outfh = args.bed_file + pepfile = args.peptides_file for seq in pepfile.readlines(): seq = seq.strip() - pep_seq.append(tuple([seq])) - - c.executemany("insert into novel(peptide) values(?)", pep_seq) - conn.commit() - - c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id") - rows = c.fetchall() - - conn1 = sqlite3.connect(sys.argv[3]) - c1 = conn1.cursor() - - outfh = open(sys.argv[4], "w") - - master_dict = {} - for each in rows: - peptide = each[0] - acc = each[1] - acc_seq = each[2] - - c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'") - coordinates = c1.fetchall() - - if len(coordinates) != 0: - pep_start = 0 - pep_end = 0 - flag = 0 - splice_flag = 0 - spliced_peptide = [] - for each_entry in coordinates: - chromosome = each_entry[0] - start = int(each_entry[1]) - end = int(each_entry[2]) - strand = each_entry[4] - cds_start = int(each_entry[5]) - cds_end = int(each_entry[6]) - pep_pos_start = (acc_seq.find(peptide)*3) - pep_pos_end = pep_pos_start + (len(peptide)*3) - if pep_pos_start >= cds_start and pep_pos_end <= cds_end: - if strand == "+": - pep_start = start + pep_pos_start - cds_start - pep_end = start + pep_pos_end - cds_start - pep_thick_start = 0 - pep_thick_end = len(peptide) - flag == 1 - else: - pep_end = end - pep_pos_start + cds_start - pep_start = end - pep_pos_end + cds_start - pep_thick_start = 0 - pep_thick_end = len(peptide) - flag == 1 - spliced_peptide = [] - splice_flag = 0 + pc.execute(pep_stmt, (seq,)) + pep_refs = pc.fetchall() + for pep_ref in pep_refs: + (acc, pep_start, pep_end, pep_seq) = pep_ref + cds_start = (pep_start - 1) * 3 + cds_end = pep_end * 3 + if args.debug: + print('%s\t%s\t%s\t%d\t%d' % (acc, pep_start, pep_end, cds_start, cds_end), file=sys.stdout) + mc.execute(map_stmt, (acc, cds_start, cds_end)) + exons = mc.fetchall() + if args.debug: + print('\n'.join([str(e) for e in exons]), file=sys.stdout) + if exons: + chrom = exons[0][1] + strand = exons[0][4] + if strand == '+': + start = exons[0][2] + cds_start + end = exons[-1][2] + cds_end - exons[-1][5] + blk_start = [] + blk_size = [] + for exon in exons: + offset = cds_start if cds_start > exon[5] else 0 + bstart = exon[2] + offset + bsize = min(cds_end, exon[6]) - max(cds_start, exon[5]) + if args.debug: + print('bstart %d\tbsize %d\t %d' % (bstart, bsize, offset), file=sys.stdout) + blk_start.append(bstart - start) + blk_size.append(bsize) else: - if flag == 0: - if strand == "+": - if pep_pos_start >= cds_start and pep_pos_start <= cds_end and pep_pos_end > cds_end: - pep_start = start + pep_pos_start - cds_start - pep_end = end - pep_thick_start = 0 - pep_thick_end = (pep_end-pep_start) - spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) - splice_flag = splice_flag + 1 - if splice_flag == 2: - flag = 1 - elif pep_pos_end >= cds_start and pep_pos_end <= cds_end and pep_pos_start < cds_start: - pep_start = start - pep_end = start + pep_pos_end - cds_start - pep_thick_start = (len(peptide)*3)-(pep_end-pep_start) - pep_thick_end = (len(peptide)*3) - spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) - splice_flag = splice_flag + 1 - if splice_flag == 2: - flag = 1 - else: - pass - else: - if pep_pos_start >= cds_start and pep_pos_start <= cds_end and pep_pos_end >= cds_end: - pep_start = start - pep_end = end - pep_pos_start - cds_start - pep_thick_start = 0 - pep_thick_end = (pep_end-pep_start) - spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) - splice_flag = splice_flag + 1 - if splice_flag == 2: - flag = 1 - elif pep_pos_end >= cds_start and pep_pos_end <= cds_end and pep_pos_start <= cds_start: - pep_start = end - pep_pos_end + cds_start - pep_end = end - pep_thick_start = (len(peptide)*3)-(pep_end-pep_start) - pep_thick_end = (len(peptide)*3) - spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) - splice_flag = splice_flag + 1 - if splice_flag == 2: - flag = 1 - else: - pass - - if len(spliced_peptide) == 0: - if strand == "+": - bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"] - else: - bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"] - outfh.write("\t".join(bed_line)+"\n") - else: - if strand == "+": - pep_entry = spliced_peptide - pep_start = min([pep_entry[0][0], pep_entry[1][0]]) - pep_end = max([pep_entry[0][1], pep_entry[1][1]]) - blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))] - blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))] - bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)] - outfh.write("\t".join(bed_line)+"\n") - else: - pep_entry = spliced_peptide - pep_start = min([pep_entry[0][0], pep_entry[1][0]]) - pep_end = max([pep_entry[0][1], pep_entry[1][1]]) - blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))] - blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))] - bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)] - outfh.write("\t".join(bed_line)+"\n") - c.execute("DROP table novel") - conn.commit() - conn.close() - conn1.close() + start = exons[-1][2] + exons[-1][6] - cds_end + end = exons[0][3] - cds_start + exons[0][5] + blk_start = [] + blk_size = [] + for exon in reversed(exons): + bstart = exon[2] + exon[6] - min(exon[6], cds_end) + bsize = min(cds_end, exon[6]) - max(cds_start, exon[5]) + # bend = exon[3] - (exon[5] - max(exon[5], cds_start)) + bend = exon[3] - min(cds_start - exon[5], cds_start) + bend = exon[3] - bsize + if args.debug: + print('bstart %d\tbsize %d\tbend %d' % (bstart, bsize, bend), file=sys.stdout) + blk_start.append(bstart - start) + blk_size.append(bsize) + bed_line = [str(chrom), str(start), str(end), + '_'.join([seq, acc]) if args.accession else seq, + '255', strand, + str(start), str(end), + '0,0,0', + str(len(blk_start)), + ','.join([str(b) for b in blk_size]), + ','.join([str(b) for b in blk_start])] + if args.debug: + print('\t'.join(bed_line), file=sys.stdout) + outfh.write('\t'.join(bed_line) + '\n') + pconn.close() + mconn.close() outfh.close() pepfile.close() - - return None -if __name__ == "__main__": + + +if __name__ == '__main__': main()