Mercurial > repos > pravs > peptidegenomiccoordinate

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/peptideGenomicCoodinate.py	Tue Dec 18 16:22:29 2018 -0500
@@ -0,0 +1,150 @@
+
+#
+# Author: Praveen Kumar
+# University of Minnesota
+#
+# Gets the genomic coordinates of peptides from the protein genomic mapping sqlite file (which is derived from https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec)
+#
+# Usage: python peptideGenomicCoodinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed>
+#
+#
+
+def main():
+    import sys
+    import sqlite3
+    conn = sqlite3.connect(sys.argv[2])
+    c = conn.cursor()
+    c.execute("DROP table if exists novel")
+    conn.commit()
+    c.execute("CREATE TABLE novel(peptide text)")
+    pepfile = open(sys.argv[1],"r")
+
+    for seq in pepfile.readlines():
+        seq = seq.strip()
+        c.execute('INSERT INTO novel VALUES ("'+seq+'")')
+        conn.commit()
+
+    c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id")
+    rows = c.fetchall()
+
+    conn1 = sqlite3.connect(sys.argv[3])
+    c1 = conn1.cursor()
+
+    outfh = open(sys.argv[4], "w")
+
+    master_dict = {}
+    for each in rows:
+        peptide = each[0]
+        acc = each[1]
+        acc_seq = each[2]
+
+        c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'")
+        coordinates = c1.fetchall()
+
+        if len(coordinates) != 0:
+            pep_start = 0
+            pep_end = 0
+            flag = 0
+            splice_flag = 0
+            spliced_peptide = []
+            for each_entry in coordinates:
+                chromosome = each_entry[0]
+                start = int(each_entry[1])
+                end = int(each_entry[2])
+                strand = each_entry[4]
+                cds_start = int(each_entry[5])
+                cds_end = int(each_entry[6])
+                pep_pos_start = (acc_seq.find(peptide)*3)
+                pep_pos_end = pep_pos_start + (len(peptide)*3)
+                if (pep_pos_start >= cds_start) and (pep_pos_end <= cds_end):
+                    if strand == "+":
+                        pep_start = start + pep_pos_start - cds_start
+                        pep_end = start + pep_pos_end - cds_start
+                        pep_thick_start = 0
+                        pep_thick_end = len(peptide)
+                        flag == 1
+                    else:
+                        pep_end = end - pep_pos_start + cds_start
+                        pep_start = end - pep_pos_end + cds_start
+                        pep_thick_start = 0
+                        pep_thick_end = len(peptide)
+                        flag == 1
+                    spliced_peptide = []
+                    splice_flag = 0
+                else:
+                    if flag == 0:
+                        if strand == "+":
+                            if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end > cds_end):
+                                pep_start = start + pep_pos_start - cds_start
+                                pep_end = end
+                                pep_thick_start = 0
+                                pep_thick_end = (pep_end-pep_start)
+                                spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
+                                splice_flag = splice_flag + 1
+                                if splice_flag == 2:
+                                    flag = 1
+                            elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start < cds_start):
+                                pep_start = start
+                                pep_end = start + pep_pos_end - cds_start
+                                pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
+                                pep_thick_end = (len(peptide)*3)
+                                spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
+                                splice_flag = splice_flag + 1
+                                if splice_flag == 2:
+                                    flag = 1
+                            else:
+                                pass
+                        else:
+                            if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end >= cds_end):
+                                pep_start = start
+                                pep_end = end - pep_pos_start - cds_start
+                                pep_thick_start = 0
+                                pep_thick_end = (pep_end-pep_start)
+                                spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
+                                splice_flag = splice_flag + 1
+                                if splice_flag == 2:
+                                    flag = 1
+                            elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start <= cds_start):
+                                pep_start = end - pep_pos_end + cds_start
+                                pep_end = end
+                                pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
+                                pep_thick_end = (len(peptide)*3)
+                                spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
+                                splice_flag = splice_flag + 1
+                                if splice_flag == 2:
+                                    flag = 1
+                            else:
+                                pass
+            if len(spliced_peptide) == 0:
+                if strand == "+":
+                    bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
+                else:
+                    bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
+                outfh.write("\t".join(bed_line)+"\n")
+            else:
+                if strand == "+":
+                    pep_entry = spliced_peptide
+                    pep_start = min([pep_entry[0][0], pep_entry[1][0]])
+                    pep_end = max([pep_entry[0][1], pep_entry[1][1]])
+                    blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
+                    blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
+                    bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
+                    outfh.write("\t".join(bed_line)+"\n")
+                else:
+                    pep_entry = spliced_peptide
+                    pep_start = min([pep_entry[0][0], pep_entry[1][0]])
+                    pep_end = max([pep_entry[0][1], pep_entry[1][1]])
+                    blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
+                    blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
+                    bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
+                    outfh.write("\t".join(bed_line)+"\n")
+    c.execute("DROP table novel")
+    conn.commit()
+    conn.close()
+    conn1.close()
+    outfh.close()
+    pepfile.close()
+
+    return None
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/peptideGenomicCoodinate.xml	Tue Dec 18 16:22:29 2018 -0500
@@ -0,0 +1,56 @@
+<tool id="peptidegenomiccoodinate" name="Peptide Genomic Coodinate" version="0.1.1">
+    <description>Get genomic location/coordinate of peptides using mzsqlite DB and genomic mapping sqlite DB</description>
+    <requirements>
+        <requirement type="package" version="2.7.9">python</requirement>
+        <!--<requirement type="package" version="3.26.0">sqlite</requirement>-->
+    </requirements>
+    <command detect_errors="aggressive"><![CDATA[
+        python '$__tool_directory__/peptideGenomicCoodinate.py'
+        '$peptideinput'
+        '$mzsqlite'
+        '$mapping'
+        '$peptide_bed'
+    ]]></command>
+    <inputs>
+        <param type="data" name="peptideinput" format="tabular" label="Peptide List (without any header line)"/>
+        <param type="data" name="mzsqlite" format="sqlite" label="mz to sqlite (mzsqlite) file"/>
+        <param type="data" name="mapping" format="sqlite" label="genomic mapping sqlite file"/>
+    </inputs>
+    <outputs>
+        <data format="bed" name="peptide_bed" label="${tool.name} on ${on_string}">
+            <actions>
+                <action name="column_names" type="metadata" default="chrom,chromStart,chromStop,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts"/>
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="peptideinput" value="peptides.tabular"/>
+            <param name="mzsqlite" value="test_mz_to_sqlite.sqlite"/>
+            <param name="mapping" value="test_genomic_mapping_sqlite.sqlite"/>
+            <output name="peptide_bed">
+                <assert_contents>
+                    <has_text text="115176449" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+        **PeptideGenomicCoodinate**
+        Gets the genomic coordinates of peptides based on the information in the mzsqlite and genomic mapping sqlite files. This tool is useful in a proteogenomics workflow.
+        The program loads two sqlite databases (the mzsqlite and genomic mapping sqlite files) and calculates the genomic coordinates of the peptides provided as input. It outputs a BED file for the peptides.
+
+        Input: Peptide list file, mzsqlite sqlite DB file, and genomic mapping sqlite DB file
+        Output: Tabular BED file with all the columns
+
+    ]]></help>
+    <citations>
+      <citation type="bibtex">
+@misc{peptidegenomiccoodinate,
+    author={Kumar, Praveen},
+    year={2018},
+    title={PeptideGenomicCoordinate}
+}
+      </citation>
+    </citations>
+</tool>
--- a/peptideGenomicCoordinate.py	Sun Jun 17 04:55:42 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,150 +0,0 @@
-
-#
-# Author: Praveen Kumar
-# University of Minnesota
-# Galaxy-P Team
-# Get peptide's genomic coordinate from the protein's genomic mapping sqlite file (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec)
-#
-# python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed>
-#
-#
-
-def main():
-    import sys
-    import sqlite3
-    conn = sqlite3.connect(sys.argv[2])
-    c = conn.cursor()
-    c.execute("DROP table if exists novel")
-    conn.commit()
-    c.execute("CREATE TABLE novel(peptide text)")
-    pepfile = open(sys.argv[1],"r")
-
-    for seq in pepfile.readlines():
-        seq = seq.strip()
-        c.execute('INSERT INTO novel VALUES ("'+seq+'")')
-        conn.commit()
-
-    c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id")
-    rows = c.fetchall()
-
-    conn1 = sqlite3.connect(sys.argv[3])
-    c1 = conn1.cursor()
-
-    outfh = open(sys.argv[4], "w")
-
-    master_dict = {}
-    for each in rows:
-        peptide = each[0]
-        acc = each[1]
-        acc_seq = each[2]
-
-        c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'")
-        coordinates = c1.fetchall()
-
-        if len(coordinates) != 0:
-            pep_start = 0
-            pep_end = 0
-            flag = 0
-            splice_flag = 0
-            spliced_peptide = []
-            for each_entry in coordinates:
-                chromosome = each_entry[0]
-                start = int(each_entry[1])
-                end = int(each_entry[2])
-                strand = each_entry[4]
-                cds_start = int(each_entry[5])
-                cds_end = int(each_entry[6])
-                pep_pos_start = (acc_seq.find(peptide)*3)
-                pep_pos_end = pep_pos_start + (len(peptide)*3)
-                if (pep_pos_start >= cds_start) and (pep_pos_end <= cds_end):
-                    if strand == "+":
-                        pep_start = start + pep_pos_start - cds_start
-                        pep_end = start + pep_pos_end - cds_start
-                        pep_thick_start = 0
-                        pep_thick_end = len(peptide)
-                        flag == 1
-                    else:
-                        pep_end = end - pep_pos_start + cds_start
-                        pep_start = end - pep_pos_end + cds_start
-                        pep_thick_start = 0
-                        pep_thick_end = len(peptide)
-                        flag == 1
-                    spliced_peptide = []
-                    splice_flag = 0
-                else:
-                    if flag == 0:
-                        if strand == "+":
-                            if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end > cds_end):
-                                pep_start = start + pep_pos_start - cds_start
-                                pep_end = end
-                                pep_thick_start = 0
-                                pep_thick_end = (pep_end-pep_start)
-                                spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
-                                splice_flag = splice_flag + 1
-                                if splice_flag == 2:
-                                    flag = 1
-                            elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start < cds_start):
-                                pep_start = start
-                                pep_end = start + pep_pos_end - cds_start
-                                pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
-                                pep_thick_end = (len(peptide)*3)
-                                spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
-                                splice_flag = splice_flag + 1
-                                if splice_flag == 2:
-                                    flag = 1
-                            else:
-                                pass
-                        else:
-                            if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end >= cds_end):
-                                pep_start = start
-                                pep_end = end - pep_pos_start - cds_start
-                                pep_thick_start = 0
-                                pep_thick_end = (pep_end-pep_start)
-                                spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
-                                splice_flag = splice_flag + 1
-                                if splice_flag == 2:
-                                    flag = 1
-                            elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start <= cds_start):
-                                pep_start = end - pep_pos_end + cds_start
-                                pep_end = end
-                                pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
-                                pep_thick_end = (len(peptide)*3)
-                                spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
-                                splice_flag = splice_flag + 1
-                                if splice_flag == 2:
-                                    flag = 1
-                            else:
-                                pass
-            if len(spliced_peptide) == 0:
-                if strand == "+":
-                    bed_line = [str(chromosome), str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
-                else:
-                    bed_line = [str(chromosome), str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
-                outfh.write("\t".join(bed_line)+"\n")
-            else:
-                if strand == "+":
-                    pep_entry = spliced_peptide
-                    pep_start = min([pep_entry[0][0], pep_entry[1][0]])
-                    pep_end = max([pep_entry[0][1], pep_entry[1][1]])
-                    blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
-                    blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
-                    bed_line = [str(chromosome), str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
-                    outfh.write("\t".join(bed_line)+"\n")
-                else:
-                    pep_entry = spliced_peptide
-                    pep_start = min([pep_entry[0][0], pep_entry[1][0]])
-                    pep_end = max([pep_entry[0][1], pep_entry[1][1]])
-                    blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
-                    blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
-                    bed_line = [str(chromosome), str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
-                    outfh.write("\t".join(bed_line)+"\n")
-    c.execute("DROP table novel")
-    conn.commit()
-    conn.close()
-    conn1.close()
-    outfh.close()
-    pepfile.close()
-
-    return None
-if __name__ == "__main__":
-    main()
--- a/peptideGenomicCoordinate.xml	Sun Jun 17 04:55:42 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,57 +0,0 @@
-<tool id="peptidegenomiccoordinate" name="Peptide Genomic Coordinate" version="0.1.2">
-    <description>Get genomic location/coordinate of peptides using mzsqlite DB and genomic mapping sqlite DB</description>
-    <requirements>
-        <requirement type="package" version="2.7.9">python</requirement>
-    </requirements>
-    <command detect_errors="aggressive"><![CDATA[
-        python '$__tool_directory__/peptideGenomicCoordinate.py' '$peptideinput' '$mzsqlite' '$mapping' '$peptide_bed'
-    ]]></command>
-
-    <inputs>
-        <param type="data" name="peptideinput" format="tabular" label="Peptide List (without any header line)"/>
-        <param type="data" name="mzsqlite" format="sqlite" label="mz to sqlite (mzsqlite) file"/>
-        <param type="data" name="mapping" format="sqlite" label="genomic mapping sqlite file"/>
-    </inputs>
-
-    <outputs>
-        <data format="bed" name="peptide_bed" label="${tool.name} on ${on_string}">
-            <actions>
-                <action name="column_names" type="metadata" default="chrom,chromStart,chromStop,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts"/>
-            </actions>
-        </data>
-    </outputs>
-
-    <tests>
-        <test>
-            <param name="peptide_input" value="peptides.tabular"/>
-            <param name="sqlite" value="test_mz_to_sqlite.sqlite"/>
-            <param name="sqlite" value="test_genomic_mapping_sqlite.sqlite"/>
-            <output name="peptide_bed" file="peptides_BED.bed"/>
-        </test>
-    </tests>
-    <help><![CDATA[
-        **PeptideGenomicCoordinate**
-
-        Gets genomic coordinate of peptides based on the information in mzsqlite and genomic mapping sqlite files.
-        This program loads two sqlite databases (mzsqlite and genomic mapping sqlite files) and calculates the genomic coordinates of the peptides provided as input. This outputs bed file for peptides.
-
-        Input: Peptide list file, mzsqlite sqlite DB file, and genomic mapping sqlite DB file
-        Output: Tabular BED file with all the columns
-
-        mzsqlite file from: https://toolshed.g2.bx.psu.edu/repos/galaxyp/mz_to_sqlite/mz_to_sqlite/2.0.0
-        genome mapping sqlite file from: https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec
-
-
-    P.S. : Requires sqlite
-
-    ]]></help>
-    <citations>
-      <citation type="bibtex">
-@misc{peptidegenomiccoodinate,
-    author={Kumar, Praveen},
-    year={2018},
-    title={PeptideGenomicCoordinate}
-}
-      </citation>
-    </citations>
-</tool>
--- a/test-data/peptides.tabular	Sun Jun 17 04:55:42 2018 -0400
+++ b/test-data/peptides.tabular	Tue Dec 18 16:22:29 2018 -0500
@@ -1,8 +1,4 @@
 AVDPDSSAEASGLR
-AVDPDSSAEASGLRAQDR
 DGDLENPVLYSGAVK
 DSGASGSILEASAAR
 ELGSSDLTAR
-ESSREALVEPTSESPRPALAR
-NIYITLLSCFK
-SPYREFTDHLVK
--- a/test-data/peptides_BED.bed	Sun Jun 17 04:55:42 2018 -0400
+++ b/test-data/peptides_BED.bed	Tue Dec 18 16:22:29 2018 -0500
@@ -1,8 +1,4 @@
 chr11	115176449	115176491	AVDPDSSAEASGLR	255	+	115176449	115176491	0	1	42	0
-chr11	115176449	115176503	AVDPDSSAEASGLRAQDR	255	+	115176449	115176503	0	1	54	0
 chr5	121445444	121445489	DGDLENPVLYSGAVK	255	-	121445444	121445489	0	1	45	0
 chr17	22866997	22867042	DSGASGSILEASAAR	255	-	22866997	22867042	0	1	45	0
 chr2	91155262	91155292	ELGSSDLTAR	255	-	91155262	91155292	0	1	30	0
-chr11	115180006	115180069	ESSREALVEPTSESPRPALAR	255	+	115180006	115180069	0	1	63	0
-chr4	58482350	58482383	NIYITLLSCFK	255	+	58482350	58482383	0	1	33	0
-chr17	24721702	24721826	SPYREFTDHLVK	255	+	24721702	24721826	0	2	12,24	0,100
Binary file test-data/test_genomic_mapping_sqlite.sqlite has changed
Binary file test-data/test_mz_to_sqlite.sqlite has changed