Repository 'pep_pointer'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/pep_pointer

Changeset 0:149ed6a9680f (2017-12-29)
Next changeset 1:dc1b0f54f626 (2018-01-03)
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit ac27a958fcb897c3cb56db313ebd282805b01103
added:
pep_pointer.py
pep_pointer.xml
test-data/classified_novel_peptides.txt
test-data/make_test_data.sh
test-data/mus17.gtf
test-data/novel_peptides_17.bed
tool-data/pep_pointer.loc.sample
tool_data_table_conf.xml.sample
b
diff -r 000000000000 -r 149ed6a9680f pep_pointer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pep_pointer.py Fri Dec 29 12:37:22 2017 -0500
[
b'@@ -0,0 +1,221 @@\n+\n+# \n+# Author: Praveen Kumar\n+# Updated: Nov 8th, 2017\n+# \n+# \n+# \n+\n+import re\n+\n+\n+def main():\n+    import sys\n+    if len(sys.argv) == 4:\n+        inputFile = sys.argv\n+        infh = open(inputFile[1], "r")\n+        # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r")\n+        \n+        gtf = {}\n+        gtf_transcript = {}\n+        gtf_gene = {}\n+        for each in infh.readlines():\n+            a = each.split("\\t")\n+            if re.search("^[^#]", each):\n+                if re.search("gene_biotype \\"protein_coding\\"", a[8]) and int(a[4].strip()) != int(a[3].strip()):\n+                    type = a[2].strip()\n+                    if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr":\n+                        chr = "chr" + a[0].strip()\n+                        strand = a[6].strip()\n+                        if strand == "+":\n+                            start = a[3].strip()\n+                            end = a[4].strip()\n+                        elif strand == "-":\n+                            if int(a[4].strip()) > int(a[3].strip()):\n+                                start = a[3].strip()\n+                                end = a[4].strip()\n+                            elif int(a[4].strip()) < int(a[3].strip()):\n+                                start = a[4].strip()\n+                                end = a[3].strip()\n+                            else:\n+                                print "Something fishy in start end coordinates"\n+                        else:\n+                            print "Something fishy in reading"\n+                        if not gtf.has_key(strand):\n+                            gtf[strand] = {}\n+                        if not gtf[strand].has_key(type):\n+                            gtf[strand][type] = []\n+                        b = re.search("gene_id \\"(.+?)\\";", a[8].strip())\n+                        gene 
= b.group(1)\n+                        if type == "gene":\n+                            transcript = ""\n+                        else:\n+                            b = re.search("transcript_id \\"(.+?)\\";", a[8].strip())\n+                            transcript = b.group(1)\n+                        data = (chr, start, end, gene, transcript, strand, type)\n+                        gtf[strand][type].append(data)\n+                \n+                        if type == "exon":\n+                            if gtf_transcript.has_key(chr+"#"+strand):\n+                                if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene):\n+                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+                                else:\n+                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]\n+                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+                            else:\n+                                gtf_transcript[chr+"#"+strand] = {}\n+                                gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]\n+                                gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+                                gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+                \n+                        if type == "gene":\n+                            if gtf_gene.has_key(chr+"#"+strand):\n+                                gtf_gene[chr+"#"+strand][0].append(int(start))\n+                                gtf_gene[chr+"#"+strand][1].append(int(end))\n+                                
gtf_gene[chr+"#"+strand][2].append(gene)\n+                            else:\n+                              '..b'and text, type text)")\n+    \n+        for strand in gtf.keys():\n+            if strand == "+":\n+                st = "positive"\n+            elif strand == "-":\n+                st = "negative"\n+            else:\n+                print "Something fishy in writing . . ."\n+        \n+            for type in gtf[strand].keys():\n+                data = gtf[strand][type]\n+                c.executemany(\'INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)\', data)\n+            \n+        conn.commit()\n+    \n+        infh = open(inputFile[2], "r")\n+        # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r")\n+        data = infh.readlines()\n+        # output file\n+        outfh = open(inputFile[3], \'w\')\n+        # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w")\n+    \n+        for each in data:\n+            a = each.split("\\t")\n+            chr = a[0].strip()\n+            pep_start = a[1].strip()\n+            pep_end = a[2].strip()\n+            strand = a[5].strip()\n+            c.execute("select * from gtf_data where type = \'CDS\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+            rows = c.fetchall()\n+            if len(rows) > 0:\n+                outfh.write(each.strip() + "\\tCDS\\n")\n+            else:\n+                c.execute("select * from gtf_data where type = \'five_prime_utr\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+                rows = c.fetchall()\n+                if len(rows) > 0:\n+                    outfh.write(each.strip() + "\\tfive_prime_utr\\n")\n+                else:\n+                    c.execute("select * from gtf_data where type = \'three_prime_utr\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = 
\'"+strand+"\' ")\n+                    rows = c.fetchall()\n+                    if len(rows) > 0:\n+                        outfh.write(each.strip() + "\\tthree_prime_utr\\n")\n+                    else:\n+                        c.execute("select * from gtf_data where type = \'exon\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+                        rows = c.fetchall()\n+                        if len(rows) > 0:\n+                            outfh.write(each.strip() + "\\texon\\n")\n+                        else:\n+                            c.execute("select * from gtf_data where type = \'intron\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+                            rows = c.fetchall()\n+                            if len(rows) > 0:\n+                                outfh.write(each.strip() + "\\tintron\\n")\n+                            else:\n+                                c.execute("select * from gtf_data where type = \'gene\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+                                rows = c.fetchall()\n+                                if len(rows) > 0:\n+                                    outfh.write(each.strip() + "\\tgene\\n")\n+                                else:\n+                                    c.execute("select * from gtf_data where type = \'intergenic\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+                                    rows = c.fetchall()\n+                                    if len(rows) > 0:\n+                                        outfh.write(each.strip() + "\\tintergene\\n")\n+                                    else:\n+                                        outfh.write(each.strip() + "\\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next 
version)\\n")\n+    \n+        conn.close()\n+        outfh.close()\n+    else:\n+        print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>"\n+    return None\n+\n+if __name__ == "__main__":\n+    main()\n+\n+\n+\n+\n+\n'
b
diff -r 000000000000 -r 149ed6a9680f pep_pointer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pep_pointer.xml Fri Dec 29 12:37:22 2017 -0500
[
@@ -0,0 +1,55 @@
+<tool id="pep_pointer" name="PepPointer" version="0.1.1">
+    <description>classify genomic location of peptides</description>
+    <requirements>
+        <requirement type="package" version="2.7.9">python</requirement>
+    </requirements>
+    <command detect_errors="aggressive"><![CDATA[
+        python '$__tool_directory__/pep_pointer.py' '$gtf_input.gtf' '$bed' '$classified'
+    ]]></command>
+    <inputs>
+        <conditional name="gtf_input">
+            <param type="select" name="gtf_source" label="Choose the source of the GTF file">
+                <option value="cached" selected="true">Built-in</option>
+                <option value="history">From history</option>
+            </param>
+            <when value="cached">
+                <param name="gtf" type="select" format="gtf" label="GTF file with the genome of interest">
+                    <options from_data_table="pep_pointer"/>
+                </param>
+            </when>
+            <when value="history">
+                <param type="data" name="gtf" format="gtf" label="GTF file with the genome of interest"/>
+            </when>
+        </conditional>
+        <param type="data" name="bed" format="bed" label="BED file with chromosomal coordinates of peptide"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="classified" label="${tool.name} on ${on_string}">
+            <actions>
+                <action name="column_names" type="metadata" default="chrom,chromStart,chromStop,name,score,strand,annotation"/>
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="gtf_source" value="history"/>
+            <param name="gtf" value="mus17.gtf"/>
+            <param name="bed" value="novel_peptides_17.bed"/>
+            <output name="classified" file="classified_novel_peptides.txt"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        **PepPointer**
+
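+        Given the chromosomal locations of peptides in a BED file, PepPointer classifies each peptide by the annotated GTF region on the same strand that fully contains it: CDS, five-prime UTR, three-prime UTR, exon, intron, gene, or intergene. Peptides that do not fall entirely within a single region are flagged for manual inspection.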
+    ]]></help>
+    <citations>
+      <citation type="bibtex">
+@misc{peppointer,
+    author={Kumar, Praveen},
+    year={2017},
+    title={PepPointer}
+}
+      </citation>
+    </citations>
+</tool>
b
diff -r 000000000000 -r 149ed6a9680f test-data/classified_novel_peptides.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/classified_novel_peptides.txt Fri Dec 29 12:37:22 2017 -0500
b
@@ -0,0 +1,2 @@
+chr17 48786451 48786471 NVLAAPR 0 + intergene
+chr17 25669965 25669988 SALVLAGR 0 - intergene
b
diff -r 000000000000 -r 149ed6a9680f test-data/make_test_data.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/make_test_data.sh Fri Dec 29 12:37:22 2017 -0500
b
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+python ../pep_pointer.py mus17.gtf novel_peptides_17.bed classified_novel_peptides.txt
b
diff -r 000000000000 -r 149ed6a9680f test-data/mus17.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mus17.gtf Fri Dec 29 12:37:22 2017 -0500
b
b'@@ -0,0 +1,70383 @@\n+17\thavana\tgene\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1";\n+17\thavana\ttranscript\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; transcript_id "ENSMUST00000089221"; transcript_version "2"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1"; transcript_name "Gm10232-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; havana_transcript "OTTMUST00000087860"; havana_transcript_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; transcript_id "ENSMUST00000089221"; transcript_version "2"; exon_number "1"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1"; transcript_name "Gm10232-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; havana_transcript "OTTMUST00000087860"; havana_transcript_version "1"; exon_id "ENSMUSE00000477530"; exon_version "3"; tag "basic"; transcript_support_level "NA";\n+17\thavana\tgene\t3053159\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1";\n+17\thavana\ttranscript\t3053159\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; 
havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3053159\t3053205\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000850577"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3053328\t3053510\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "2"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000870808"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3054028\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "3"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000852030"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\tgene\t3056687\t3057121\t.\t+\t.\tgene_id "ENSMUSG00000044697"; gene_version "5"; gene_name "Gm5479"; gene_source "havana"; 
gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034620"; havana_gene_version "1";\n+17'..b'e "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94873986\t94874112\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "1"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00000969268"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94873986\t94874112\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "1"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94874302\t94874362\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "2"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001063304"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94874302\t94874362\t.\t+\t2\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "2"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; 
transcript_support_level "5";\n+17\tensembl\texon\t94875526\t94875820\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "3"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001002010"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94875526\t94875820\t.\t+\t1\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "3"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94876409\t94877497\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001070453"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94876409\t94877494\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tstop_codon\t94877495\t94877497\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; 
gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5";\n'
b
diff -r 000000000000 -r 149ed6a9680f test-data/novel_peptides_17.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/novel_peptides_17.bed Fri Dec 29 12:37:22 2017 -0500
b
@@ -0,0 +1,2 @@
+chr17 48786451 48786471 NVLAAPR 0 +
+chr17 25669965 25669988 SALVLAGR 0 -
b
diff -r 000000000000 -r 149ed6a9680f tool-data/pep_pointer.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/pep_pointer.loc.sample Fri Dec 29 12:37:22 2017 -0500
b
@@ -0,0 +1,4 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of gtf data files.
+#name value
+#mouse /path/to/directory/with/mouse.gtf
b
diff -r 000000000000 -r 149ed6a9680f tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Fri Dec 29 12:37:22 2017 -0500
b
@@ -0,0 +1,6 @@
+<tables>
+    <table name="pep_pointer" comment_char="#">
+        <columns>name,value</columns>
+        <file path="tool-data/pep_pointer.loc"/>
+    </table>
+</tables>