Galaxy |

Changeset 0:149ed6a9680f (2017-12-29)

Next changeset 1:dc1b0f54f626 (2018-01-03)

Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit ac27a958fcb897c3cb56db313ebd282805b01103

added:
pep_pointer.py
pep_pointer.xml
test-data/classified_novel_peptides.txt
test-data/make_test_data.sh
test-data/mus17.gtf
test-data/novel_peptides_17.bed
tool-data/pep_pointer.loc.sample
tool_data_table_conf.xml.sample

diff -r 000000000000 -r 149ed6a9680f pep_pointer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pep_pointer.py Fri Dec 29 12:37:22 2017 -0500

[

b'@@ -0,0 +1,221 @@\n+\n+# \n+# Author: Praveen Kumar\n+# Updated: Nov 8th, 2017\n+# \n+# \n+# \n+\n+import re\n+\n+\n+def main():\n+ import sys\n+ if len(sys.argv) == 4:\n+ inputFile = sys.argv\n+ infh = open(inputFile[1], "r")\n+ # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r")\n+ \n+ gtf = {}\n+ gtf_transcript = {}\n+ gtf_gene = {}\n+ for each in infh.readlines():\n+ a = each.split("\\t")\n+ if re.search("^[^#]", each):\n+ if re.search("gene_biotype \\"protein_coding\\"", a[8]) and int(a[4].strip()) != int(a[3].strip()):\n+ type = a[2].strip()\n+ if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr":\n+ chr = "chr" + a[0].strip()\n+ strand = a[6].strip()\n+ if strand == "+":\n+ start = a[3].strip()\n+ end = a[4].strip()\n+ elif strand == "-":\n+ if int(a[4].strip()) > int(a[3].strip()):\n+ start = a[3].strip()\n+ end = a[4].strip()\n+ elif int(a[4].strip()) < int(a[3].strip()):\n+ start = a[4].strip()\n+ end = a[3].strip()\n+ else:\n+ print "Something fishy in start end coordinates"\n+ else:\n+ print "Something fishy in reading"\n+ if not gtf.has_key(strand):\n+ gtf[strand] = {}\n+ if not gtf[strand].has_key(type):\n+ gtf[strand][type] = []\n+ b = re.search("gene_id \\"(.+?)\\";", a[8].strip())\n+ gene = b.group(1)\n+ if type == "gene":\n+ transcript = ""\n+ else:\n+ b = re.search("transcript_id \\"(.+?)\\";", a[8].strip())\n+ transcript = b.group(1)\n+ data = (chr, start, end, gene, transcript, strand, type)\n+ gtf[strand][type].append(data)\n+ \n+ if type == "exon":\n+ if gtf_transcript.has_key(chr+"#"+strand):\n+ if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene):\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+ else:\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+ else:\n+ gtf_transcript[chr+"#"+strand] = {}\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+ \n+ if type == "gene":\n+ if gtf_gene.has_key(chr+"#"+strand):\n+ gtf_gene[chr+"#"+strand][0].append(int(start))\n+ gtf_gene[chr+"#"+strand][1].append(int(end))\n+ gtf_gene[chr+"#"+strand][2].append(gene)\n+ else:\n+ '..b'and text, type text)")\n+ \n+ for strand in gtf.keys():\n+ if strand == "+":\n+ st = "positive"\n+ elif strand == "-":\n+ st = "negative"\n+ else:\n+ print "Something fishy in writing . . ."\n+ \n+ for type in gtf[strand].keys():\n+ data = gtf[strand][type]\n+ c.executemany(\'INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)\', data)\n+ \n+ conn.commit()\n+ \n+ infh = open(inputFile[2], "r")\n+ # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r")\n+ data = infh.readlines()\n+ # output file\n+ outfh = open(inputFile[3], \'w\')\n+ # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w")\n+ \n+ for each in data:\n+ a = each.split("\\t")\n+ chr = a[0].strip()\n+ pep_start = a[1].strip()\n+ pep_end = a[2].strip()\n+ strand = a[5].strip()\n+ c.execute("select * from gtf_data where type = \'CDS\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tCDS\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'five_prime_utr\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tfive_prime_utr\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'three_prime_utr\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tthree_prime_utr\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'exon\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\texon\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'intron\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tintron\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'gene\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tgene\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'intergenic\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tintergene\\n")\n+ else:\n+ outfh.write(each.strip() + "\\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\\n")\n+ \n+ conn.close()\n+ outfh.close()\n+ else:\n+ print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>"\n+ return None\n+\n+if __name__ == "__main__":\n+ main()\n+\n+\n+\n+\n+\n'

diff -r 000000000000 -r 149ed6a9680f pep_pointer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pep_pointer.xml Fri Dec 29 12:37:22 2017 -0500

[

@@ -0,0 +1,55 @@
+<tool id="pep_pointer" name="PepPointer" version="0.1.1">
+    <description>classify genomic location of peptides</description>
+    <requirements>
+        <requirement type="package" version="2.7.9">python</requirement>
+    </requirements>
+    <command detect_errors="aggressive"><![CDATA[
+        python '$__tool_directory__/pep_pointer.py' '$gtf_input.gtf' '$bed' '$classified'
+    ]]></command>
+    <inputs>
+        <conditional name="gtf_input">
+            <param type="select" name="gtf_source" label="Choose the source of the GTF file">
+                <option value="cached" selected="true">Built-in</option>
+                <option value="history">From history</option>
+            </param>
+            <when value="cached">
+                <param name="gtf" type="select" format="gtf" label="GTF file with the genome of interest">
+                    <options from_data_table="pep_pointer"/>
+                </param>
+            </when>
+            <when value="history">
+                <param type="data" name="gtf" format="gtf" label="GTF file with the genome of interest"/>
+            </when>
+        </conditional>
+        <param type="data" name="bed" format="bed" label="BED file with chromosomal coordinates of peptide"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="classified" label="${tool.name} on ${on_string}">
+            <actions>
+                <action name="column_names" type="metadata" default="chrom,chromStart,chromStop,name,score,strand,annotation"/>
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="gtf_source" value="history"/>
+            <param name="gtf" value="mus17.gtf"/>
+            <param name="bed" value="novel_peptides_17.bed"/>
+            <output name="classified" file="classified_novel_peptides.txt"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        **PepPointer**
+
+        Given chromosomal locations of peptides in a BED file, PepPointer classifies them as CDS, UTR, exon, intron, or intergene.
+    ]]></help>
+    <citations>
+      <citation type="bibtex">
+@misc{peppointer,
+    author={Kumar, Praveen},
+    year={2017},
+    title={PepPointer}
+}
+      </citation>
+    </citations>
+</tool>

diff -r 000000000000 -r 149ed6a9680f test-data/classified_novel_peptides.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/classified_novel_peptides.txt Fri Dec 29 12:37:22 2017 -0500

@@ -0,0 +1,2 @@
+chr17 48786451 48786471 NVLAAPR 0 + intergene
+chr17 25669965 25669988 SALVLAGR 0 - intergene

diff -r 000000000000 -r 149ed6a9680f test-data/make_test_data.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/make_test_data.sh Fri Dec 29 12:37:22 2017 -0500

@@ -0,0 +1,3 @@
+#!/bin/bash
+
+python ../pep_pointer.py mus17.gtf novel_peptides_17.bed classified_novel_peptides.txt

diff -r 000000000000 -r 149ed6a9680f test-data/mus17.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mus17.gtf Fri Dec 29 12:37:22 2017 -0500

b'@@ -0,0 +1,70383 @@\n+17\thavana\tgene\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1";\n+17\thavana\ttranscript\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; transcript_id "ENSMUST00000089221"; transcript_version "2"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1"; transcript_name "Gm10232-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; havana_transcript "OTTMUST00000087860"; havana_transcript_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; transcript_id "ENSMUST00000089221"; transcript_version "2"; exon_number "1"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1"; transcript_name "Gm10232-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; havana_transcript "OTTMUST00000087860"; havana_transcript_version "1"; exon_id "ENSMUSE00000477530"; exon_version "3"; tag "basic"; transcript_support_level "NA";\n+17\thavana\tgene\t3053159\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1";\n+17\thavana\ttranscript\t3053159\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3053159\t3053205\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000850577"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3053328\t3053510\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "2"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000870808"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3054028\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "3"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000852030"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\tgene\t3056687\t3057121\t.\t+\t.\tgene_id "ENSMUSG00000044697"; gene_version "5"; gene_name "Gm5479"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034620"; havana_gene_version "1";\n+17'..b'e "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94873986\t94874112\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "1"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00000969268"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94873986\t94874112\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "1"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94874302\t94874362\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "2"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001063304"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94874302\t94874362\t.\t+\t2\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "2"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94875526\t94875820\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "3"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001002010"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94875526\t94875820\t.\t+\t1\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "3"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94876409\t94877497\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001070453"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94876409\t94877494\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tstop_codon\t94877495\t94877497\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5";\n'

diff -r 000000000000 -r 149ed6a9680f test-data/novel_peptides_17.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/novel_peptides_17.bed Fri Dec 29 12:37:22 2017 -0500

@@ -0,0 +1,2 @@
+chr17 48786451 48786471 NVLAAPR 0 +
+chr17 25669965 25669988 SALVLAGR 0 -

diff -r 000000000000 -r 149ed6a9680f tool-data/pep_pointer.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/pep_pointer.loc.sample Fri Dec 29 12:37:22 2017 -0500

@@ -0,0 +1,4 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of gtf data files.
+#name value
+#mouse /path/to/directory/with/mouse.gtf

diff -r 000000000000 -r 149ed6a9680f tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Fri Dec 29 12:37:22 2017 -0500

@@ -0,0 +1,6 @@
+<tables>
+    <table name="pep_pointer" comment_char="#">
+        <columns>name,value</columns>
+        <file path="tool-data/pep_pointer.loc"/>
+    </table>
+</tables>