Next changeset 1:dc1b0f54f626 (2018-01-03) |
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit ac27a958fcb897c3cb56db313ebd282805b01103 |
added:
pep_pointer.py pep_pointer.xml test-data/classified_novel_peptides.txt test-data/make_test_data.sh test-data/mus17.gtf test-data/novel_peptides_17.bed tool-data/pep_pointer.loc.sample tool_data_table_conf.xml.sample |
b |
diff -r 000000000000 -r 149ed6a9680f pep_pointer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pep_pointer.py Fri Dec 29 12:37:22 2017 -0500 |
[ |
b'@@ -0,0 +1,221 @@\n+\n+# \n+# Author: Praveen Kumar\n+# Updated: Nov 8th, 2017\n+# \n+# \n+# \n+\n+import re\n+\n+\n+def main():\n+ import sys\n+ if len(sys.argv) == 4:\n+ inputFile = sys.argv\n+ infh = open(inputFile[1], "r")\n+ # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r")\n+ \n+ gtf = {}\n+ gtf_transcript = {}\n+ gtf_gene = {}\n+ for each in infh.readlines():\n+ a = each.split("\\t")\n+ if re.search("^[^#]", each):\n+ if re.search("gene_biotype \\"protein_coding\\"", a[8]) and int(a[4].strip()) != int(a[3].strip()):\n+ type = a[2].strip()\n+ if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr":\n+ chr = "chr" + a[0].strip()\n+ strand = a[6].strip()\n+ if strand == "+":\n+ start = a[3].strip()\n+ end = a[4].strip()\n+ elif strand == "-":\n+ if int(a[4].strip()) > int(a[3].strip()):\n+ start = a[3].strip()\n+ end = a[4].strip()\n+ elif int(a[4].strip()) < int(a[3].strip()):\n+ start = a[4].strip()\n+ end = a[3].strip()\n+ else:\n+ print "Something fishy in start end coordinates"\n+ else:\n+ print "Something fishy in reading"\n+ if not gtf.has_key(strand):\n+ gtf[strand] = {}\n+ if not gtf[strand].has_key(type):\n+ gtf[strand][type] = []\n+ b = re.search("gene_id \\"(.+?)\\";", a[8].strip())\n+ gene = b.group(1)\n+ if type == "gene":\n+ transcript = ""\n+ else:\n+ b = re.search("transcript_id \\"(.+?)\\";", a[8].strip())\n+ transcript = b.group(1)\n+ data = (chr, start, end, gene, transcript, strand, type)\n+ gtf[strand][type].append(data)\n+ \n+ if type == "exon":\n+ if gtf_transcript.has_key(chr+"#"+strand):\n+ if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene):\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+ else:\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+ else:\n+ gtf_transcript[chr+"#"+strand] = {}\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))\n+ gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))\n+ \n+ if type == "gene":\n+ if gtf_gene.has_key(chr+"#"+strand):\n+ gtf_gene[chr+"#"+strand][0].append(int(start))\n+ gtf_gene[chr+"#"+strand][1].append(int(end))\n+ gtf_gene[chr+"#"+strand][2].append(gene)\n+ else:\n+ '..b'and text, type text)")\n+ \n+ for strand in gtf.keys():\n+ if strand == "+":\n+ st = "positive"\n+ elif strand == "-":\n+ st = "negative"\n+ else:\n+ print "Something fishy in writing . . ."\n+ \n+ for type in gtf[strand].keys():\n+ data = gtf[strand][type]\n+ c.executemany(\'INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)\', data)\n+ \n+ conn.commit()\n+ \n+ infh = open(inputFile[2], "r")\n+ # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r")\n+ data = infh.readlines()\n+ # output file\n+ outfh = open(inputFile[3], \'w\')\n+ # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w")\n+ \n+ for each in data:\n+ a = each.split("\\t")\n+ chr = a[0].strip()\n+ pep_start = a[1].strip()\n+ pep_end = a[2].strip()\n+ strand = a[5].strip()\n+ c.execute("select * from gtf_data where type = \'CDS\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tCDS\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'five_prime_utr\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tfive_prime_utr\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'three_prime_utr\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tthree_prime_utr\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'exon\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\texon\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'intron\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tintron\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'gene\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tgene\\n")\n+ else:\n+ c.execute("select * from gtf_data where type = \'intergenic\' and chr = \'"+chr+"\' and start <= "+pep_start+" and end >= "+pep_end+" and strand = \'"+strand+"\' ")\n+ rows = c.fetchall()\n+ if len(rows) > 0:\n+ outfh.write(each.strip() + "\\tintergene\\n")\n+ else:\n+ outfh.write(each.strip() + "\\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\\n")\n+ \n+ conn.close()\n+ outfh.close()\n+ else:\n+ print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>"\n+ return None\n+\n+if __name__ == "__main__":\n+ main()\n+\n+\n+\n+\n+\n' |
b |
diff -r 000000000000 -r 149ed6a9680f pep_pointer.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pep_pointer.xml Fri Dec 29 12:37:22 2017 -0500 |
[ |
@@ -0,0 +1,55 @@ +<tool id="pep_pointer" name="PepPointer" version="0.1.1"> + <description>classify genomic location of peptides</description> + <requirements> + <requirement type="package" version="2.7.9">python</requirement> + </requirements> + <command detect_errors="aggressive"><![CDATA[ + python '$__tool_directory__/pep_pointer.py' '$gtf_input.gtf' '$bed' '$classified' + ]]></command> + <inputs> + <conditional name="gtf_input"> + <param type="select" name="gtf_source" label="Choose the source of the GTF file"> + <option value="cached" selected="true">Built-in</option> + <option value="history">From history</option> + </param> + <when value="cached"> + <param name="gtf" type="select" format="gtf" label="GTF file with the genome of interest"> + <options from_data_table="pep_pointer"/> + </param> + </when> + <when value="history"> + <param type="data" name="gtf" format="gtf" label="GTF file with the genome of interest"/> + </when> + </conditional> + <param type="data" name="bed" format="bed" label="BED file with chromosomal coordinates of peptide"/> + </inputs> + <outputs> + <data format="tabular" name="classified" label="${tool.name} on ${on_string}"> + <actions> + <action name="column_names" type="metadata" default="chrom,chromStart,chromStop,name,score,strand,annotation"/> + </actions> + </data> + </outputs> + <tests> + <test> + <param name="gtf_source" value="history"/> + <param name="gtf" value="mus17.gtf"/> + <param name="bed" value="novel_peptides_17.bed"/> + <output name="classified" file="classified_novel_peptides.txt"/> + </test> + </tests> + <help><![CDATA[ + **PepPointer** + + Given chromosomal locations of peptides in a BED file, PepPointer classifies them as CDS, UTR, exon, intron, or intergene. + ]]></help> + <citations> + <citation type="bibtex"> +@misc{peppointer, + author={Kumar, Praveen}, + year={2017}, + title={PepPointer} +} + </citation> + </citations> +</tool> |
b |
diff -r 000000000000 -r 149ed6a9680f test-data/classified_novel_peptides.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/classified_novel_peptides.txt Fri Dec 29 12:37:22 2017 -0500 |
b |
@@ -0,0 +1,2 @@ +chr17 48786451 48786471 NVLAAPR 0 + intergene +chr17 25669965 25669988 SALVLAGR 0 - intergene |
b |
diff -r 000000000000 -r 149ed6a9680f test-data/make_test_data.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/make_test_data.sh Fri Dec 29 12:37:22 2017 -0500 |
b |
@@ -0,0 +1,3 @@ +#!/bin/bash + +python ../pep_pointer.py mus17.gtf novel_peptides_17.bed classified_novel_peptides.txt |
b |
diff -r 000000000000 -r 149ed6a9680f test-data/mus17.gtf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mus17.gtf Fri Dec 29 12:37:22 2017 -0500 |
b |
b'@@ -0,0 +1,70383 @@\n+17\thavana\tgene\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1";\n+17\thavana\ttranscript\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; transcript_id "ENSMUST00000089221"; transcript_version "2"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1"; transcript_name "Gm10232-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; havana_transcript "OTTMUST00000087860"; havana_transcript_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3044014\t3044733\t.\t-\t.\tgene_id "ENSMUSG00000068141"; gene_version "5"; transcript_id "ENSMUST00000089221"; transcript_version "2"; exon_number "1"; gene_name "Gm10232"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034618"; havana_gene_version "1"; transcript_name "Gm10232-201"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; havana_transcript "OTTMUST00000087860"; havana_transcript_version "1"; exon_id "ENSMUSE00000477530"; exon_version "3"; tag "basic"; transcript_support_level "NA";\n+17\thavana\tgene\t3053159\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1";\n+17\thavana\ttranscript\t3053159\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3053159\t3053205\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "1"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000850577"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3053328\t3053510\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "2"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000870808"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\texon\t3054028\t3054111\t.\t+\t.\tgene_id "ENSMUSG00000089690"; gene_version "1"; transcript_id "ENSMUST00000160526"; transcript_version "1"; exon_number "3"; gene_name "Gm16569"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; havana_gene "OTTMUSG00000034619"; havana_gene_version "1"; transcript_name "Gm16569-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; havana_transcript "OTTMUST00000087861"; havana_transcript_version "1"; exon_id "ENSMUSE00000852030"; exon_version "1"; tag "basic"; transcript_support_level "NA";\n+17\thavana\tgene\t3056687\t3057121\t.\t+\t.\tgene_id "ENSMUSG00000044697"; gene_version "5"; gene_name "Gm5479"; gene_source "havana"; gene_biotype "processed_pseudogene"; havana_gene "OTTMUSG00000034620"; havana_gene_version "1";\n+17'..b'e "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94873986\t94874112\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "1"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00000969268"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94873986\t94874112\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "1"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94874302\t94874362\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "2"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001063304"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94874302\t94874362\t.\t+\t2\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "2"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94875526\t94875820\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "3"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001002010"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94875526\t94875820\t.\t+\t1\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "3"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\texon\t94876409\t94877497\t.\t+\t.\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSMUSE00001070453"; exon_version "1"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tCDS\t94876409\t94877494\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSMUSP00000103642"; protein_version "4"; tag "basic"; transcript_support_level "5";\n+17\tensembl\tstop_codon\t94877495\t94877497\t.\t+\t0\tgene_id "ENSMUSG00000095193"; gene_version "1"; transcript_id "ENSMUST00000108007"; transcript_version "4"; exon_number "4"; gene_name "Gm20939"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "Gm20939-201"; transcript_source "ensembl"; transcript_biotype "protein_coding"; tag "basic"; transcript_support_level "5";\n' |
b |
diff -r 000000000000 -r 149ed6a9680f test-data/novel_peptides_17.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/novel_peptides_17.bed Fri Dec 29 12:37:22 2017 -0500 |
b |
@@ -0,0 +1,2 @@ +chr17 48786451 48786471 NVLAAPR 0 + +chr17 25669965 25669988 SALVLAGR 0 - |
b |
diff -r 000000000000 -r 149ed6a9680f tool-data/pep_pointer.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/pep_pointer.loc.sample Fri Dec 29 12:37:22 2017 -0500 |
b |
@@ -0,0 +1,4 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of gtf data files. +#name value +#mouse /path/to/directory/with/mouse.gtf |
b |
diff -r 000000000000 -r 149ed6a9680f tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Fri Dec 29 12:37:22 2017 -0500 |
b |
@@ -0,0 +1,6 @@ +<tables> + <table name="pep_pointer" comment_char="#"> + <columns>name,value</columns> + <file path="tool-data/pep_pointer.loc"/> + </table> +</tables> |