changeset 1:f93ad4882338 draft

Uploaded v0.0.6, adds unit tests and minor documentation changes.
author peterjc
date Wed, 17 Apr 2013 08:26:25 -0400
parents 0ad90e5eb390
children 9ec94203d895
files test-data/empty.fasta test-data/empty_nlstradamus.tabular test-data/four_human_proteins.fasta test-data/four_human_proteins.nlstradamus.tabular tools/protein_analysis/nlstradamus.txt tools/protein_analysis/nlstradamus.xml
diffstat 6 files changed, 128 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/empty.fasta	Wed Apr 17 08:26:25 2013 -0400
@@ -0,0 +1,2 @@
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/empty_nlstradamus.tabular	Wed Apr 17 08:26:25 2013 -0400
@@ -0,0 +1,1 @@
+#ID	algorithm	score	start	stop	sequence
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.fasta	Wed Apr 17 08:26:25 2013 -0400
@@ -0,0 +1,61 @@
+>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF
+SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK
+REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER
+VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK
+CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD
+CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF
+HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
+>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG
+GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS
+DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD
+LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG
+KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP
+DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT
+IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE
+ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ
+QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY
+QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV
+ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD
+KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD
+QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE
+NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA
+QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK
+APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD
+EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR
+HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS
+WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ
+SQQSQPVELDPFGAAPFPSKQ
+>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL
+QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL
+VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE
+ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL
+GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG
+CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC
+TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL
+EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE
+RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ
+NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS
+DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE
+RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL
+KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF
+PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV
+SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV
+SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG
+PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR
+EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG
+FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA
+AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV
+RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN
+CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME
+FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN
+PS
+>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY
+VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG
+GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP
+EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES
+ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI
+YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.nlstradamus.tabular	Wed Apr 17 08:26:25 2013 -0400
@@ -0,0 +1,2 @@
+#ID	algorithm	score	start	stop	sequence
+sp|Q9NSY1|BMP2K_HUMAN	posterior	0.945	983	1027	RRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARRHKKVGRR
--- a/tools/protein_analysis/nlstradamus.txt	Tue Jun 07 17:39:58 2011 -0400
+++ b/tools/protein_analysis/nlstradamus.txt	Wed Apr 17 08:26:25 2013 -0400
@@ -1,7 +1,7 @@
-Galaxy wrapper for NLStradamus v1.7 (C++ version)
-=================================================
+Galaxy wrapper for NLStradamus v1.7 or v1.8 (C++ version)
+=========================================================
 
-This wrapper is copyright 2011 by Peter Cock, The James Hutton Institute
+This wrapper is copyright 2011-2013 by Peter Cock, The James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
 See the licence text below.
 
@@ -11,15 +11,24 @@
 A. N. Nguyen Ba, A. Pogoutse, N. Provart, A. M. Moses.
 NLStradamus: a simple Hidden Markov Model for nuclear localization signal prediction.
 BMC Bioinformatics. 2009 Jun 29;10(1):202.
+http://dx.doi.org/10.1186/1471-2105-10-202
 
 http://www.moseslab.csb.utoronto.ca/NLStradamus
 
 Early versions of NLStradamus did not have a native tabular output format; this
 was added in version 1.7. Additionally a fast C++ implementation was added at
-this point (early versions of NLStradamus came as a perl script only). This
-wrapper expects the compiled C++ binary "NLStradamus" to be on the system PATH.
+this point (early versions of NLStradamus came as a perl script only).
+
+Version 1.8 fixed a C++ compilation issue on modern compilers, but is otherwise
+unchanged.
+
 
-To install the wrapper installed the following files under the Galaxy tools
+Installation
+============
+This wrapper expects the compiled C++ binary "NLStradamus" to be on the system
+PATH.
+
+To install the wrapper, copy or move the following files under the Galaxy tools
 folder, e.g. in a tools/protein_analysis folder:
 
 * nlstradamus.xml (the Galaxy tool definition)
@@ -31,6 +40,9 @@
 
 <tool file="protein_analysis/nlstradamus.xml" />
 
+If you wish to run the unit tests, also add this to tool_conf.xml.sample
+and move/copy the test-data files under Galaxy's test-data folder.
+
 That's it.
 
 
@@ -38,6 +50,11 @@
 =======
 
 v0.0.3 - Initial public release
+v0.0.4 - Adding DOI link to reference
+         (Documentation change only)
+v0.0.5 - Assume non-zero return codes are errors
+v0.0.6 - Show output help text using a table
+       - Added unit tests
 
 
 Developers
@@ -46,17 +63,20 @@
 This script and related tools are being developed on the following hg branch:
 http://bitbucket.org/peterjc/galaxy-central/src/tools
 
-For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use
+For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
 the following command from the Galaxy root folder:
 
-tar -czf nlstradmus.tar.gz tools/protein_analysis/nlstradum.xml tools/protein_analysis/nlstradum.txt
+$ tar -czf nlstradamus.tar.gz tools/protein_analysis/nlstradamus.xml tools/protein_analysis/nlstradamus.txt test-data/four_human_proteins.fasta test-data/four_human_proteins.nlstradamus.tabular test-data/empty.fasta test-data/empty_nlstradamus.tabular
 
 Check this worked:
 
 $ tar -tzf nlstradamus.tar.gz
-filter/seq_filter_by_id.py
-filter/seq_filter_by_id.txt
-filter/seq_filter_by_id.xml
+tools/protein_analysis/nlstradamus.xml
+tools/protein_analysis/nlstradamus.txt
+test-data/four_human_proteins.fasta
+test-data/four_human_proteins.nlstradamus.tabular
+test-data/empty.fasta
+test-data/empty_nlstradamus.tabular
 
 
 Licence (MIT/BSD style)
--- a/tools/protein_analysis/nlstradamus.xml	Tue Jun 07 17:39:58 2011 -0400
+++ b/tools/protein_analysis/nlstradamus.xml	Wed Apr 17 08:26:25 2013 -0400
@@ -1,8 +1,13 @@
-<tool id="nlstradamus" name="NLStradamus" version="0.0.3">
+<tool id="nlstradamus" name="NLStradamus" version="0.0.6">
     <description>Find nuclear localization signals (NLSs) in protein sequences</description>
     <command>
       NLStradamus -i $fasta_file -t $threshold -m $model -a $algorithm -tab > $tabular_file
     </command>
+    <stdio>
+        <!-- Assume anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> 
         <param name="model" type="select" display="radio" label="Model">
@@ -25,6 +30,20 @@
         <requirement type="binary">NLStradamus</requirement>
     </requirements>
     <tests>
+        <test>
+            <param name="fasta_file" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="model" value="1" />
+            <param name="algorithm" value="1" />
+	    <param name="threshold" value="0.6" />
+            <output name="tabular_file" file="four_human_proteins.nlstradamus.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="fasta_file" value="empty.fasta" ftype="fasta" />
+            <param name="model" value="2" />
+            <param name="algorithm" value="2" />
+            <param name="threshold" value="0.125"/>
+            <output name="tabular_file" file="empty_nlstradamus.tabular" ftype="tabular" />
+        </test>
     </tests>
     <help>
     
@@ -36,12 +55,16 @@
 The input is a FASTA file of protein sequences, and the output is tabular
 with six columns (one row per NLS):
 
- * Sequence identifier
- * Algorithm (posterior or Viterbi)
- * Score (probability between threshold and 1 for posterior algorithm)
- * Start
- * End
- * Sequence of NLS
+====== ===================================================================
+Column Description
+====== ===================================================================
+    c1 Sequence identifier
+    c2 Algorithm (posterior or Viterbi)
+    c3 Score (probability between threshold and 1 for posterior algorithm)
+    c4 Start
+    c5 End
+    c6 Sequence of NLS
+====== ===================================================================
 
 -----
 
@@ -50,6 +73,7 @@
 A. N. Nguyen Ba, A. Pogoutse, N. Provart, A. M. Moses.
 NLStradamus: a simple Hidden Markov Model for nuclear localization signal prediction.
 BMC Bioinformatics. 2009 Jun 29;10(1):202.
+http://dx.doi.org/10.1186/1471-2105-10-202
 
 http://www.moseslab.csb.utoronto.ca/NLStradamus