Repository 'tmhmm_and_signalp'
hg clone https://toolshed.g2.bx.psu.edu/repos/peterjc/tmhmm_and_signalp

Changeset 9:e52220a9ddad (2013-01-25)
Previous changeset 8:976a5f2833cd (2012-07-30) Next changeset 10:09ff180d1615 (2013-03-27)
Commit message:
Uploaded v0.1.2. Use the new <stdio> settings in the XML wrappers to catch errors. Obeys the SGE style $NSLOTS environment variable for thread count (otherwise defaults to 4).
modified:
tools/protein_analysis/README
tools/protein_analysis/promoter2.py
tools/protein_analysis/promoter2.xml
tools/protein_analysis/rxlr_motifs.xml
tools/protein_analysis/seq_analysis_utils.py
tools/protein_analysis/signalp3.py
tools/protein_analysis/signalp3.xml
tools/protein_analysis/tmhmm2.py
tools/protein_analysis/tmhmm2.xml
tools/protein_analysis/wolf_psort.py
tools/protein_analysis/wolf_psort.xml
added:
test-data/four_human_proteins.fasta.orig
b
diff -r 976a5f2833cd -r e52220a9ddad test-data/four_human_proteins.fasta.orig
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.fasta.orig Fri Jan 25 06:08:31 2013 -0500
b
@@ -0,0 +1,61 @@
+>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF
+SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK
+REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER
+VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK
+CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD
+CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF
+HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
+>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG
+GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS
+DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD
+LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG
+KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP
+DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT
+IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE
+ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ
+QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY
+QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV
+ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD
+KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD
+QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE
+NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA
+QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK
+APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD
+EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR
+HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS
+WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ
+SQQSQPVELDPFGAAPFPSKQ
+>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL
+QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL
+VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE
+ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL
+GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG
+CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC
+TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL
+EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE
+RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ
+NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS
+DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE
+RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL
+KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF
+PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV
+SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV
+SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG
+PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR
+EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG
+FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA
+AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV
+RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN
+CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME
+FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN
+PS
+>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY
+VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG
+GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP
+EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES
+ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI
+YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/README
--- a/tools/protein_analysis/README Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/README Fri Jan 25 06:08:31 2013 -0500
b
@@ -127,6 +127,9 @@
 v0.1.0 - Added Promoter 2.0 wrapper (similar to SignalP & TMHMM wrappers)
        - Support Galaxy's <parallelism> tag for SignalP, TMHMM & Promoter
 v0.1.1 - Fixed an error in the header of the tabular output from Promoter
+v0.1.2 - Use the new <stdio> settings in the XML wrappers to catch errors
+       - Use SGE style $NSLOTS for thread count (otherwise default to 4)
+
 
 Developers
 ==========
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/promoter2.py
--- a/tools/protein_analysis/promoter2.py Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/promoter2.py Fri Jan 25 06:08:31 2013 -0500
[
@@ -30,20 +30,15 @@
 import os
 import commands
 import tempfile
-from seq_analysis_utils import stop_err, split_fasta, run_jobs
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
 
 FASTA_CHUNK = 500
 
 if len(sys.argv) != 4:
     stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. "
              "Got %i arguments." % (len(sys.argv)-1))
-try:
-    num_threads = int(sys.argv[1])
-except:
-    num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined
-if num_threads < 1:
-    stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
 
+#Thread count is the first argument (FASTA input is second, tabular output third)
+num_threads = thread_count(sys.argv[1], default=4)
 fasta_file = os.path.abspath(sys.argv[2])
 tabular_file = os.path.abspath(sys.argv[3])
 
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/promoter2.xml
--- a/tools/protein_analysis/promoter2.xml Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/promoter2.xml Fri Jan 25 06:08:31 2013 -0500
b
@@ -1,4 +1,4 @@
-<tool id="promoter2" name="Promoter 2.0" version="0.0.2">
+<tool id="promoter2" name="Promoter 2.0" version="0.0.3">
     <description>Find eukaryotic PolII promoters in DNA sequences</description>
     <!-- If job splitting is enabled, break up the query file into parts -->
     <!-- Using 2000 per chunk so 4 threads each doing 500 is ideal -->
@@ -9,6 +9,11 @@
         ##which (on SGE at least) will set the $NSLOTS environment variable.
         ##If the environment variable isn't set, get "", and defaults to one.
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of DNA sequences"/> 
     </inputs>
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/rxlr_motifs.xml
--- a/tools/protein_analysis/rxlr_motifs.xml Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/rxlr_motifs.xml Fri Jan 25 06:08:31 2013 -0500
b
@@ -1,9 +1,14 @@
-<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.5">
+<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.6">
     <description>Find RXLR Effectors of Plant Pathogenic Oomycetes</description>
     <command interpreter="python">
       rxlr_motifs.py $fasta_file 8 $model $tabular_file
       ##I want the number of threads to be a Galaxy config option...
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences" /> 
         <param name="model" type="select" label="Which RXLR model?">
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/seq_analysis_utils.py
--- a/tools/protein_analysis/seq_analysis_utils.py Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/seq_analysis_utils.py Fri Jan 25 06:08:31 2013 -0500
[
@@ -19,6 +19,56 @@
     sys.stderr.write("%s\n" % msg)
     sys.exit(error_level)
 
+try:
+    from multiprocessing import cpu_count
+except ImportError:
+    #Must be under Python 2.5, this is adapted from multiprocessing:
+    def cpu_count():
+        """Returns the number of CPUs in the system.
+
+        Raises NotImplementedError if the number cannot be determined.
+        """
+        if sys.platform == 'win32':
+            try:
+                num = int(os.environ['NUMBER_OF_PROCESSORS'])
+            except (ValueError, KeyError):
+                num = 0
+        elif 'bsd' in sys.platform or sys.platform == 'darwin':
+            comm = '/sbin/sysctl -n hw.ncpu'
+            if sys.platform == 'darwin':
+                comm = '/usr' + comm
+            #The sysctl call must run for BSD as well as Mac OS X, otherwise
+            #num is never assigned on BSD. Not using a with statement here
+            #since that is a syntax error under Python 2.5 unless enabled
+            #via a __future__ import.
+            try:
+                p = os.popen(comm)
+                try:
+                    num = int(p.read())
+                finally:
+                    p.close()
+            except ValueError:
+                num = 0
+        else:
+            try:
+                num = os.sysconf('SC_NPROCESSORS_ONLN')
+            except (ValueError, OSError, AttributeError):
+                num = 0
+
+        #Zero is used above to mean the lookup failed
+        if num >= 1:
+            return num
+        else:
+            raise NotImplementedError('cannot determine number of cpus')
+
+
+def thread_count(command_line_arg, default=1):
+    """Returns the number of threads to use, as a positive integer.
+
+    The command_line_arg is converted to an integer. If that fails (e.g.
+    an empty string because "$NSLOTS" was used and that environment
+    variable was not defined) then the default is used instead.
+
+    Calls stop_err if the resulting value is less than one. The value
+    returned is capped at the number of CPUs reported by cpu_count(),
+    if that can be determined.
+    """
+    try:
+        num = int(command_line_arg)
+    except (TypeError, ValueError):
+        #Not an integer (e.g. None or an empty string). Only catching the
+        #conversion errors so that things like a keyboard interrupt are
+        #not silently swallowed.
+        num = default
+    if num < 1:
+        stop_err("Threads argument %r is not a positive integer" % command_line_arg)
+    #Cap this with the physical limit of the machine,
+    try:
+        num = min(num, cpu_count())
+    except NotImplementedError:
+        #Could not determine the CPU count, use the value as is
+        pass
+    #For debugging,
+    #hostname = os.environ.get("HOSTNAME", "this machine")
+    #sys.stderr.write("Using %i cores on %s\n" % (num, hostname))
+    return num
+
+
 def fasta_iterator(filename, max_len=None, truncate=None):
     """Simple FASTA parser yielding tuples of (title, sequence) strings."""
     handle = open(filename)
@@ -109,6 +159,11 @@
     pending = jobs[:]
     running = []
     results = {}
+    if threads == 1:
+        #Special case this for speed, don't need the waits
+        for cmd in jobs:
+            results[cmd] = subprocess.call(cmd, shell=True)
+        return results
     while pending or running:
         #See if any have finished
         for (cmd, process) in running:
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/signalp3.py
--- a/tools/protein_analysis/signalp3.py Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/signalp3.py Fri Jan 25 06:08:31 2013 -0500
[
@@ -56,7 +56,8 @@
 import sys
 import os
 import tempfile
-from seq_analysis_utils import stop_err, split_fasta, run_jobs, fasta_iterator
+from seq_analysis_utils import stop_err, split_fasta, fasta_iterator
+from seq_analysis_utils import run_jobs, thread_count
 
 FASTA_CHUNK = 500
 MAX_LEN = 6000 #Found by trial and error
@@ -78,15 +79,8 @@
 if truncate < 0:
    stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
 
-try:
-   num_threads = int(sys.argv[3])
-except:
-   num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined
-if num_threads < 1:
-   stop_err("Threads argument %s is not a positive integer" % sys.argv[3])
-
+num_threads = thread_count(sys.argv[3], default=4)
 fasta_file = sys.argv[4]
-
 tabular_file = sys.argv[5]
 
 if len(sys.argv) == 8:
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/signalp3.xml
--- a/tools/protein_analysis/signalp3.xml Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/signalp3.xml Fri Jan 25 06:08:31 2013 -0500
b
@@ -1,4 +1,4 @@
-<tool id="signalp3" name="SignalP 3.0" version="0.0.9">
+<tool id="signalp3" name="SignalP 3.0" version="0.0.10">
     <description>Find signal peptides in protein sequences</description>
     <!-- If job splitting is enabled, break up the query file into parts -->
     <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal -->
@@ -9,6 +9,11 @@
       ##which (on SGE at least) will set the $NSLOTS environment variable.
       ##If the environment variable isn't set, get "", and defaults to one.
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> 
         <param name="organism" type="select" display="radio" label="Organism">
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/tmhmm2.py
--- a/tools/protein_analysis/tmhmm2.py Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/tmhmm2.py Fri Jan 25 06:08:31 2013 -0500
[
@@ -43,18 +43,14 @@
 import sys
 import os
 import tempfile
-from seq_analysis_utils import stop_err, split_fasta, run_jobs
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
 
 FASTA_CHUNK = 500
 
 if len(sys.argv) != 4:
    stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file")
-try:
-   num_threads = int(sys.argv[1])
-except:
-   num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined
-if num_threads < 1:
-   stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
+
+num_threads = thread_count(sys.argv[1], default=4)
 fasta_file = sys.argv[2]
 tabular_file = sys.argv[3]
 
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/tmhmm2.xml
--- a/tools/protein_analysis/tmhmm2.xml Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/tmhmm2.xml Fri Jan 25 06:08:31 2013 -0500
b
@@ -1,4 +1,4 @@
-<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.8">
+<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.9">
     <description>Find transmembrane domains in protein sequences</description>
     <!-- If job splitting is enabled, break up the query file into parts -->
     <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal -->
@@ -9,6 +9,11 @@
       ##which (on SGE at least) will set the $NSLOTS environment variable.
       ##If the environment variable isn't set, get "", and defaults to one.
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> 
         <!--
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/wolf_psort.py
--- a/tools/protein_analysis/wolf_psort.py Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/wolf_psort.py Fri Jan 25 06:08:31 2013 -0500
[
@@ -35,13 +35,13 @@
 """
 import sys
 import os
-from seq_analysis_utils import stop_err, split_fasta, run_jobs
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
 
 FASTA_CHUNK = 500
 exe = "runWolfPsortSummary"
 
 """
-Note: I had trouble getting runWolfPsortSummary on the path (via a link, other
+Note: I had trouble getting runWolfPsortSummary on the path (via a link), other
 than by including all of /opt/WoLFPSORT_package_v0.2/bin , so used a wrapper
 python script called runWolfPsortSummary as follows:
 
@@ -65,15 +65,8 @@
 if organism not in ["animal", "plant", "fungi"]:
    stop_err("Organism argument %s is not one of animal, plant, fungi" % organism)
 
-try:
-   num_threads = int(sys.argv[2])
-except:
-   num_threads = 0
-if num_threads < 1:
-   stop_err("Threads argument %s is not a positive integer" % sys.argv[2])
-
+num_threads = thread_count(sys.argv[2], default=4)
 fasta_file = sys.argv[3]
-
 tabular_file = sys.argv[4]
 
 def clean_tabular(raw_handle, out_handle):
b
diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/wolf_psort.xml
--- a/tools/protein_analysis/wolf_psort.xml Mon Jul 30 12:56:54 2012 -0400
+++ b/tools/protein_analysis/wolf_psort.xml Fri Jan 25 06:08:31 2013 -0500
b
@@ -1,9 +1,14 @@
-<tool id="wolf_psort" name="WoLF PSORT" version="0.0.1">
+<tool id="wolf_psort" name="WoLF PSORT" version="0.0.2">
     <description>Eukaryote protein subcellular localization prediction</description>
     <command interpreter="python">
       wolf_psort.py $organism 8 $fasta_file $tabular_file
       ##I want the number of threads to be a Galaxy config option...
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> 
         <param name="organism" type="select" display="radio" label="Organism">