# HG changeset patch # User peterjc # Date 1506008120 14400 # Node ID a19b3ded8f333659004d0020747dfd97a132a8a8 # Parent f3ecd80850e21cb3ec7e6ed22bba342ea3335ed3 v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/README.rst --- a/tools/protein_analysis/README.rst Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/README.rst Thu Sep 21 11:35:20 2017 -0400 @@ -20,7 +20,7 @@ See the included LICENCE file for details (MIT open source licence). The wrappers are available from the Galaxy Tool Shed -http://toolshed.g2.bx.psu.edu/view/peterjc/tmhmm_and_signalp +http://toolshed.g2.bx.psu.edu/view/peterjc/tmhmm_and_signalp Citation ======== @@ -174,11 +174,24 @@ v0.2.6 - Use the new ``$GALAXY_SLOTS`` environment variable for thread count. - Updated the ``suite_config.xml`` file (overdue). - Tool definition now embeds citation information. -v0.2.7 - Style cleanup in Python scripts. +v0.2.7 - Style cleanup in Python scripts using ``pep8``. v0.2.8 - Reorder XML elements (internal change only). - Planemo for Tool Shed upload (``.shed.yml``, internal change only). - Record version of Promoter 2 via ````. v0.2.9 - Further style cleanup in Python scripts (internal change only). +v0.2.10 - Style cleanup in Python scripts using ``flake8``. + - Record TMHMM and SignalP wrapper version via ````. + - Python 3 compatible print function and exception handling. + - Python 3 compatible subprocess calling. + - Removed obsolete ``suite_config.xml`` file. +v0.2.11 - Updated RXLR tool dependencies to get HMMER2 via BioConda. + - Use ```` (internal change only). + - Single quote command line arguments (internal change only). + - Record WoLF PSORT wrapper version via ````. + - Fix error handling in SignalP wrapper. + - Record SignalP version via ````. + - Internal job splitter will skip starting any pending jobs after a + child job fails (so the entire task will error out more quickly). ======= ====================================================================== @@ -197,19 +210,19 @@ Planemo commands (which requires you have set your Tool Shed access details in ``~/.planemo.yml`` and that you have access rights on the Tool Shed):: - $ planemo shed_update -t testtoolshed --check_diff ~/repositories/pico_galaxy/tools/protein_analysis/ + $ planemo shed_update -t testtoolshed --check_diff tools/protein_analysis/ ... or:: - $ planemo shed_update -t toolshed --check_diff ~/repositories/pico_galaxy/tools/protein_analysis/ + $ planemo shed_update -t toolshed --check_diff tools/protein_analysis/ ... To just build and check the tar ball, use:: - $ planemo shed_upload --tar_only ~/repositories/pico_galaxy/tools/protein_analysis/ + $ planemo shed_upload --tar_only tools/protein_analysis/ ... - $ tar -tzf shed_upload.tar.gz + $ tar -tzf shed_upload.tar.gz test-data/Adenovirus.fasta test-data/Adenovirus.promoter2.tabular test-data/empty.fasta diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/promoter2.py --- a/tools/protein_analysis/promoter2.py Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/promoter2.py Thu Sep 21 11:35:20 2017 -0400 @@ -18,7 +18,6 @@ Additionally, in order to take advantage of multiple cores the input FASTA file is broken into chunks and multiple copies of promoter run at once. This can be used in combination with the job-splitting available in Galaxy. - Note that rewriting the FASTA input file allows us to avoid a bug in promoter 2 with long descriptions in the FASTA header line (over 200 characters) which produces stray fragements of the description in the @@ -26,11 +25,15 @@ TODO - Automatically extract the sequence containing a promoter prediction? """ -import sys -import os + +from __future__ import print_function + import commands +import os +import sys import tempfile -from seq_analysis_utils import split_fasta, run_jobs, thread_count + +from seq_analysis_utils import run_jobs, split_fasta, thread_count FASTA_CHUNK = 500 @@ -49,6 +52,7 @@ def get_path_and_binary(): + """Determine path and binary names for promoter tool.""" platform = commands.getoutput("uname") # e.g. Linux shell_script = commands.getoutput("which promoter") if not os.path.isfile(shell_script): @@ -74,7 +78,7 @@ identifier = None queries = 0 for line in raw_handle: - # print repr(line) + # print(repr(line)) if not line.strip() or line == "Promoter prediction:\n": pass elif line[0] != " ": @@ -89,7 +93,7 @@ try: position, score, likelihood = line.strip().split(None, 2) except ValueError: - print "WARNING: Problem with line: %r" % line + print("WARNING: Problem with line: %r" % line) continue # sys.exit("ERROR: Problem with line: %r" % line) if likelihood not in ["ignored", @@ -100,6 +104,7 @@ out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) return queries + working_dir, bin = get_path_and_binary() if not os.path.isfile(fasta_file): @@ -124,9 +129,10 @@ except Exception: pass + if len(jobs) > 1 and num_threads > 1: # A small "info" message for Galaxy to show the user. - print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) + print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) cur_dir = os.path.abspath(os.curdir) os.chdir(working_dir) results = run_jobs(jobs, num_threads) @@ -159,4 +165,4 @@ out_handle.close() clean_up(fasta_files + temp_files) -print "Results for %i queries" % queries +print("Results for %i queries" % queries) diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/promoter2.xml --- a/tools/protein_analysis/promoter2.xml Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/promoter2.xml Thu Sep 21 11:35:20 2017 -0400 @@ -1,25 +1,19 @@ - + Find eukaryotic PolII promoters in DNA sequences - promoter promoter - - - - - - promoter2.py --version - - promoter2.py "\$GALAXY_SLOTS" "$fasta_file" "$tabular_file" - ##If the environment variable isn't set, get "", and the python wrapper - ##defaults to four threads. + +python $__tool_directory__/promoter2.py --version + + +python $__tool_directory__/promoter2.py "\$GALAXY_SLOTS" '$fasta_file' '$tabular_file' - + @@ -35,7 +29,7 @@ - + **What it does** This calls the Promoter 2.0 tool for prediction of eukaryotic PolII promoter sequences using a Neural Network (NN) model. @@ -59,7 +53,7 @@ below 0.5 ignored 0.5 - 0.8 Marginal prediction 0.8 - 1.0 Medium likely prediction -above 1.0 Highly likely prediction +above 1.0 Highly likely prediction ========= ======================== Internally the input FASTA file is divided into parts (to allow multiple processors to be used), and the raw output is reformatted into this tabular layout suitable for downstream analysis within Galaxy. diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/psortb.py --- a/tools/protein_analysis/psortb.py Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/psortb.py Thu Sep 21 11:35:20 2017 -0400 @@ -21,10 +21,14 @@ Additionally it ensures the header line (with the column names) starts with a # character as used elsewhere in Galaxy. """ -import sys + +from __future__ import print_function + import os +import sys import tempfile -from seq_analysis_utils import split_fasta, run_jobs, thread_count + +from seq_analysis_utils import run_jobs, split_fasta, thread_count FASTA_CHUNK = 500 @@ -65,7 +69,8 @@ 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', 'Profile-_Localization', 'Profile-_Details', - 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details', + 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', + 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details', 'Signal-_Localization', 'Signal-_Details', 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', @@ -76,7 +81,8 @@ 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', 'Profile+_Localization', 'Profile+_Details', - 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', + 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', + 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', 'Signal+_Localization', 'Signal+_Details', 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', @@ -87,7 +93,8 @@ 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', 'Profile_a_Localization', 'Profile_a_Details', - 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', + 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', + 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', 'Signal_a_Localization', 'Signal_a_Details', 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', @@ -122,6 +129,7 @@ count += 1 return count + # Note that if the input FASTA file contains no sequences, # split_fasta returns an empty list (i.e. zero temp files). fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) @@ -139,9 +147,10 @@ except Exception: pass + if len(jobs) > 1 and num_threads > 1: # A small "info" message for Galaxy to show the user. - print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) + print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) results = run_jobs(jobs, num_threads) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): error_level = results[cmd] @@ -167,6 +176,6 @@ clean_up(fasta_files + temp_files) sys.exit("No output from psortb") out_handle.close() -print "%i records" % count +print("%i records" % count) clean_up(fasta_files + temp_files) diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/psortb.xml --- a/tools/protein_analysis/psortb.xml Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/psortb.xml Thu Sep 21 11:35:20 2017 -0400 @@ -1,22 +1,16 @@ - + Determines sub-cellular localisation of bacterial/archaeal protein sequences - psort psort - - - - - - psortb.py --version - -psortb.py "\$GALAXY_SLOTS" "$type" "$long" "$cutoff" "$divergent" "$sequence" "$outfile" -##If the environment variable isn't set, get "", and python wrapper -##defaults to four threads. + +python $__tool_directory__/psortb.py --version + + +python $__tool_directory__/psortb.py "\$GALAXY_SLOTS" '$type' '$long' '$cutoff' '$divergent' '$sequence' '$outfile' + Find RXLR Effectors of Plant Pathogenic Oomycetes - signalp signalp - hmmsearch - hmmsearch + hmmer2 - - - - - - rxlr_motifs.py -v - - rxlr_motifs.py "$fasta_file" "\$GALAXY_SLOTS" $model "$tabular_file" + +python $__tool_directory__/rxlr_motifs.py -v + + +python $__tool_directory__/rxlr_motifs.py '$fasta_file' "\$GALAXY_SLOTS" '$model' '$tabular_file' - + - + @@ -51,7 +46,7 @@ - + **Background** Many effector proteins from oomycete plant pathogens for manipulating the host diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/seq_analysis_utils.py --- a/tools/protein_analysis/seq_analysis_utils.py Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/seq_analysis_utils.py Thu Sep 21 11:35:20 2017 -0400 @@ -7,9 +7,13 @@ multiprocessing so the function run_jobs instead is a simple pool approach using just the subprocess library. """ -import sys + +from __future__ import print_function + import os import subprocess +import sys + from time import sleep __version__ = "0.0.2" @@ -47,6 +51,7 @@ def thread_count(command_line_arg, default=1): + """Determine number of threads to use from the command line args.""" try: num = int(command_line_arg) except ValueError: @@ -137,7 +142,7 @@ handle.close() files.append(new_filename) # print "%i records in %s" % (len(records), new_filename) - except ValueError, err: + except ValueError as err: # Max length failure from parser - clean up try: handle.close() @@ -152,37 +157,47 @@ return files -def run_jobs(jobs, threads, pause=10, verbose=False): +def run_jobs(jobs, threads, pause=10, verbose=False, fast_fail=True): """Takes list of cmd strings, returns dict with error levels.""" pending = jobs[:] running = [] results = {} + skipped = [] if threads == 1: # Special case this for speed, don't need the waits for cmd in jobs: results[cmd] = subprocess.call(cmd, shell=True) return results + failed = False while pending or running: # See if any have finished for (cmd, process) in running: return_code = process.poll() # non-blocking if return_code is not None: results[cmd] = return_code + if return_code: + failed = True running = [(cmd, process) for (cmd, process) in running if cmd not in results] if verbose: - print "%i jobs pending, %i running, %i completed" \ - % (len(pending), len(running), len(results)) + print("%i jobs pending, %i running, %i completed" % + (len(pending), len(running), len(results))) # See if we can start any new threads + if pending and failed and fast_fail: + # Don't start any more jobs + if verbose: + print("Failed, will not start remaining %i jobs" % len(pending)) + skipped = pending + pending = [] while pending and len(running) < threads: cmd = pending.pop(0) if verbose: - print cmd + print(cmd) process = subprocess.Popen(cmd, shell=True) running.append((cmd, process)) # Loop... sleep(pause) if verbose: - print "%i jobs completed" % len(results) - assert set(jobs) == set(results) + print("%i jobs completed" % len(results)) + assert set(jobs) == set(results).union(skipped) return results diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/signalp3.py --- a/tools/protein_analysis/signalp3.py Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/signalp3.py Thu Sep 21 11:35:20 2017 -0400 @@ -52,16 +52,24 @@ Finally, you can opt to have a GFF3 file produced which will describe the predicted signal peptide and mature peptide for each protein (using one of the predictors which gives a cleavage site). *WORK IN PROGRESS* -""" -import sys +""" # noqa: E501 + +from __future__ import print_function + import os +import sys import tempfile -from seq_analysis_utils import split_fasta, fasta_iterator + +from seq_analysis_utils import fasta_iterator, split_fasta from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 MAX_LEN = 6000 # Found by trial and error +if "-v" in sys.argv or "--version" in sys.argv: + print("SignalP Galaxy wrapper version 0.0.19") + sys.exit(os.system("signalp -version")) + if len(sys.argv) not in [6, 8]: sys.exit("Require five (or 7) arguments, organism, truncate, threads, " "input protein FASTA file & output tabular file (plus " @@ -96,15 +104,8 @@ tmp_dir = tempfile.mkdtemp() -def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): +def clean_tabular(raw_handle, out_handle, gff_handle=None): """Clean up SignalP output to make it tabular.""" - if cut_method: - cut_col = {"NN_Cmax": 2, - "NN_Ymax": 5, - "NN_Smax": 8, - "HMM_Cmax": 16}[cut_method] - else: - cut_col = None for line in raw_handle: if not line or line.startswith("#"): continue @@ -119,6 +120,7 @@ def make_gff(fasta_file, tabular_file, gff_file, cut_method): + """Make a GFF file.""" cut_col, score_col = {"NN_Cmax": (2, 1), "NN_Ymax": (5, 4), "NN_Smax": (8, 7), @@ -152,7 +154,7 @@ assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) score = parts[score_col] gff_handle.write("##sequence-region %s %i %i\n" - % (seqid, 1, len(seq))) + % (seqid, 1, len(seq))) # If the cut is at the very begining, there is no signal peptide! if cut > 1: # signal_peptide = SO:0000418 @@ -188,9 +190,10 @@ except Exception: pass + if len(jobs) > 1 and num_threads > 1: # A small "info" message for Galaxy to show the user. - print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) + print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) results = run_jobs(jobs, num_threads) assert len(fasta_files) == len(temp_files) == len(jobs) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): @@ -201,8 +204,11 @@ output = "(no output)" if error_level or output.lower().startswith("error running"): clean_up(fasta_files + temp_files) - sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), - error_level) + if output: + sys.stderr.write("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output)) + else: + sys.stderr.write("One or more tasks failed, e.g. %i from %r with no output\n" % (error_level, cmd)) + sys.exit(error_level) del results out_handle = open(tabular_file, "w") diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/signalp3.xml --- a/tools/protein_analysis/signalp3.xml Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/signalp3.xml Thu Sep 21 11:35:20 2017 -0400 @@ -1,24 +1,19 @@ - + Find signal peptides in protein sequences - signalp signalp - - - - - - - signalp3.py $organism $truncate "\$GALAXY_SLOTS" $fasta_file $tabular_file - ##If the environment variable isn't set, get "", and the python wrapper - ##defaults to four threads. + +python $__tool_directory__/signalp3.py --version + + +python $__tool_directory__/signalp3.py $organism $truncate "\$GALAXY_SLOTS" '$fasta_file' '$tabular_file' - + @@ -35,36 +30,36 @@ - + - + - + - + - + - + **What it does** This calls the SignalP v3.0 tool for prediction of signal peptides, which uses both a Neural Network (NN) and Hidden Markov Model (HMM) to produce two sets of scores. @@ -83,12 +78,12 @@ **Neural Network Scores** -For each organism class (Eukaryote, Gram-negative and Gram-positive), two different neural networks are used, one for predicting the actual signal peptide and one for predicting the position of the signal peptidase I (SPase I) cleavage site. +For each organism class (Eukaryote, Gram-negative and Gram-positive), two different neural networks are used, one for predicting the actual signal peptide and one for predicting the position of the signal peptidase I (SPase I) cleavage site. The NN output comprises three different scores (C-max, S-max and Y-max) and two scores derived from them (S-mean and D-score). ====== ======= =============================================================== -Column Name Description +Column Name Description ------ ------- --------------------------------------------------------------- 2-4 C-score The C-score is the 'cleavage site' score. For each position in the submitted sequence, a C-score is reported, which should @@ -141,15 +136,15 @@ The raw output 'short' output from TMHMM v2.0 looks something like this (21 columns space separated - shown here formatted nicely). Notice that the identifiers are given twice, the first time truncated (as part of the NN predictions) and the second time in full (in the HMM predictions). -==================== ===== === = ===== === = ===== === = ===== = ===== = =================================== = ===== === = ===== = -# SignalP-NN euk predictions # SignalP-HMM euk predictions ------------------------------------------------------------------------------ ------------------------------------------------------------ -# name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? -gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N -gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N -gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N -gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N -==================== ===== === = ===== === = ===== === = ===== = ===== = =================================== = ===== === = ===== = +==================== ===== === = ===== === = ===== === = ===== = ===== = =================================== = ===== === = ===== = +# SignalP-NN euk predictions # SignalP-HMM euk predictions +----------------------------------------------------------------------------- ------------------------------------------------------------ +# name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? +gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N +gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N +gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N +gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N +==================== ===== === = ===== === = ===== === = ===== = ===== = =================================== = ===== === = ===== = In order to make this easier to use in Galaxy, the wrapper script simplifies this to remove the redundant column and use tabs for separation. It also includes a header line with unique column names. diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/suite_config.xml --- a/tools/protein_analysis/suite_config.xml Wed Feb 01 09:46:42 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ - - TMHMM, SignalP, RXLR motifs, WoLF PSORT - - Find transmembrane domains in protein sequences - - - Find signal peptides in protein sequences - - - Find eukaryotic PolII promoters in DNA sequences - - - Bacteria/archaea protein subcellular localization prediction - - - Eukaryote protein subcellular localization prediction - - - Find RXLR Effectors of Plant Pathogenic Oomycetes - - diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/tmhmm2.py --- a/tools/protein_analysis/tmhmm2.py Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/tmhmm2.py Thu Sep 21 11:35:20 2017 -0400 @@ -40,13 +40,20 @@ 64 bit machine with only the 32 bit binaries installed. This script will spot when there is no output from tmhmm2, and raise an error. """ -import sys + +from __future__ import print_function + import os +import sys import tempfile -from seq_analysis_utils import split_fasta, run_jobs, thread_count + +from seq_analysis_utils import run_jobs, split_fasta, thread_count FASTA_CHUNK = 500 +if "-v" in sys.argv or "--version" in sys.argv: + sys.exit("TMHMM wrapper version 0.0.16") + if len(sys.argv) != 4: sys.exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") @@ -81,10 +88,11 @@ assert topology.startswith("Topology="), line topology = topology[9:] out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" - % (identifier, length, exp_aa, first60, predhel, topology)) + % (identifier, length, exp_aa, first60, predhel, topology)) count += 1 return count + # Note that if the input FASTA file contains no sequences, # split_fasta returns an empty list (i.e. zero temp files). fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) @@ -103,9 +111,10 @@ except Exception: pass + if len(jobs) > 1 and num_threads > 1: # A small "info" message for Galaxy to show the user. - print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) + print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) results = run_jobs(jobs, num_threads) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): error_level = results[cmd] diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/tmhmm2.xml --- a/tools/protein_analysis/tmhmm2.xml Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/tmhmm2.xml Thu Sep 21 11:35:20 2017 -0400 @@ -1,28 +1,23 @@ - + Find transmembrane domains in protein sequences - tmhmm - tmhmm + tmhmm2 - - - - - - - tmhmm2.py "\$GALAXY_SLOTS" $fasta_file $tabular_file - ##If the environment variable isn't set, get "", and the python wrapper - ##defaults to four threads. + +python $__tool_directory__/tmhmm2.py --version + + +python $__tool_directory__/tmhmm2.py "\$GALAXY_SLOTS" '$fasta_file' '$tabular_file' - + @@ -40,7 +35,7 @@ - + **What it does** This calls the TMHMM v2.0 tool for prediction of transmembrane (TM) helices in proteins using a hidden Markov model (HMM). @@ -65,7 +60,7 @@ One of the most common mistakes by the program is to reverse the direction of proteins with one TM segment (i.e. mixing up which end of the protein is outside and inside the membrane). -Do not use the program to predict whether a non-membrane protein is cytoplasmic or not. +Do not use the program to predict whether a non-membrane protein is cytoplasmic or not. **Notes** @@ -82,7 +77,7 @@ In order to make it easier to use in Galaxy, the wrapper script simplifies this to remove the redundant tags, and instead adds a comment line at the top with the column names: =================================== === ===== ======= ======= ==================== -#ID len ExpAA First60 PredHel Topology +#ID len ExpAA First60 PredHel Topology gi|2781234|pdb|1JLY|B 304 0.01 0.00 0 o gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0.00 0 o gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/wolf_psort.py --- a/tools/protein_analysis/wolf_psort.py Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/wolf_psort.py Thu Sep 21 11:35:20 2017 -0400 @@ -33,9 +33,13 @@ normally use Python's multiprocessing library in this situation but it requires at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. """ -import sys + +from __future__ import print_function + import os -from seq_analysis_utils import split_fasta, run_jobs, thread_count +import sys + +from seq_analysis_utils import run_jobs, split_fasta, thread_count FASTA_CHUNK = 500 exe = "runWolfPsortSummary" @@ -61,6 +65,9 @@ https://lists.galaxyproject.org/pipermail/galaxy-dev/2015-December/023386.html """ +if "-v" in sys.argv or "--version" in sys.argv: + sys.exit("WoLF-PSORT wrapper version 0.0.11") + if len(sys.argv) != 5: sys.exit("Require four arguments, organism, threads, input protein FASTA file & output tabular file") @@ -84,6 +91,7 @@ out_handle.write("%s\t%s\t%s\t%i\n" % (name, comp, score, rank + 1)) + fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK) temp_files = [f + ".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) @@ -97,9 +105,10 @@ if os.path.isfile(f): os.remove(f) + if len(jobs) > 1 and num_threads > 1: # A small "info" message for Galaxy to show the user. - print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) + print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) results = run_jobs(jobs, num_threads) assert len(fasta_files) == len(temp_files) == len(jobs) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): diff -r f3ecd80850e2 -r a19b3ded8f33 tools/protein_analysis/wolf_psort.xml --- a/tools/protein_analysis/wolf_psort.xml Wed Feb 01 09:46:42 2017 -0500 +++ b/tools/protein_analysis/wolf_psort.xml Thu Sep 21 11:35:20 2017 -0400 @@ -1,21 +1,16 @@ - + Eukaryote protein subcellular localization prediction - runWolfPsortSummary - psort + wolfpsort - - - - - - - wolf_psort.py $organism "\$GALAXY_SLOTS" "$fasta_file" "$tabular_file" - ##If the environment variable isn't set, get "", and python wrapper - ##defaults to four threads. + +python $__tool_directory__/wolf_psort.py --version + + +python $__tool_directory__/wolf_psort.py $organism "\$GALAXY_SLOTS" '$fasta_file' '$tabular_file' - + @@ -48,7 +43,7 @@ - + **What it does** This calls the WoLF PSORT tool for prediction of eukaryote protein subcellular localization. @@ -78,18 +73,18 @@ E.R. endoplasmic reticulum 0005783 extr extracellular 0005576, 0005618 golg Golgi apparatus 0005794(1) -lyso lysosome 0005764 -mito mitochondria 0005739 -nucl nuclear 0005634 -pero peroxisome 0005777(2) -plas plasma membrane 0005886 +lyso lysosome 0005764 +mito mitochondria 0005739 +nucl nuclear 0005634 +pero peroxisome 0005777(2) +plas plasma membrane 0005886 vacu vacuolar membrane 0005774(2) ====== ===================== ===================== Numbers in parentheses, such as "0005856(2)" indicate that descendant "part_of" cellular components were also included, up to the specified depth (2 in this case). For example, all of the children and grandchildren of "GO:0005856" were -included as "cysk". +included as "cysk". Additionally compound predictions like mito_nucl are also given.