Mercurial repository: peterjc / tmhmm_and_signalp
changeset 19:f3ecd80850e2 draft
v0.2.9 Python style improvements
author    peterjc
date      Wed, 01 Feb 2017 09:46:42 -0500
parents   eb6ac44d4b8e
children  a19b3ded8f33
files     tools/protein_analysis/README.rst
          tools/protein_analysis/promoter2.py
          tools/protein_analysis/psortb.py
          tools/protein_analysis/rxlr_motifs.py
          tools/protein_analysis/seq_analysis_utils.py
          tools/protein_analysis/signalp3.py
          tools/protein_analysis/tmhmm2.py
          tools/protein_analysis/wolf_psort.py
diffstat  8 files changed, 187 insertions(+), 172 deletions(-)
--- a/tools/protein_analysis/README.rst	Tue Sep 01 09:56:36 2015 -0400
+++ b/tools/protein_analysis/README.rst	Wed Feb 01 09:46:42 2017 -0500
@@ -178,6 +178,7 @@
 v0.2.8 - Reorder XML elements (internal change only).
        - Planemo for Tool Shed upload (``.shed.yml``, internal change only).
        - Record version of Promoter 2 via ``<version_command>``.
+v0.2.9 - Further style cleanup in Python scripts (internal change only).
 ======= ======================================================================
--- a/tools/protein_analysis/promoter2.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/promoter2.py Wed Feb 01 09:46:42 2017 -0500 @@ -30,7 +30,7 @@ import os import commands import tempfile -from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count +from seq_analysis_utils import split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 @@ -38,91 +38,94 @@ sys.exit(os.system("promoter -V")) if len(sys.argv) != 4: - sys_exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " - "Got %i arguments." % (len(sys.argv)-1)) + sys.exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " + "Got %i arguments." % (len(sys.argv) - 1)) -num_threads = thread_count(sys.argv[3],default=4) +num_threads = thread_count(sys.argv[3], default=4) fasta_file = os.path.abspath(sys.argv[2]) tabular_file = os.path.abspath(sys.argv[3]) tmp_dir = tempfile.mkdtemp() + def get_path_and_binary(): - platform = commands.getoutput("uname") #e.g. Linux + platform = commands.getoutput("uname") # e.g. Linux shell_script = commands.getoutput("which promoter") if not os.path.isfile(shell_script): - sys_exit("ERROR: Missing promoter executable shell script") + sys.exit("ERROR: Missing promoter executable shell script") path = None for line in open(shell_script): - if line.startswith("setenv"): #could then be tab or space! + if line.startswith("setenv"): # could then be tab or space! parts = line.rstrip().split(None, 2) if parts[0] == "setenv" and parts[1] == "PROM": path = parts[2] if not path: - sys_exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) + sys.exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) if not os.path.isdir(path): - sys_exit("ERROR: %r is not a directory" % path) + sys.exit("ERROR: %r is not a directory" % path) bin = "%s/bin/promoter_%s" % (path, platform) if not os.path.isfile(bin): - sys_exit("ERROR: Missing promoter binary %r" % bin) + sys.exit("ERROR: Missing promoter binary %r" % bin) return path, bin + def make_tabular(raw_handle, out_handle): """Parse text output into tabular, return query count.""" identifier = None queries = 0 for line in raw_handle: - #print repr(line) + # print repr(line) if not line.strip() or line == "Promoter prediction:\n": pass elif line[0] != " ": - identifier = line.strip().replace("\t", " ").split(None,1)[0] + identifier = line.strip().replace("\t", " ").split(None, 1)[0] queries += 1 elif line == " No promoter predicted\n": - #End of a record + # End of a record identifier = None elif line == " Position Score Likelihood\n": assert identifier else: try: - position, score, likelihood = line.strip().split(None,2) + position, score, likelihood = line.strip().split(None, 2) except ValueError: print "WARNING: Problem with line: %r" % line continue - #sys_exit("ERROR: Problem with line: %r" % line) + # sys.exit("ERROR: Problem with line: %r" % line) if likelihood not in ["ignored", "Marginal prediction", "Medium likely prediction", "Highly likely prediction"]: - sys_exit("ERROR: Problem with line: %r" % line) + sys.exit("ERROR: Problem with line: %r" % line) out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) return queries - + working_dir, bin = get_path_and_binary() if not os.path.isfile(fasta_file): - sys_exit("ERROR: Missing input FASTA file %r" % fasta_file) + sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) -#Note that if the input FASTA file contains no sequences, 
-#split_fasta returns an empty list (i.e. zero temp files). -#We deliberately omit the FASTA descriptions to avoid a -#bug in promoter2 with descriptions over 200 characters. +# Note that if the input FASTA file contains no sequences, +# split_fasta returns an empty list (i.e. zero temp files). +# We deliberately omit the FASTA descriptions to avoid a +# bug in promoter2 with descriptions over 200 characters. fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] jobs = ["%s %s > %s" % (bin, fasta, temp) for fasta, temp in zip(fasta_files, temp_files)] + def clean_up(file_list): for f in file_list: if os.path.isfile(f): os.remove(f) try: os.rmdir(tmp_dir) - except: + except Exception: pass if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) cur_dir = os.path.abspath(os.curdir) os.chdir(working_dir) @@ -136,7 +139,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results @@ -151,7 +154,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - sys_exit("No output from promoter2") + sys.exit("No output from promoter2") queries += count out_handle.close()
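The recurring edit in promoter2.py (and in the files below) is dropping the local sys_exit helper in favour of the standard sys.exit. For orientation, a minimal sketch of the two idioms, assuming only the standard library (the report_error name is illustrative, not from the changeset): passing a non-integer argument to sys.exit writes it to stderr and the process exits with status 1, while exiting with a specific code and a message needs an explicit write to stderr first.

    import sys

    def report_error(msg, error_level=1):
        # Roughly what the removed seq_analysis_utils.sys_exit helper did:
        # print the message to stderr, then exit with the requested status.
        sys.stderr.write("%s\n" % msg)
        sys.exit(error_level)

    # The plain sys.exit idiom used after this changeset: a string argument
    # is written to stderr and the interpreter exits with status 1.
    if len(sys.argv) != 4:
        sys.exit("Require three arguments, number of threads (int), input DNA "
                 "FASTA file & output tabular file. Got %i arguments."
                 % (len(sys.argv) - 1))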
--- a/tools/protein_analysis/psortb.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/psortb.py Wed Feb 01 09:46:42 2017 -0500 @@ -24,7 +24,7 @@ import sys import os import tempfile -from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count +from seq_analysis_utils import split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 @@ -33,7 +33,7 @@ sys.exit(os.system("psort --version")) if len(sys.argv) != 8: - sys_exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " + sys.exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " "output (e.g. terse/normal/long), cutoff, divergent, input protein " "FASTA file & output tabular file") @@ -56,10 +56,10 @@ if out_type == "terse": header = ['SeqID', 'Localization', 'Score'] elif out_type == "normal": - sys_exit("Normal output not implemented yet, sorry.") + sys.exit("Normal output not implemented yet, sorry.") elif out_type == "long": if org_type == "-n": - #Gram negative bacteria + # Gram negative bacteria header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', @@ -71,7 +71,7 @@ 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 'Secondary_Localization', 'PSortb_Version'] elif org_type == "-p": - #Gram positive bacteria + # Gram positive bacteria header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', @@ -82,7 +82,7 @@ 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 'Secondary_Localization', 'PSortb_Version'] elif org_type == "-a": - #Archaea + # Archaea header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', @@ -93,27 +93,28 @@ 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 'Secondary_Localization', 'PSortb_Version'] else: - sys_exit("Expected -n, -p or -a for the organism type, not %r" % org_type) + sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type) else: - sys_exit("Expected terse, normal or long for the output type, not %r" % out_type) + sys.exit("Expected terse, normal or long for the output type, not %r" % out_type) tmp_dir = tempfile.mkdtemp() + def clean_tabular(raw_handle, out_handle): """Clean up tabular TMHMM output, returns output line count.""" global header count = 0 for line in raw_handle: if not line.strip() or line.startswith("#"): - #Ignore any blank lines or comment lines + # Ignore any blank lines or comment lines continue parts = [x.strip() for x in line.rstrip("\r\n").split("\t")] if parts == header: - #Ignore the header line + # Ignore the header line continue if not parts[-1] and len(parts) == len(header) + 1: - #Ignore dummy blank extra column, e.g. - #"...2.0\t\tPSORTb version 3.0\t\n" + # Ignore dummy blank extra column, e.g. 
+ # "...2.0\t\tPSORTb version 3.0\t\n" parts = parts[:-1] assert len(parts) == len(header), \ "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) @@ -121,24 +122,25 @@ count += 1 return count -#Note that if the input FASTA file contains no sequences, -#split_fasta returns an empty list (i.e. zero temp files). +# Note that if the input FASTA file contains no sequences, +# split_fasta returns an empty list (i.e. zero temp files). fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) for fasta, temp in zip(fasta_files, temp_files)] + def clean_up(file_list): for f in file_list: if os.path.isfile(f): os.remove(f) try: os.rmdir(tmp_dir) - except: + except Exception: pass if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): @@ -149,7 +151,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results del jobs @@ -163,7 +165,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - sys_exit("No output from psortb") + sys.exit("No output from psortb") out_handle.close() print "%i records" % count
--- a/tools/protein_analysis/rxlr_motifs.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/rxlr_motifs.py Wed Feb 01 09:46:42 2017 -0500 @@ -40,14 +40,14 @@ import sys import re import subprocess -from seq_analysis_utils import sys_exit, fasta_iterator +from seq_analysis_utils import fasta_iterator if "-v" in sys.argv: print("RXLR Motifs v0.0.10") sys.exit(0) if len(sys.argv) != 5: - sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") + sys.exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") fasta_file, threads, model, tabular_file = sys.argv[1:] hmm_output_file = tabular_file + ".hmm.tmp" @@ -86,8 +86,8 @@ min_rxlr_start = 1 max_rxlr_start = max_sp + max_sp_rxlr else: - sys_exit("Did not recognise the model name %r\n" - "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) + sys.exit("Did not recognise the model name %r\n" + "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) def get_hmmer_version(exe, required=None): @@ -105,23 +105,23 @@ return 3 else: raise ValueError("Could not determine version of %s" % exe) - + -#Run hmmsearch for Whisson et al. (2007) +# Run hmmsearch for Whisson et al. (2007) if model == "Whisson2007": hmm_file = os.path.join(os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm") if not os.path.isfile(hmm_file): - sys_exit("Missing HMM file for Whisson et al. (2007)") + sys.exit("Missing HMM file for Whisson et al. (2007)") if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): - sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) + sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) hmm_hits = set() valid_ids = set() for title, seq in fasta_iterator(fasta_file): - name = title.split(None,1)[0] + name = title.split(None, 1)[0] if name in valid_ids: - sys_exit("Duplicated identifier %r" % name) + sys.exit("Duplicated identifier %r" % name) else: valid_ids.add(name) if not valid_ids: @@ -146,7 +146,7 @@ % (hmmer_search, hmm_file, fasta_file, hmm_output_file) return_code = os.system(cmd) if return_code: - sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) + sys.exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) handle = open(hmm_output_file) for line in handle: @@ -157,18 +157,18 @@ # Header continue else: - name = line.split(None,1)[0] - #Should be a sequence name in the HMMER3 table output. - #Could be anything in the HMMER2 stdout. + name = line.split(None, 1)[0] + # Should be a sequence name in the HMMER3 table output. + # Could be anything in the HMMER2 stdout. 
if name in valid_ids: hmm_hits.add(name) elif hmmer3: - sys_exit("Unexpected identifer %r in hmmsearch output" % name) + sys.exit("Unexpected identifer %r in hmmsearch output" % name) handle.close() # if hmmer3: # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) # else: - # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) + # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) os.remove(hmm_output_file) del valid_ids @@ -181,8 +181,8 @@ handle = open(signalp_input_file, "w") for title, seq in fasta_iterator(fasta_file): total += 1 - name = title.split(None,1)[0] - match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) + name = title.split(None, 1)[0] + match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: # This is a potential RXLR, depending on the SignalP results. # Might as well truncate the sequence now, makes the temp file smaller @@ -199,11 +199,11 @@ # Run SignalP (using our wrapper script to get multi-core support etc) signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") if not os.path.isfile(signalp_script): - sys_exit("Error - missing signalp3.py script") + sys.exit("Error - missing signalp3.py script") cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) return_code = os.system(cmd) if return_code: - sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd)) + sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd)) # print "SignalP done" @@ -217,8 +217,8 @@ assert line.startswith("#ID\t"), line for line in handle: parts = line.rstrip("\t").split("\t") - assert len(parts)==20, repr(line) - yield parts[0], float(parts[18]), int(parts[5])-1 + assert len(parts) == 20, repr(line) + yield parts[0], float(parts[18]), int(parts[5]) - 1 handle.close() @@ -231,12 +231,12 @@ for title, seq in fasta_iterator(fasta_file): total += 1 rxlr = "N" - name = title.split(None,1)[0] - match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) + name = title.split(None, 1)[0] + match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: del match # This was the criteria for calling SignalP, - #so it will be in the SignalP results. + # so it will be in the SignalP results. sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() assert name == sp_id, "%s vs %s" % (name, sp_id) if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp:
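The rxlr_motifs.py hunks above are spacing fixes, but the windowed motif test they touch is easier to read in isolation. A sketch of the idea, with purely illustrative pattern and window values (the real regular expression and positions depend on the chosen model: Bhattacharjee2006, Win2007 or Whisson2007):

    import re

    re_rxlr = re.compile("R.LR")  # illustrative pattern only
    min_rxlr_start = 30           # earliest 1-based position the motif may start
    max_rxlr_start = 60           # latest 1-based position the motif may start

    def has_rxlr_in_window(seq):
        """True if the motif starts within [min_rxlr_start, max_rxlr_start]."""
        match = re_rxlr.search(seq[min_rxlr_start - 1:].upper())
        if not match:
            return False
        # Convert the match offset back to a 1-based position in the full sequence:
        return min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start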
--- a/tools/protein_analysis/seq_analysis_utils.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/seq_analysis_utils.py Wed Feb 01 09:46:42 2017 -0500 @@ -12,17 +12,12 @@ import subprocess from time import sleep -__version__ = "0.0.1" - -def sys_exit(msg, error_level=1): - """Print error message to stdout and quit with given error level.""" - sys.stderr.write("%s\n" % msg) - sys.exit(error_level) +__version__ = "0.0.2" try: from multiprocessing import cpu_count except ImportError: - #Must be under Python 2.5, this is copied from multiprocessing: + # Must be under Python 2.5, this is copied from multiprocessing: def cpu_count(): """Returns the number of CPUs in the system.""" if sys.platform == 'win32': @@ -54,18 +49,18 @@ def thread_count(command_line_arg, default=1): try: num = int(command_line_arg) - except: + except ValueError: num = default if num < 1: - sys_exit("Threads argument %r is not a positive integer" % command_line_arg) - #Cap this with the pysical limit of the machine, + sys.exit("Threads argument %r is not a positive integer" % command_line_arg) + # Cap this with the pysical limit of the machine, try: num = min(num, cpu_count()) except NotImplementedError: pass - #For debugging, - #hostname = os.environ.get("HOSTNAME", "this machine") - #sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) + # For debugging, + # hostname = os.environ.get("HOSTNAME", "this machine") + # sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) return num @@ -79,7 +74,7 @@ if truncate: seq = seq[:truncate] if max_len and len(seq) > max_len: - raise ValueError("Sequence %s is length %i, max length %i" \ + raise ValueError("Sequence %s is length %i, max length %i" % (title.split()[0], len(seq), max_len)) yield title, seq title = line[1:].rstrip() @@ -87,8 +82,8 @@ elif title: seq += line.strip() elif not line.strip() or line.startswith("#"): - #Ignore blank lines, and any comment lines - #between records (starting with hash). + # Ignore blank lines, and any comment lines + # between records (starting with hash). pass else: handle.close() @@ -98,11 +93,12 @@ if truncate: seq = seq[:truncate] if max_len and len(seq) > max_len: - raise ValueError("Sequence %s is length %i, max length %i" \ + raise ValueError("Sequence %s is length %i, max length %i" % (title.split()[0], len(seq), max_len)) yield title, seq raise StopIteration + def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): """Split FASTA file into sub-files each of at most n sequences. 
@@ -132,20 +128,20 @@ for title, seq in records: handle.write(">%s\n" % title) for i in range(0, len(seq), 60): - handle.write(seq[i:i+60] + "\n") + handle.write(seq[i:i + 60] + "\n") else: for title, seq in records: handle.write(">%s\n" % title.split()[0]) for i in range(0, len(seq), 60): - handle.write(seq[i:i+60] + "\n") + handle.write(seq[i:i + 60] + "\n") handle.close() files.append(new_filename) - #print "%i records in %s" % (len(records), new_filename) + # print "%i records in %s" % (len(records), new_filename) except ValueError, err: - #Max length failure from parser - clean up + # Max length failure from parser - clean up try: handle.close() - except: + except Exception: pass for f in files: if os.path.isfile(f): @@ -155,35 +151,36 @@ assert os.path.isfile(f), "Missing split file %r (!??)" % f return files + def run_jobs(jobs, threads, pause=10, verbose=False): """Takes list of cmd strings, returns dict with error levels.""" pending = jobs[:] running = [] results = {} if threads == 1: - #Special case this for speed, don't need the waits + # Special case this for speed, don't need the waits for cmd in jobs: results[cmd] = subprocess.call(cmd, shell=True) return results while pending or running: - #See if any have finished + # See if any have finished for (cmd, process) in running: - return_code = process.poll() #non-blocking + return_code = process.poll() # non-blocking if return_code is not None: results[cmd] = return_code - running = [(cmd, process) for (cmd, process) in running \ + running = [(cmd, process) for (cmd, process) in running if cmd not in results] if verbose: print "%i jobs pending, %i running, %i completed" \ % (len(pending), len(running), len(results)) - #See if we can start any new threads + # See if we can start any new threads while pending and len(running) < threads: cmd = pending.pop(0) if verbose: print cmd process = subprocess.Popen(cmd, shell=True) running.append((cmd, process)) - #Loop... + # Loop... sleep(pause) if verbose: print "%i jobs completed" % len(results)
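seq_analysis_utils.py keeps the same poll-and-sleep job scheduler, just reformatted. A condensed sketch of that pattern, assuming only the standard library (renamed here, with the single-thread special case and verbose reporting omitted):

    import subprocess
    from time import sleep

    def run_shell_jobs(jobs, threads, pause=10):
        """Run shell commands, at most `threads` at once; return {cmd: exit code}."""
        pending = jobs[:]
        running = []
        results = {}
        while pending or running:
            # Collect any processes that have finished (poll() is non-blocking).
            for cmd, process in running:
                return_code = process.poll()
                if return_code is not None:
                    results[cmd] = return_code
            running = [(cmd, p) for (cmd, p) in running if cmd not in results]
            # Start new processes while there is spare capacity.
            while pending and len(running) < threads:
                cmd = pending.pop(0)
                running.append((cmd, subprocess.Popen(cmd, shell=True)))
            sleep(pause)
        return results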
--- a/tools/protein_analysis/signalp3.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/signalp3.py Wed Feb 01 09:46:42 2017 -0500 @@ -21,9 +21,9 @@ # SignalP-NN euk predictions # SignalP-HMM euk predictions # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? -gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N -gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N -gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N +gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N +gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N +gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N In order to make it easier to use in Galaxy, this wrapper script reformats @@ -56,28 +56,28 @@ import sys import os import tempfile -from seq_analysis_utils import sys_exit, split_fasta, fasta_iterator +from seq_analysis_utils import split_fasta, fasta_iterator from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 -MAX_LEN = 6000 #Found by trial and error +MAX_LEN = 6000 # Found by trial and error -if len(sys.argv) not in [6,8]: - sys_exit("Require five (or 7) arguments, organism, truncate, threads, " +if len(sys.argv) not in [6, 8]: + sys.exit("Require five (or 7) arguments, organism, truncate, threads, " "input protein FASTA file & output tabular file (plus " "optionally cut method and GFF3 output file). " - "Got %i arguments." % (len(sys.argv)-1)) + "Got %i arguments." 
% (len(sys.argv) - 1)) organism = sys.argv[1] if organism not in ["euk", "gram+", "gram-"]: - sys_exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) + sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) try: truncate = int(sys.argv[2]) -except: +except ValueError: truncate = 0 if truncate < 0: - sys_exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) + sys.exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) num_threads = thread_count(sys.argv[3], default=4) fasta_file = sys.argv[4] @@ -86,7 +86,7 @@ if len(sys.argv) == 8: cut_method = sys.argv[6] if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: - sys_exit("Invalid cut method %r" % cut_method) + sys.exit("Invalid cut method %r" % cut_method) gff3_file = sys.argv[7] else: cut_method = None @@ -95,39 +95,41 @@ tmp_dir = tempfile.mkdtemp() + def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): """Clean up SignalP output to make it tabular.""" if cut_method: - cut_col = {"NN_Cmax" : 2, - "NN_Ymax" : 5, - "NN_Smax" : 8, - "HMM_Cmax" : 16}[cut_method] + cut_col = {"NN_Cmax": 2, + "NN_Ymax": 5, + "NN_Smax": 8, + "HMM_Cmax": 16}[cut_method] else: cut_col = None for line in raw_handle: if not line or line.startswith("#"): continue parts = line.rstrip("\r\n").split() - assert len(parts)==21, repr(line) + assert len(parts) == 21, repr(line) assert parts[14].startswith(parts[0]), \ "Bad entry in SignalP output, ID miss-match:\n%r" % line - #Remove redundant truncated name column (col 0) - #and put full name at start (col 14) + # Remove redundant truncated name column (col 0) + # and put full name at start (col 14) parts = parts[14:15] + parts[1:14] + parts[15:] out_handle.write("\t".join(parts) + "\n") + def make_gff(fasta_file, tabular_file, gff_file, cut_method): - cut_col, score_col = {"NN_Cmax" : (2,1), - "NN_Ymax" : (5,4), - "NN_Smax" : (8,7), - "HMM_Cmax" : (16,15), + cut_col, score_col = {"NN_Cmax": (2, 1), + "NN_Ymax": (5, 4), + "NN_Smax": (8, 7), + "HMM_Cmax": (16, 15), }[cut_method] source = "SignalP" - strand = "." #not stranded - phase = "." #not phased + strand = "." # not stranded + phase = "." # not phased tags = "Note=%s" % cut_method - + tab_handle = open(tabular_file) line = tab_handle.readline() assert line.startswith("#ID\t"), line @@ -139,53 +141,55 @@ parts = line.rstrip("\n").split("\t") seqid = parts[0] assert title.startswith(seqid), "%s vs %s" % (seqid, title) - if len(seq)==0: - #Is it possible to have a zero length reference in GFF3? + if not seq: + # Is it possible to have a zero length reference in GFF3? continue cut = int(parts[cut_col]) if cut == 0: assert cut_method == "HMM_Cmax", cut_method - #TODO - Why does it do this? + # TODO - Why does it do this? cut = 1 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) score = parts[score_col] - gff_handle.write("##sequence-region %s %i %i\n" \ + gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, len(seq))) - #If the cut is at the very begining, there is no signal peptide! + # If the cut is at the very begining, there is no signal peptide! 
if cut > 1: - #signal_peptide = SO:0000418 - gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ + # signal_peptide = SO:0000418 + gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" % (seqid, source, - "signal_peptide", 1, cut-1, + "signal_peptide", 1, cut - 1, score, strand, phase, tags)) - #mature_protein_region = SO:0000419 - gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ + # mature_protein_region = SO:0000419 + gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" % (seqid, source, "mature_protein_region", cut, len(seq), score, strand, phase, tags)) - tab_handle.close() + tab_handle.close() gff_handle.close() fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) for (fasta, temp) in zip(fasta_files, temp_files)] assert len(fasta_files) == len(temp_files) == len(jobs) + def clean_up(file_list): + """Remove temp files, and if possible the temp directory.""" for f in file_list: if os.path.isfile(f): os.remove(f) try: os.rmdir(tmp_dir) - except: + except Exception: pass if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) assert len(fasta_files) == len(temp_files) == len(jobs) @@ -197,17 +201,17 @@ output = "(no output)" if error_level or output.lower().startswith("error running"): clean_up(fasta_files + temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results out_handle = open(tabular_file, "w") fields = ["ID"] -#NN results: +# NN results: for name in ["Cmax", "Ymax", "Smax"]: - fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name]) + fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name]) fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) -#HMM results: +# HMM results: fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", "HMM_Sprob_score", "HMM_Sprob_pred"]) out_handle.write("#" + "\t".join(fields) + "\n") @@ -217,7 +221,7 @@ data_handle.close() out_handle.close() -#GFF3: +# GFF3: if cut_method: make_gff(fasta_file, tabular_file, gff3_file, cut_method)
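The make_gff lines reworked above only change quoting, spacing and comments. As a sketch of the two GFF3 features written per prediction (Sequence Ontology terms as named in the script; in this simplified version the mature protein feature is written whether or not a signal peptide was predicted, which may differ from the wrapper's exact control flow):

    def write_signalp_gff(gff_handle, seqid, seq_len, cut, score, cut_method):
        """Write signal peptide and mature protein features for one sequence.

        `cut` is the 1-based first residue of the mature protein.
        """
        source = "SignalP"
        strand = phase = "."  # protein features are neither stranded nor phased
        tags = "Note=%s" % cut_method
        gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, seq_len))
        if cut > 1:
            # signal_peptide = SO:0000418, residues 1 to cut - 1
            gff_handle.write("%s\t%s\tsignal_peptide\t%i\t%i\t%s\t%s\t%s\t%s\n"
                             % (seqid, source, 1, cut - 1, score, strand, phase, tags))
        # mature_protein_region = SO:0000419, residues cut to the end
        gff_handle.write("%s\t%s\tmature_protein_region\t%i\t%i\t%s\t%s\t%s\t%s\n"
                         % (seqid, source, cut, seq_len, score, strand, phase, tags))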
--- a/tools/protein_analysis/tmhmm2.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/tmhmm2.py Wed Feb 01 09:46:42 2017 -0500 @@ -21,7 +21,7 @@ this to remove the redundant tags, and instead adds a comment line at the top with the column names: - #ID len ExpAA First60 PredHel Topology + #ID len ExpAA First60 PredHel Topology gi|2781234|pdb|1JLY|B 304 0.01 60 0.00 0 o gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0 0.00 0 o gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o @@ -43,12 +43,12 @@ import sys import os import tempfile -from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count +from seq_analysis_utils import split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: - sys_exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") + sys.exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") num_threads = thread_count(sys.argv[1], default=4) fasta_file = sys.argv[2] @@ -56,52 +56,55 @@ tmp_dir = tempfile.mkdtemp() + def clean_tabular(raw_handle, out_handle): """Clean up tabular TMHMM output, returns output line count.""" count = 0 for line in raw_handle: if not line.strip() or line.startswith("#"): - #Ignore any blank lines or comment lines + # Ignore any blank lines or comment lines continue parts = line.rstrip("\r\n").split("\t") try: - identifier, length, expAA, first60, predhel, topology = parts - except: - assert len(parts)!=6 - sys_exit("Bad line: %r" % line) + identifier, length, exp_aa, first60, predhel, topology = parts + except ValueError: + assert len(parts) != 6 + sys.exit("Bad line: %r" % line) assert length.startswith("len="), line length = length[4:] - assert expAA.startswith("ExpAA="), line - expAA = expAA[6:] + assert exp_aa.startswith("ExpAA="), line + exp_aa = exp_aa[6:] assert first60.startswith("First60="), line first60 = first60[8:] assert predhel.startswith("PredHel="), line predhel = predhel[8:] assert topology.startswith("Topology="), line topology = topology[9:] - out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \ - % (identifier, length, expAA, first60, predhel, topology)) + out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" + % (identifier, length, exp_aa, first60, predhel, topology)) count += 1 return count -#Note that if the input FASTA file contains no sequences, -#split_fasta returns an empty list (i.e. zero temp files). +# Note that if the input FASTA file contains no sequences, +# split_fasta returns an empty list (i.e. zero temp files). fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] jobs = ["tmhmm -short %s > %s" % (fasta, temp) for fasta, temp in zip(fasta_files, temp_files)] + def clean_up(file_list): + """Remove temp files, and if possible the temp directory.""" for f in file_list: if os.path.isfile(f): os.remove(f) try: os.rmdir(tmp_dir) - except: + except Exception: pass if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): @@ -112,7 +115,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys_exit("One or more tasks failed, e.g. 
%i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results del jobs @@ -125,7 +128,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - sys_exit("No output from tmhmm2") + sys.exit("No output from tmhmm2") out_handle.close() clean_up(fasta_files + temp_files)
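The tmhmm2.py hunk renames expAA to exp_aa and narrows the bare excepts, without changing the parsing itself. A small sketch of that prefix stripping for one line of `tmhmm -short` output (field layout as shown in the diff; the function name and example values are made up here):

    def parse_tmhmm_short(line):
        """Split one `tmhmm -short` line into plain tab-separated values."""
        identifier, length, exp_aa, first60, predhel, topology = \
            line.rstrip("\r\n").split("\t")
        # Each field after the identifier is prefixed with its name, e.g. "len=304".
        return (identifier,
                length[len("len="):],
                exp_aa[len("ExpAA="):],
                first60[len("First60="):],
                predhel[len("PredHel="):],
                topology[len("Topology="):])

    # e.g. parse_tmhmm_short("gi|2781234|pdb|1JLY|B\tlen=304\tExpAA=0.01\t"
    #                        "First60=0.00\tPredHel=0\tTopology=o\n")
    # -> ('gi|2781234|pdb|1JLY|B', '304', '0.01', '0.00', '0', 'o')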
--- a/tools/protein_analysis/wolf_psort.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/wolf_psort.py Wed Feb 01 09:46:42 2017 -0500 @@ -35,7 +35,7 @@ """ import sys import os -from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count +from seq_analysis_utils import split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 exe = "runWolfPsortSummary" @@ -56,44 +56,49 @@ return_code = subprocess.call(args) os.chdir(saved_dir) sys.exit(return_code) + +For more details on this workaround, see: +https://lists.galaxyproject.org/pipermail/galaxy-dev/2015-December/023386.html """ if len(sys.argv) != 5: - sys_exit("Require four arguments, organism, threads, input protein FASTA file & output tabular file") + sys.exit("Require four arguments, organism, threads, input protein FASTA file & output tabular file") organism = sys.argv[1] if organism not in ["animal", "plant", "fungi"]: - sys_exit("Organism argument %s is not one of animal, plant, fungi" % organism) + sys.exit("Organism argument %s is not one of animal, plant, fungi" % organism) num_threads = thread_count(sys.argv[2], default=4) fasta_file = sys.argv[3] tabular_file = sys.argv[4] + def clean_tabular(raw_handle, out_handle): """Clean up WoLF PSORT output to make it tabular.""" for line in raw_handle: if not line or line.startswith("#"): continue - name, data = line.rstrip("\r\n").split(None,1) + name, data = line.rstrip("\r\n").split(None, 1) for rank, comp_data in enumerate(data.split(",")): comp, score = comp_data.split() - out_handle.write("%s\t%s\t%s\t%i\n" \ - % (name, comp, score, rank+1)) + out_handle.write("%s\t%s\t%s\t%i\n" + % (name, comp, score, rank + 1)) fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) jobs = ["%s %s < %s > %s" % (exe, organism, fasta, temp) for (fasta, temp) in zip(fasta_files, temp_files)] assert len(fasta_files) == len(temp_files) == len(jobs) + def clean_up(file_list): for f in file_list: if os.path.isfile(f): os.remove(f) if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) assert len(fasta_files) == len(temp_files) == len(jobs) @@ -106,7 +111,7 @@ if error_level or output.lower().startswith("error running"): clean_up(fasta_files) clean_up(temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results
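wolf_psort.py's clean_tabular also keeps its logic, so as a final sketch, this is how one summary line from runWolfPsortSummary expands into ranked tabular rows (the function name and input line here are illustrative):

    def expand_wolf_psort_line(line):
        """Turn 'name comp1 score1, comp2 score2, ...' into (name, comp, score, rank) rows."""
        name, data = line.rstrip("\r\n").split(None, 1)
        rows = []
        for rank, comp_data in enumerate(data.split(",")):
            comp, score = comp_data.split()
            rows.append((name, comp, score, rank + 1))
        return rows

    # e.g. expand_wolf_psort_line("query1 extr 12, mito 4, cyto 2\n")
    # -> [('query1', 'extr', '12', 1), ('query1', 'mito', '4', 2), ('query1', 'cyto', '2', 3)]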