Previous changeset 18:eb6ac44d4b8e (2015-09-01) Next changeset 20:a19b3ded8f33 (2017-09-21) |
Commit message:
v0.2.9 Python style improvements |
modified:
tools/protein_analysis/README.rst tools/protein_analysis/promoter2.py tools/protein_analysis/psortb.py tools/protein_analysis/rxlr_motifs.py tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.py tools/protein_analysis/tmhmm2.py tools/protein_analysis/wolf_psort.py |
b |
diff -r eb6ac44d4b8e -r f3ecd80850e2 tools/protein_analysis/README.rst --- a/tools/protein_analysis/README.rst Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/README.rst Wed Feb 01 09:46:42 2017 -0500 |
b |
@@ -178,6 +178,7 @@ v0.2.8 - Reorder XML elements (internal change only). - Planemo for Tool Shed upload (``.shed.yml``, internal change only). - Record version of Promoter 2 via ``<version_command>``. +v0.2.9 - Further style cleanup in Python scripts (internal change only). ======= ====================================================================== |
b |
diff -r eb6ac44d4b8e -r f3ecd80850e2 tools/protein_analysis/promoter2.py --- a/tools/protein_analysis/promoter2.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/promoter2.py Wed Feb 01 09:46:42 2017 -0500 |
[ |
@@ -30,7 +30,7 @@ import os import commands import tempfile -from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count +from seq_analysis_utils import split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 @@ -38,91 +38,94 @@ sys.exit(os.system("promoter -V")) if len(sys.argv) != 4: - sys_exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " - "Got %i arguments." % (len(sys.argv)-1)) + sys.exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " + "Got %i arguments." % (len(sys.argv) - 1)) -num_threads = thread_count(sys.argv[3],default=4) +num_threads = thread_count(sys.argv[3], default=4) fasta_file = os.path.abspath(sys.argv[2]) tabular_file = os.path.abspath(sys.argv[3]) tmp_dir = tempfile.mkdtemp() + def get_path_and_binary(): - platform = commands.getoutput("uname") #e.g. Linux + platform = commands.getoutput("uname") # e.g. Linux shell_script = commands.getoutput("which promoter") if not os.path.isfile(shell_script): - sys_exit("ERROR: Missing promoter executable shell script") + sys.exit("ERROR: Missing promoter executable shell script") path = None for line in open(shell_script): - if line.startswith("setenv"): #could then be tab or space! + if line.startswith("setenv"): # could then be tab or space! parts = line.rstrip().split(None, 2) if parts[0] == "setenv" and parts[1] == "PROM": path = parts[2] if not path: - sys_exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) + sys.exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) if not os.path.isdir(path): - sys_exit("ERROR: %r is not a directory" % path) + sys.exit("ERROR: %r is not a directory" % path) bin = "%s/bin/promoter_%s" % (path, platform) if not os.path.isfile(bin): - sys_exit("ERROR: Missing promoter binary %r" % bin) + sys.exit("ERROR: Missing promoter binary %r" % bin) return path, bin + def make_tabular(raw_handle, out_handle): """Parse text output into tabular, return query count.""" identifier = None queries = 0 for line in raw_handle: - #print repr(line) + # print repr(line) if not line.strip() or line == "Promoter prediction:\n": pass elif line[0] != " ": - identifier = line.strip().replace("\t", " ").split(None,1)[0] + identifier = line.strip().replace("\t", " ").split(None, 1)[0] queries += 1 elif line == " No promoter predicted\n": - #End of a record + # End of a record identifier = None elif line == " Position Score Likelihood\n": assert identifier else: try: - position, score, likelihood = line.strip().split(None,2) + position, score, likelihood = line.strip().split(None, 2) except ValueError: print "WARNING: Problem with line: %r" % line continue - #sys_exit("ERROR: Problem with line: %r" % line) + # sys.exit("ERROR: Problem with line: %r" % line) if likelihood not in ["ignored", "Marginal prediction", "Medium likely prediction", "Highly likely prediction"]: - sys_exit("ERROR: Problem with line: %r" % line) + sys.exit("ERROR: Problem with line: %r" % line) out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) return queries - + working_dir, bin = get_path_and_binary() if not os.path.isfile(fasta_file): - sys_exit("ERROR: Missing input FASTA file %r" % fasta_file) + sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) -#Note that if the input FASTA file contains no sequences, -#split_fasta returns an empty list (i.e. zero temp files). -#We deliberately omit the FASTA descriptions to avoid a -#bug in promoter2 with descriptions over 200 characters. +# Note that if the input FASTA file contains no sequences, +# split_fasta returns an empty list (i.e. zero temp files). +# We deliberately omit the FASTA descriptions to avoid a +# bug in promoter2 with descriptions over 200 characters. fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] jobs = ["%s %s > %s" % (bin, fasta, temp) for fasta, temp in zip(fasta_files, temp_files)] + def clean_up(file_list): for f in file_list: if os.path.isfile(f): os.remove(f) try: os.rmdir(tmp_dir) - except: + except Exception: pass if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) cur_dir = os.path.abspath(os.curdir) os.chdir(working_dir) @@ -136,7 +139,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results @@ -151,7 +154,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - sys_exit("No output from promoter2") + sys.exit("No output from promoter2") queries += count out_handle.close() |
b |
diff -r eb6ac44d4b8e -r f3ecd80850e2 tools/protein_analysis/psortb.py --- a/tools/protein_analysis/psortb.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/psortb.py Wed Feb 01 09:46:42 2017 -0500 |
[ |
@@ -24,7 +24,7 @@ import sys import os import tempfile -from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count +from seq_analysis_utils import split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 @@ -33,7 +33,7 @@ sys.exit(os.system("psort --version")) if len(sys.argv) != 8: - sys_exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " + sys.exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " "output (e.g. terse/normal/long), cutoff, divergent, input protein " "FASTA file & output tabular file") @@ -56,10 +56,10 @@ if out_type == "terse": header = ['SeqID', 'Localization', 'Score'] elif out_type == "normal": - sys_exit("Normal output not implemented yet, sorry.") + sys.exit("Normal output not implemented yet, sorry.") elif out_type == "long": if org_type == "-n": - #Gram negative bacteria + # Gram negative bacteria header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', @@ -71,7 +71,7 @@ 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 'Secondary_Localization', 'PSortb_Version'] elif org_type == "-p": - #Gram positive bacteria + # Gram positive bacteria header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', @@ -82,7 +82,7 @@ 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 'Secondary_Localization', 'PSortb_Version'] elif org_type == "-a": - #Archaea + # Archaea header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', @@ -93,27 +93,28 @@ 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 'Secondary_Localization', 'PSortb_Version'] else: - sys_exit("Expected -n, -p or -a for the organism type, not %r" % org_type) + sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type) else: - sys_exit("Expected terse, normal or long for the output type, not %r" % out_type) + sys.exit("Expected terse, normal or long for the output type, not %r" % out_type) tmp_dir = tempfile.mkdtemp() + def clean_tabular(raw_handle, out_handle): """Clean up tabular TMHMM output, returns output line count.""" global header count = 0 for line in raw_handle: if not line.strip() or line.startswith("#"): - #Ignore any blank lines or comment lines + # Ignore any blank lines or comment lines continue parts = [x.strip() for x in line.rstrip("\r\n").split("\t")] if parts == header: - #Ignore the header line + # Ignore the header line continue if not parts[-1] and len(parts) == len(header) + 1: - #Ignore dummy blank extra column, e.g. - #"...2.0\t\tPSORTb version 3.0\t\n" + # Ignore dummy blank extra column, e.g. + # "...2.0\t\tPSORTb version 3.0\t\n" parts = parts[:-1] assert len(parts) == len(header), \ "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) @@ -121,24 +122,25 @@ count += 1 return count -#Note that if the input FASTA file contains no sequences, -#split_fasta returns an empty list (i.e. zero temp files). +# Note that if the input FASTA file contains no sequences, +# split_fasta returns an empty list (i.e. zero temp files). fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) for fasta, temp in zip(fasta_files, temp_files)] + def clean_up(file_list): for f in file_list: if os.path.isfile(f): os.remove(f) try: os.rmdir(tmp_dir) - except: + except Exception: pass if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): @@ -149,7 +151,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results del jobs @@ -163,7 +165,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - sys_exit("No output from psortb") + sys.exit("No output from psortb") out_handle.close() print "%i records" % count |
b |
diff -r eb6ac44d4b8e -r f3ecd80850e2 tools/protein_analysis/rxlr_motifs.py --- a/tools/protein_analysis/rxlr_motifs.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/rxlr_motifs.py Wed Feb 01 09:46:42 2017 -0500 |
[ |
@@ -40,14 +40,14 @@ import sys import re import subprocess -from seq_analysis_utils import sys_exit, fasta_iterator +from seq_analysis_utils import fasta_iterator if "-v" in sys.argv: print("RXLR Motifs v0.0.10") sys.exit(0) if len(sys.argv) != 5: - sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") + sys.exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") fasta_file, threads, model, tabular_file = sys.argv[1:] hmm_output_file = tabular_file + ".hmm.tmp" @@ -86,8 +86,8 @@ min_rxlr_start = 1 max_rxlr_start = max_sp + max_sp_rxlr else: - sys_exit("Did not recognise the model name %r\n" - "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) + sys.exit("Did not recognise the model name %r\n" + "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) def get_hmmer_version(exe, required=None): @@ -105,23 +105,23 @@ return 3 else: raise ValueError("Could not determine version of %s" % exe) - + -#Run hmmsearch for Whisson et al. (2007) +# Run hmmsearch for Whisson et al. (2007) if model == "Whisson2007": hmm_file = os.path.join(os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm") if not os.path.isfile(hmm_file): - sys_exit("Missing HMM file for Whisson et al. (2007)") + sys.exit("Missing HMM file for Whisson et al. (2007)") if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): - sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) + sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) hmm_hits = set() valid_ids = set() for title, seq in fasta_iterator(fasta_file): - name = title.split(None,1)[0] + name = title.split(None, 1)[0] if name in valid_ids: - sys_exit("Duplicated identifier %r" % name) + sys.exit("Duplicated identifier %r" % name) else: valid_ids.add(name) if not valid_ids: @@ -146,7 +146,7 @@ % (hmmer_search, hmm_file, fasta_file, hmm_output_file) return_code = os.system(cmd) if return_code: - sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) + sys.exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) handle = open(hmm_output_file) for line in handle: @@ -157,18 +157,18 @@ # Header continue else: - name = line.split(None,1)[0] - #Should be a sequence name in the HMMER3 table output. - #Could be anything in the HMMER2 stdout. + name = line.split(None, 1)[0] + # Should be a sequence name in the HMMER3 table output. + # Could be anything in the HMMER2 stdout. if name in valid_ids: hmm_hits.add(name) elif hmmer3: - sys_exit("Unexpected identifer %r in hmmsearch output" % name) + sys.exit("Unexpected identifer %r in hmmsearch output" % name) handle.close() # if hmmer3: # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) # else: - # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) + # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) os.remove(hmm_output_file) del valid_ids @@ -181,8 +181,8 @@ handle = open(signalp_input_file, "w") for title, seq in fasta_iterator(fasta_file): total += 1 - name = title.split(None,1)[0] - match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) + name = title.split(None, 1)[0] + match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: # This is a potential RXLR, depending on the SignalP results. # Might as well truncate the sequence now, makes the temp file smaller @@ -199,11 +199,11 @@ # Run SignalP (using our wrapper script to get multi-core support etc) signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") if not os.path.isfile(signalp_script): - sys_exit("Error - missing signalp3.py script") + sys.exit("Error - missing signalp3.py script") cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) return_code = os.system(cmd) if return_code: - sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd)) + sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd)) # print "SignalP done" @@ -217,8 +217,8 @@ assert line.startswith("#ID\t"), line for line in handle: parts = line.rstrip("\t").split("\t") - assert len(parts)==20, repr(line) - yield parts[0], float(parts[18]), int(parts[5])-1 + assert len(parts) == 20, repr(line) + yield parts[0], float(parts[18]), int(parts[5]) - 1 handle.close() @@ -231,12 +231,12 @@ for title, seq in fasta_iterator(fasta_file): total += 1 rxlr = "N" - name = title.split(None,1)[0] - match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) + name = title.split(None, 1)[0] + match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: del match # This was the criteria for calling SignalP, - #so it will be in the SignalP results. + # so it will be in the SignalP results. sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() assert name == sp_id, "%s vs %s" % (name, sp_id) if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: |
b |
diff -r eb6ac44d4b8e -r f3ecd80850e2 tools/protein_analysis/seq_analysis_utils.py --- a/tools/protein_analysis/seq_analysis_utils.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/seq_analysis_utils.py Wed Feb 01 09:46:42 2017 -0500 |
[ |
@@ -12,17 +12,12 @@ import subprocess from time import sleep -__version__ = "0.0.1" - -def sys_exit(msg, error_level=1): - """Print error message to stdout and quit with given error level.""" - sys.stderr.write("%s\n" % msg) - sys.exit(error_level) +__version__ = "0.0.2" try: from multiprocessing import cpu_count except ImportError: - #Must be under Python 2.5, this is copied from multiprocessing: + # Must be under Python 2.5, this is copied from multiprocessing: def cpu_count(): """Returns the number of CPUs in the system.""" if sys.platform == 'win32': @@ -54,18 +49,18 @@ def thread_count(command_line_arg, default=1): try: num = int(command_line_arg) - except: + except ValueError: num = default if num < 1: - sys_exit("Threads argument %r is not a positive integer" % command_line_arg) - #Cap this with the pysical limit of the machine, + sys.exit("Threads argument %r is not a positive integer" % command_line_arg) + # Cap this with the pysical limit of the machine, try: num = min(num, cpu_count()) except NotImplementedError: pass - #For debugging, - #hostname = os.environ.get("HOSTNAME", "this machine") - #sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) + # For debugging, + # hostname = os.environ.get("HOSTNAME", "this machine") + # sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) return num @@ -79,7 +74,7 @@ if truncate: seq = seq[:truncate] if max_len and len(seq) > max_len: - raise ValueError("Sequence %s is length %i, max length %i" \ + raise ValueError("Sequence %s is length %i, max length %i" % (title.split()[0], len(seq), max_len)) yield title, seq title = line[1:].rstrip() @@ -87,8 +82,8 @@ elif title: seq += line.strip() elif not line.strip() or line.startswith("#"): - #Ignore blank lines, and any comment lines - #between records (starting with hash). + # Ignore blank lines, and any comment lines + # between records (starting with hash). pass else: handle.close() @@ -98,11 +93,12 @@ if truncate: seq = seq[:truncate] if max_len and len(seq) > max_len: - raise ValueError("Sequence %s is length %i, max length %i" \ + raise ValueError("Sequence %s is length %i, max length %i" % (title.split()[0], len(seq), max_len)) yield title, seq raise StopIteration + def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): """Split FASTA file into sub-files each of at most n sequences. @@ -132,20 +128,20 @@ for title, seq in records: handle.write(">%s\n" % title) for i in range(0, len(seq), 60): - handle.write(seq[i:i+60] + "\n") + handle.write(seq[i:i + 60] + "\n") else: for title, seq in records: handle.write(">%s\n" % title.split()[0]) for i in range(0, len(seq), 60): - handle.write(seq[i:i+60] + "\n") + handle.write(seq[i:i + 60] + "\n") handle.close() files.append(new_filename) - #print "%i records in %s" % (len(records), new_filename) + # print "%i records in %s" % (len(records), new_filename) except ValueError, err: - #Max length failure from parser - clean up + # Max length failure from parser - clean up try: handle.close() - except: + except Exception: pass for f in files: if os.path.isfile(f): @@ -155,35 +151,36 @@ assert os.path.isfile(f), "Missing split file %r (!??)" % f return files + def run_jobs(jobs, threads, pause=10, verbose=False): """Takes list of cmd strings, returns dict with error levels.""" pending = jobs[:] running = [] results = {} if threads == 1: - #Special case this for speed, don't need the waits + # Special case this for speed, don't need the waits for cmd in jobs: results[cmd] = subprocess.call(cmd, shell=True) return results while pending or running: - #See if any have finished + # See if any have finished for (cmd, process) in running: - return_code = process.poll() #non-blocking + return_code = process.poll() # non-blocking if return_code is not None: results[cmd] = return_code - running = [(cmd, process) for (cmd, process) in running \ + running = [(cmd, process) for (cmd, process) in running if cmd not in results] if verbose: print "%i jobs pending, %i running, %i completed" \ % (len(pending), len(running), len(results)) - #See if we can start any new threads + # See if we can start any new threads while pending and len(running) < threads: cmd = pending.pop(0) if verbose: print cmd process = subprocess.Popen(cmd, shell=True) running.append((cmd, process)) - #Loop... + # Loop... sleep(pause) if verbose: print "%i jobs completed" % len(results) |
b |
diff -r eb6ac44d4b8e -r f3ecd80850e2 tools/protein_analysis/signalp3.py --- a/tools/protein_analysis/signalp3.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/signalp3.py Wed Feb 01 09:46:42 2017 -0500 |
[ |
b'@@ -21,9 +21,9 @@\n \n # SignalP-NN euk predictions \t # SignalP-HMM euk predictions\n # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? \t# name ! Cmax pos ? Sprob ?\n-gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N\tgi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N \n-gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N\tgi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N \n-gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N\tgi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N \n+gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N\tgi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N\n+gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N\tgi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N\n+gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N\tgi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N\n gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N\tgi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N\n \n In order to make it easier to use in Galaxy, this wrapper script reformats\n@@ -56,28 +56,28 @@\n import sys\n import os\n import tempfile\n-from seq_analysis_utils import sys_exit, split_fasta, fasta_iterator\n+from seq_analysis_utils import split_fasta, fasta_iterator\n from seq_analysis_utils import run_jobs, thread_count\n \n FASTA_CHUNK = 500\n-MAX_LEN = 6000 #Found by trial and error\n+MAX_LEN = 6000 # Found by trial and error\n \n-if len(sys.argv) not in [6,8]:\n- sys_exit("Require five (or 7) arguments, organism, truncate, threads, "\n+if len(sys.argv) not in [6, 8]:\n+ sys.exit("Require five (or 7) arguments, organism, truncate, threads, "\n "input protein FASTA file & output tabular file (plus "\n "optionally cut method and GFF3 output file). "\n- "Got %i arguments." % (len(sys.argv)-1))\n+ "Got %i arguments." % (len(sys.argv) - 1))\n \n organism = sys.argv[1]\n if organism not in ["euk", "gram+", "gram-"]:\n- sys_exit("Organism argument %s is not one of euk, gram+ or gram-" % organism)\n+ sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism)\n \n try:\n truncate = int(sys.argv[2])\n-except:\n+except ValueError:\n truncate = 0\n if truncate < 0:\n- sys_exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])\n+ sys.exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])\n \n num_threads = thread_count(sys.argv[3], default=4)\n fasta_file = sys.argv[4]\n@@ -86,7 +86,7 @@\n if len(sys.argv) == 8:\n cut_method = sys.argv[6]\n if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:\n- sys_exit("Invalid cut method %r" % cut_method)\n+ sys.exit("Invalid cut method %r" % cut_method)\n gff3_file = sys.argv[7]\n else:\n cut_method = None\n@@ -95,39 +95,41 @@\n \n tmp_dir = tempfile.mkdtemp()\n \n+\n def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):\n """Clean up SignalP output to make it tabular."""\n if cut_method:\n- cut_col = {"NN_Cmax" : 2,\n- "NN_Ymax" : 5,\n- "NN_Smax" : 8,\n- "HMM_Cmax" : 16}[cut_method]\n+ cut_col = {"NN_Cmax": 2,\n+ "NN_Ymax": 5,\n+ "NN_Smax": 8,\n+ "HMM_Cmax": 16}[cut_method]\n else:\n cut_col = None\n for line in raw_handle:\n if not line or line.startswith("#"):\n continue\n parts = line.rstrip("\\r\\n").split()\n- assert len(parts)==21, repr(line)\n+ assert len(parts) == 21, repr(line)\n assert parts[14].startswith(parts[0]), \\\n "Bad entry in SignalP output, ID miss-match:\\n%r" % line\n- #Remove redundant truncated name column (col 0)\n- #and put full name at start (col 14)\n+ '..b'le to have a zero length reference in GFF3?\n+ if not seq:\n+ # Is it possible to have a zero length reference in GFF3?\n continue\n cut = int(parts[cut_col])\n if cut == 0:\n assert cut_method == "HMM_Cmax", cut_method\n- #TODO - Why does it do this?\n+ # TODO - Why does it do this?\n cut = 1\n assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))\n score = parts[score_col]\n- gff_handle.write("##sequence-region %s %i %i\\n" \\\n+ gff_handle.write("##sequence-region %s %i %i\\n"\n % (seqid, 1, len(seq)))\n- #If the cut is at the very begining, there is no signal peptide!\n+ # If the cut is at the very begining, there is no signal peptide!\n if cut > 1:\n- #signal_peptide = SO:0000418\n- gff_handle.write("%s\\t%s\\t%s\\t%i\\t%i\\t%s\\t%s\\t%s\\t%s\\n" \\\n+ # signal_peptide = SO:0000418\n+ gff_handle.write("%s\\t%s\\t%s\\t%i\\t%i\\t%s\\t%s\\t%s\\t%s\\n"\n % (seqid, source,\n- "signal_peptide", 1, cut-1,\n+ "signal_peptide", 1, cut - 1,\n score, strand, phase, tags))\n- #mature_protein_region = SO:0000419\n- gff_handle.write("%s\\t%s\\t%s\\t%i\\t%i\\t%s\\t%s\\t%s\\t%s\\n" \\\n+ # mature_protein_region = SO:0000419\n+ gff_handle.write("%s\\t%s\\t%s\\t%i\\t%i\\t%s\\t%s\\t%s\\t%s\\n"\n % (seqid, source,\n "mature_protein_region", cut, len(seq),\n score, strand, phase, tags))\n- tab_handle.close()\n+ tab_handle.close()\n gff_handle.close()\n \n \n fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),\n n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)\n-temp_files = [f+".out" for f in fasta_files]\n+temp_files = [f + ".out" for f in fasta_files]\n assert len(fasta_files) == len(temp_files)\n jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)\n for (fasta, temp) in zip(fasta_files, temp_files)]\n assert len(fasta_files) == len(temp_files) == len(jobs)\n \n+\n def clean_up(file_list):\n+ """Remove temp files, and if possible the temp directory."""\n for f in file_list:\n if os.path.isfile(f):\n os.remove(f)\n try:\n os.rmdir(tmp_dir)\n- except:\n+ except Exception:\n pass\n \n if len(jobs) > 1 and num_threads > 1:\n- #A small "info" message for Galaxy to show the user.\n+ # A small "info" message for Galaxy to show the user.\n print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))\n results = run_jobs(jobs, num_threads)\n assert len(fasta_files) == len(temp_files) == len(jobs)\n@@ -197,17 +201,17 @@\n output = "(no output)"\n if error_level or output.lower().startswith("error running"):\n clean_up(fasta_files + temp_files)\n- sys_exit("One or more tasks failed, e.g. %i from %r gave:\\n%s" % (error_level, cmd, output),\n+ sys.exit("One or more tasks failed, e.g. %i from %r gave:\\n%s" % (error_level, cmd, output),\n error_level)\n del results\n \n out_handle = open(tabular_file, "w")\n fields = ["ID"]\n-#NN results:\n+# NN results:\n for name in ["Cmax", "Ymax", "Smax"]:\n- fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name])\n+ fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name])\n fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"])\n-#HMM results:\n+# HMM results:\n fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred",\n "HMM_Sprob_score", "HMM_Sprob_pred"])\n out_handle.write("#" + "\\t".join(fields) + "\\n")\n@@ -217,7 +221,7 @@\n data_handle.close()\n out_handle.close()\n \n-#GFF3:\n+# GFF3:\n if cut_method:\n make_gff(fasta_file, tabular_file, gff3_file, cut_method)\n \n' |
b |
diff -r eb6ac44d4b8e -r f3ecd80850e2 tools/protein_analysis/tmhmm2.py --- a/tools/protein_analysis/tmhmm2.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/tmhmm2.py Wed Feb 01 09:46:42 2017 -0500 |
[ |
@@ -21,7 +21,7 @@ this to remove the redundant tags, and instead adds a comment line at the top with the column names: - #ID len ExpAA First60 PredHel Topology + #ID len ExpAA First60 PredHel Topology gi|2781234|pdb|1JLY|B 304 0.01 60 0.00 0 o gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0 0.00 0 o gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o @@ -43,12 +43,12 @@ import sys import os import tempfile -from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count +from seq_analysis_utils import split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: - sys_exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") + sys.exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") num_threads = thread_count(sys.argv[1], default=4) fasta_file = sys.argv[2] @@ -56,52 +56,55 @@ tmp_dir = tempfile.mkdtemp() + def clean_tabular(raw_handle, out_handle): """Clean up tabular TMHMM output, returns output line count.""" count = 0 for line in raw_handle: if not line.strip() or line.startswith("#"): - #Ignore any blank lines or comment lines + # Ignore any blank lines or comment lines continue parts = line.rstrip("\r\n").split("\t") try: - identifier, length, expAA, first60, predhel, topology = parts - except: - assert len(parts)!=6 - sys_exit("Bad line: %r" % line) + identifier, length, exp_aa, first60, predhel, topology = parts + except ValueError: + assert len(parts) != 6 + sys.exit("Bad line: %r" % line) assert length.startswith("len="), line length = length[4:] - assert expAA.startswith("ExpAA="), line - expAA = expAA[6:] + assert exp_aa.startswith("ExpAA="), line + exp_aa = exp_aa[6:] assert first60.startswith("First60="), line first60 = first60[8:] assert predhel.startswith("PredHel="), line predhel = predhel[8:] assert topology.startswith("Topology="), line topology = topology[9:] - out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \ - % (identifier, length, expAA, first60, predhel, topology)) + out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" + % (identifier, length, exp_aa, first60, predhel, topology)) count += 1 return count -#Note that if the input FASTA file contains no sequences, -#split_fasta returns an empty list (i.e. zero temp files). +# Note that if the input FASTA file contains no sequences, +# split_fasta returns an empty list (i.e. zero temp files). fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] jobs = ["tmhmm -short %s > %s" % (fasta, temp) for fasta, temp in zip(fasta_files, temp_files)] + def clean_up(file_list): + """Remove temp files, and if possible the temp directory.""" for f in file_list: if os.path.isfile(f): os.remove(f) try: os.rmdir(tmp_dir) - except: + except Exception: pass if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): @@ -112,7 +115,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results del jobs @@ -125,7 +128,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - sys_exit("No output from tmhmm2") + sys.exit("No output from tmhmm2") out_handle.close() clean_up(fasta_files + temp_files) |
b |
diff -r eb6ac44d4b8e -r f3ecd80850e2 tools/protein_analysis/wolf_psort.py --- a/tools/protein_analysis/wolf_psort.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/wolf_psort.py Wed Feb 01 09:46:42 2017 -0500 |
[ |
@@ -35,7 +35,7 @@ """ import sys import os -from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count +from seq_analysis_utils import split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 exe = "runWolfPsortSummary" @@ -56,44 +56,49 @@ return_code = subprocess.call(args) os.chdir(saved_dir) sys.exit(return_code) + +For more details on this workaround, see: +https://lists.galaxyproject.org/pipermail/galaxy-dev/2015-December/023386.html """ if len(sys.argv) != 5: - sys_exit("Require four arguments, organism, threads, input protein FASTA file & output tabular file") + sys.exit("Require four arguments, organism, threads, input protein FASTA file & output tabular file") organism = sys.argv[1] if organism not in ["animal", "plant", "fungi"]: - sys_exit("Organism argument %s is not one of animal, plant, fungi" % organism) + sys.exit("Organism argument %s is not one of animal, plant, fungi" % organism) num_threads = thread_count(sys.argv[2], default=4) fasta_file = sys.argv[3] tabular_file = sys.argv[4] + def clean_tabular(raw_handle, out_handle): """Clean up WoLF PSORT output to make it tabular.""" for line in raw_handle: if not line or line.startswith("#"): continue - name, data = line.rstrip("\r\n").split(None,1) + name, data = line.rstrip("\r\n").split(None, 1) for rank, comp_data in enumerate(data.split(",")): comp, score = comp_data.split() - out_handle.write("%s\t%s\t%s\t%i\n" \ - % (name, comp, score, rank+1)) + out_handle.write("%s\t%s\t%s\t%i\n" + % (name, comp, score, rank + 1)) fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) jobs = ["%s %s < %s > %s" % (exe, organism, fasta, temp) for (fasta, temp) in zip(fasta_files, temp_files)] assert len(fasta_files) == len(temp_files) == len(jobs) + def clean_up(file_list): for f in file_list: if os.path.isfile(f): os.remove(f) if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) assert len(fasta_files) == len(temp_files) == len(jobs) @@ -106,7 +111,7 @@ if error_level or output.lower().startswith("error running"): clean_up(fasta_files) clean_up(temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results |