Mercurial > repos > peterjc > tmhmm_and_signalp
changeset 21:238eae32483c draft
"Check this is up to date with all 2020 changes (black etc)"
author | peterjc |
---|---|
date | Thu, 17 Jun 2021 08:21:06 +0000 |
parents | a19b3ded8f33 |
children | e1afa4b0b682 |
files | tools/protein_analysis/README.rst tools/protein_analysis/promoter2.py tools/protein_analysis/promoter2.xml tools/protein_analysis/psortb.py tools/protein_analysis/psortb.xml tools/protein_analysis/rxlr_motifs.py tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.py tools/protein_analysis/signalp3.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/wolf_psort.xml |
diffstat | 14 files changed, 377 insertions(+), 175 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/protein_analysis/README.rst Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/README.rst Thu Jun 17 08:21:06 2021 +0000 @@ -31,7 +31,7 @@ Peter Cock, Bjoern Gruening, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 Full reference information is included in the help text for each tool. @@ -192,6 +192,9 @@ - Record SignalP version via ``<version_command>``. - Internal job splitter will skip starting any pending jobs after a child job fails (so the entire task will error out more quickly). +v0.2.12 - Fix reporting of HMMER error in RXLR tool. + - Quote command line arguments when RXLR tool calls SignalP. + - Reduce number of temp files if SignalP is run with one thread. ======= ======================================================================
--- a/tools/protein_analysis/promoter2.py Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/promoter2.py Thu Jun 17 08:21:06 2021 +0000 @@ -41,8 +41,10 @@ sys.exit(os.system("promoter -V")) if len(sys.argv) != 4: - sys.exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " - "Got %i arguments." % (len(sys.argv) - 1)) + sys.exit( + "Require three arguments, number of threads (int), input DNA FASTA " + "file & output tabular file. Got %i arguments." % (len(sys.argv) - 1) + ) num_threads = thread_count(sys.argv[3], default=4) fasta_file = os.path.abspath(sys.argv[2]) @@ -96,12 +98,16 @@ print("WARNING: Problem with line: %r" % line) continue # sys.exit("ERROR: Problem with line: %r" % line) - if likelihood not in ["ignored", - "Marginal prediction", - "Medium likely prediction", - "Highly likely prediction"]: + if likelihood not in [ + "ignored", + "Marginal prediction", + "Medium likely prediction", + "Highly likely prediction", + ]: sys.exit("ERROR: Problem with line: %r" % line) - out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) + out_handle.write( + "%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood) + ) return queries @@ -114,10 +120,13 @@ # split_fasta returns an empty list (i.e. zero temp files). # We deliberately omit the FASTA descriptions to avoid a # bug in promoter2 with descriptions over 200 characters. -fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) +fasta_files = split_fasta( + fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False +) temp_files = [f + ".out" for f in fasta_files] -jobs = ["%s %s > %s" % (bin, fasta, temp) - for fasta, temp in zip(fasta_files, temp_files)] +jobs = [ + "%s %s > %s" % (bin, fasta, temp) for fasta, temp in zip(fasta_files, temp_files) +] def clean_up(file_list): @@ -145,8 +154,11 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), - error_level) + sys.exit( + "One or more tasks failed, e.g. %i from %r gave:\n%s" + % (error_level, cmd, output), + error_level, + ) del results del jobs
--- a/tools/protein_analysis/promoter2.xml Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/promoter2.xml Thu Jun 17 08:21:06 2021 +0000 @@ -66,12 +66,12 @@ Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 Steen Knudsen (1999). Promoter2.0: for the recognition of PolII promoter sequences. Bioinformatics, 15:356-61. -http://dx.doi.org/10.1093/bioinformatics/15.5.356 +https://doi.org/10.1093/bioinformatics/15.5.356 See also http://www.cbs.dtu.dk/services/Promoter/output.php
--- a/tools/protein_analysis/psortb.py Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/psortb.py Thu Jun 17 08:21:06 2021 +0000 @@ -37,9 +37,11 @@ sys.exit(os.system("psort --version")) if len(sys.argv) != 8: - sys.exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " - "output (e.g. terse/normal/long), cutoff, divergent, input protein " - "FASTA file & output tabular file") + sys.exit( + "Require 7 arguments, number of threads (int), type (e.g. archaea), " + "output (e.g. terse/normal/long), cutoff, divergent, input protein " + "FASTA file & output tabular file" + ) num_threads = thread_count(sys.argv[1], default=4) org_type = sys.argv[2] @@ -58,47 +60,117 @@ tabular_file = sys.argv[7] if out_type == "terse": - header = ['SeqID', 'Localization', 'Score'] + header = ["SeqID", "Localization", "Score"] elif out_type == "normal": sys.exit("Normal output not implemented yet, sorry.") elif out_type == "long": if org_type == "-n": # Gram negative bacteria - header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', - 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', - 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', - 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', - 'Profile-_Localization', 'Profile-_Details', - 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', - 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details', - 'Signal-_Localization', 'Signal-_Details', - 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', - 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', - 'Secondary_Localization', 'PSortb_Version'] + header = [ + "SeqID", + "CMSVM-_Localization", + "CMSVM-_Details", + "CytoSVM-_Localization", + "CytoSVM-_Details", + "ECSVM-_Localization", + "ECSVM-_Details", + "ModHMM-_Localization", + "ModHMM-_Details", + "Motif-_Localization", + "Motif-_Details", + "OMPMotif-_Localization", + "OMPMotif-_Details", + "OMSVM-_Localization", + "OMSVM-_Details", + "PPSVM-_Localization", + "PPSVM-_Details", + "Profile-_Localization", + "Profile-_Details", + "SCL-BLAST-_Localization", + "SCL-BLAST-_Details", + "SCL-BLASTe-_Localization", + "SCL-BLASTe-_Details", + "Signal-_Localization", + "Signal-_Details", + "Cytoplasmic_Score", + "CytoplasmicMembrane_Score", + "Periplasmic_Score", + "OuterMembrane_Score", + "Extracellular_Score", + "Final_Localization", + "Final_Localization_Details", + "Final_Score", + "Secondary_Localization", + "PSortb_Version", + ] elif org_type == "-p": # Gram positive bacteria - header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', - 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', - 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', - 'Profile+_Localization', 'Profile+_Details', - 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', - 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', - 'Signal+_Localization', 'Signal+_Details', - 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', - 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', - 'Secondary_Localization', 'PSortb_Version'] + header = [ + "SeqID", + "CMSVM+_Localization", + "CMSVM+_Details", + "CWSVM+_Localization", + "CWSVM+_Details", + "CytoSVM+_Localization", + "CytoSVM+_Details", + "ECSVM+_Localization", + "ECSVM+_Details", + "ModHMM+_Localization", + "ModHMM+_Details", + "Motif+_Localization", + "Motif+_Details", + "Profile+_Localization", + "Profile+_Details", + "SCL-BLAST+_Localization", + "SCL-BLAST+_Details", + "SCL-BLASTe+_Localization", + "SCL-BLASTe+_Details", + "Signal+_Localization", + "Signal+_Details", + "Cytoplasmic_Score", + "CytoplasmicMembrane_Score", + "Cellwall_Score", + "Extracellular_Score", + "Final_Localization", + "Final_Localization_Details", + "Final_Score", + "Secondary_Localization", + "PSortb_Version", + ] elif org_type == "-a": # Archaea - header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', - 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', - 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', - 'Profile_a_Localization', 'Profile_a_Details', - 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', - 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', - 'Signal_a_Localization', 'Signal_a_Details', - 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', - 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', - 'Secondary_Localization', 'PSortb_Version'] + header = [ + "SeqID", + "CMSVM_a_Localization", + "CMSVM_a_Details", + "CWSVM_a_Localization", + "CWSVM_a_Details", + "CytoSVM_a_Localization", + "CytoSVM_a_Details", + "ECSVM_a_Localization", + "ECSVM_a_Details", + "ModHMM_a_Localization", + "ModHMM_a_Details", + "Motif_a_Localization", + "Motif_a_Details", + "Profile_a_Localization", + "Profile_a_Details", + "SCL-BLAST_a_Localization", + "SCL-BLAST_a_Details", + "SCL-BLASTe_a_Localization", + "SCL-BLASTe_a_Details", + "Signal_a_Localization", + "Signal_a_Details", + "Cytoplasmic_Score", + "CytoplasmicMembrane_Score", + "Cellwall_Score", + "Extracellular_Score", + "Final_Localization", + "Final_Localization_Details", + "Final_Score", + "Secondary_Localization", + "PSortb_Version", + ] else: sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type) else: @@ -123,8 +195,11 @@ # Ignore dummy blank extra column, e.g. # "...2.0\t\tPSORTb version 3.0\t\n" parts = parts[:-1] - assert len(parts) == len(header), \ - "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) + assert len(parts) == len(header), "%i fields, not %i, in line:\n%r" % ( + len(line), + len(header), + line, + ) out_handle.write(line) count += 1 return count @@ -134,8 +209,11 @@ # split_fasta returns an empty list (i.e. zero temp files). fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) temp_files = [f + ".out" for f in fasta_files] -jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) - for fasta, temp in zip(fasta_files, temp_files)] +jobs = [ + "psort %s %s %s -o %s %s > %s" + % (org_type, cutoff, divergent, out_type, fasta, temp) + for fasta, temp in zip(fasta_files, temp_files) +] def clean_up(file_list): @@ -160,8 +238,11 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), - error_level) + sys.exit( + "One or more tasks failed, e.g. %i from %r gave:\n%s" + % (error_level, cmd, output), + error_level, + ) del results del jobs
--- a/tools/protein_analysis/psortb.xml Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/psortb.xml Thu Jun 17 08:21:06 2021 +0000 @@ -81,14 +81,14 @@ Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 N.Y. Yu, J.R. Wagner, M.R. Laird, G. Melli, S. Rey, R. Lo, P. Dao, S.C. Sahinalp, M. Ester, L.J. Foster, F.S.L. Brinkman (2010) PSORTb 3.0: Improved protein subcellular localization prediction with refined localization subcategories and predictive capabilities for all prokaryotes, Bioinformatics 26(13):1608-1615 -http://dx.doi.org/10.1093/bioinformatics/btq249 +https://doi.org/10.1093/bioinformatics/btq249 See also http://www.psort.org/documentation/index.html
--- a/tools/protein_analysis/rxlr_motifs.py Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/rxlr_motifs.py Thu Jun 17 08:21:06 2021 +0000 @@ -1,5 +1,5 @@ #!/usr/bin/env python -"""Implements assorted RXLR motif methods from the literature +"""Implements assorted RXLR motif methods from the literature. This script takes exactly four command line arguments: * Protein FASTA filename @@ -8,21 +8,18 @@ * Output tabular filename The model names are: - -Bhattacharjee2006: Simple regular expression search for RXLR -with additional requirements for positioning and signal peptide. - -Win2007: Simple regular expression search for RXLR, but with -different positional requirements. - -Whisson2007: As Bhattacharjee2006 but with a more complex regular -expression to look for RXLR-EER domain, and additionally calls HMMER. + * Bhattacharjee2006: Simple regular expression search for RXLR + with additional requirements for positioning and signal peptide. + * Win2007: Simple regular expression search for RXLR, but with + different positional requirements. + * Whisson2007: As Bhattacharjee2006 but with a more complex regular + expression to look for RXLR-EER domain, and additionally calls HMMER. See the help text in the accompanying Galaxy tool XML file for more details including the full references. -Note: - +Note +---- Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0, which is no longer available. The current release is SignalP v3.0 (Mar 5, 2007). We have therefore opted to use the NN Ymax position for @@ -55,11 +52,14 @@ from seq_analysis_utils import fasta_iterator if "-v" in sys.argv: - print("RXLR Motifs v0.0.14") + print("RXLR Motifs v0.0.16") sys.exit(0) if len(sys.argv) != 5: - sys.exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") + sys.exit( + "Requires four arguments: protein FASTA filename, threads, " + "model, and output filename" + ) fasta_file, threads, model, tabular_file = sys.argv[1:] hmm_output_file = tabular_file + ".hmm.tmp" @@ -98,15 +98,20 @@ min_rxlr_start = 1 max_rxlr_start = max_sp + max_sp_rxlr else: - sys.exit("Did not recognise the model name %r\n" - "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) + sys.exit( + "Did not recognise the model name %r\n" + "Use Bhattacharjee2006, Win2007, or Whisson2007" % model + ) def get_hmmer_version(exe, required=None): try: - child = subprocess.Popen([exe, "-h"], - universal_newlines=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) + child = subprocess.Popen( + [exe, "-h"], + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) except OSError: raise ValueError("Could not run %s" % exe) stdout, stderr = child.communicate() @@ -122,8 +127,9 @@ # Run hmmsearch for Whisson et al. (2007) if model == "Whisson2007": - hmm_file = os.path.join(os.path.split(sys.argv[0])[0], - "whisson_et_al_rxlr_eer_cropped.hmm") + hmm_file = os.path.join( + os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm" + ) if not os.path.isfile(hmm_file): sys.exit("Missing HMM file for Whisson et al. (2007)") if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): @@ -143,23 +149,32 @@ else: # I've left the code to handle HMMER 3 in situ, in case # we revisit the choice to insist on HMMER 2. - hmmer3 = (3 == get_hmmer_version(hmmer_search)) + hmmer3 = 3 == get_hmmer_version(hmmer_search) # Using zero (or 5.6?) for bitscore threshold if hmmer3: # The HMMER3 table output is easy to parse # In HMMER3 can't use both -T and -E - cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ - % (hmmer_search, hmm_output_file, hmm_file, fasta_file) + cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" % ( + hmmer_search, + hmm_output_file, + hmm_file, + fasta_file, + ) else: # For HMMER2 we are stuck with parsing stdout # Put 1e6 to effectively have no expectation threshold (otherwise # HMMER defaults to 10 and the calculated e-value depends on the # input FASTA file, and we can loose hits of interest). - cmd = "%s -T 0 -E 1e6 %s %s > %s" \ - % (hmmer_search, hmm_file, fasta_file, hmm_output_file) + cmd = "%s -T 0 -E 1e6 %s %s > %s" % ( + hmmer_search, + hmm_file, + fasta_file, + hmm_output_file, + ) return_code = os.system(cmd) if return_code: - sys.exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) + sys.stderr.write("Error %i from hmmsearch:\n%s\n" % (return_code, cmd)) + sys.exit(return_code) handle = open(hmm_output_file) for line in handle: @@ -195,7 +210,7 @@ for title, seq in fasta_iterator(fasta_file): total += 1 name = title.split(None, 1)[0] - match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) + match = re_rxlr.search(seq[min_rxlr_start - 1 :].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: # This is a potential RXLR, depending on the SignalP results. # Might as well truncate the sequence now, makes the temp file smaller @@ -213,7 +228,13 @@ signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") if not os.path.isfile(signalp_script): sys.exit("Error - missing signalp3.py script") -cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) +cmd = "python '%s' 'euk' '%i' '%s' '%s' '%s'" % ( + signalp_script, + signalp_trunc, + threads, + signalp_input_file, + signalp_output_file, +) return_code = os.system(cmd) if return_code: sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd)) @@ -221,7 +242,10 @@ def parse_signalp(filename): - """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. + """Parse SignalP output, yield tuples of values. + + Returns tuples of ID, HMM_Sprob_score and NN predicted signal + peptide length. For signal peptide length we use NN_Ymax_pos (minus one). """ @@ -237,7 +261,7 @@ # Parse SignalP results and apply the strict RXLR criteria total = 0 -tally = dict() +tally = {} handle = open(tabular_file, "w") handle.write("#ID\t%s\n" % model) signalp_results = parse_signalp(signalp_output_file) @@ -245,12 +269,12 @@ total += 1 rxlr = "N" name = title.split(None, 1)[0] - match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) + match = re_rxlr.search(seq[min_rxlr_start - 1 :].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: del match # This was the criteria for calling SignalP, # so it will be in the SignalP results. - sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() + sp_id, sp_hmm_score, sp_nn_len = next(signalp_results) assert name == sp_id, "%s vs %s" % (name, sp_id) if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: match = re_rxlr.search(seq[sp_nn_len:].upper()) @@ -278,7 +302,7 @@ # Check the iterator is finished try: - signalp_results.next() + next(signalp_results) assert False, "Unexpected data in SignalP output" except StopIteration: pass @@ -289,4 +313,4 @@ # Short summary to stdout for Galaxy's info display print("%s for %i sequences:" % (model, total)) -print(", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))) +print(", ".join("%s = %i" % kv for kv in sorted(tally.items())))
--- a/tools/protein_analysis/rxlr_motifs.xml Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/rxlr_motifs.xml Thu Jun 17 08:21:06 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.14"> +<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.15"> <description>Find RXLR Effectors of Plant Pathogenic Oomycetes</description> <requirements> <!-- Need SignalP for all the models --> @@ -138,37 +138,37 @@ Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 Stephen C. Whisson, Petra C. Boevink, Lucy Moleleki, Anna O. Avrova, Juan G. Morales, Eleanor M. Gilroy, Miles R. Armstrong, Severine Grouffaud, Pieter van West, Sean Chapman, Ingo Hein, Ian K. Toth, Leighton Pritchard and Paul R. J. Birch (2007). A translocation signal for delivery of oomycete effector proteins into host plant cells. Nature 450:115-118. -http://dx.doi.org/10.1038/nature06203 +https://doi.org/10.1038/nature06203 Joe Win, William Morgan, Jorunn Bos, Ksenia V. Krasileva, Liliana M. Cano, Angela Chaparro-Garcia, Randa Ammar, Brian J. Staskawicz and Sophien Kamoun (2007). Adaptive evolution has targeted the C-terminal domain of the RXLR effectors of plant pathogenic oomycetes. The Plant Cell 19:2349-2369. -http://dx.doi.org/10.1105/tpc.107.051037 +https://doi.org/10.1105/tpc.107.051037 Souvik Bhattacharjee, N. Luisa Hiller, Konstantinos Liolios, Joe Win, Thirumala-Devi Kanneganti, Carolyn Young, Sophien Kamoun and Kasturi Haldar (2006). The malarial host-targeting signal is conserved in the Irish potato famine pathogen. PLoS Pathogens, 2(5):e50. -http://dx.doi.org/10.1371/journal.ppat.0020050 +https://doi.org/10.1371/journal.ppat.0020050 Trudy A. Torto, Shuang Li, Allison Styer, Edgar Huitema, Antonino Testa, Neil A.R. Gow, Pieter van West and Sophien Kamoun (2003). EST mining and functional expression assays identify extracellular effector proteins from the plant pathogen *phytophthora*. Genome Research, 13:1675-1685. -http://dx.doi.org/10.1101/gr.910003 +https://doi.org/10.1101/gr.910003 Sean R. Eddy (1998). Profile hidden Markov models. Bioinformatics, 14(9):755–763. -http://dx.doi.org/10.1093/bioinformatics/14.9.755 +https://doi.org/10.1093/bioinformatics/14.9.755 Nielsen, Engelbrecht, Brunak and von Heijne (1997). Identification of prokaryotic and eukaryotic signal peptides and prediction of their cleavage sites. Protein Engineering, 10:1-6. -http://dx.doi.org/10.1093/protein/10.1.1 +https://doi.org/10.1093/protein/10.1.1 This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/tmhmm_and_signalp
--- a/tools/protein_analysis/seq_analysis_utils.py Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/seq_analysis_utils.py Thu Jun 17 08:21:06 2021 +0000 @@ -16,23 +16,26 @@ from time import sleep -__version__ = "0.0.2" +if sys.version_info[0] < 3: + range = xrange # noqa: F821 + +__version__ = "0.0.4" try: from multiprocessing import cpu_count except ImportError: # Must be under Python 2.5, this is copied from multiprocessing: def cpu_count(): - """Returns the number of CPUs in the system.""" - if sys.platform == 'win32': + """Return the number of CPUs in the system.""" + if sys.platform == "win32": try: - num = int(os.environ['NUMBER_OF_PROCESSORS']) + num = int(os.environ["NUMBER_OF_PROCESSORS"]) except (ValueError, KeyError): num = 0 - elif 'bsd' in sys.platform or sys.platform == 'darwin': - comm = '/sbin/sysctl -n hw.ncpu' - if sys.platform == 'darwin': - comm = '/usr' + comm + elif "bsd" in sys.platform or sys.platform == "darwin": + comm = "/sbin/sysctl -n hw.ncpu" + if sys.platform == "darwin": + comm = "/usr" + comm try: with os.popen(comm) as p: num = int(p.read()) @@ -40,14 +43,14 @@ num = 0 else: try: - num = os.sysconf('SC_NPROCESSORS_ONLN') + num = os.sysconf("SC_NPROCESSORS_ONLN") except (ValueError, OSError, AttributeError): num = 0 if num >= 1: return num else: - raise NotImplementedError('cannot determine number of cpus') + raise NotImplementedError("cannot determine number of cpus") def thread_count(command_line_arg, default=1): @@ -70,7 +73,7 @@ def fasta_iterator(filename, max_len=None, truncate=None): - """Simple FASTA parser yielding tuples of (title, sequence) strings.""" + """Parse FASTA file yielding tuples of (name, sequence).""" handle = open(filename) title, seq = "", "" for line in handle: @@ -79,8 +82,10 @@ if truncate: seq = seq[:truncate] if max_len and len(seq) > max_len: - raise ValueError("Sequence %s is length %i, max length %i" - % (title.split()[0], len(seq), max_len)) + raise ValueError( + "Sequence %s is length %i, max length %i" + % (title.split()[0], len(seq), max_len) + ) yield title, seq title = line[1:].rstrip() seq = "" @@ -98,13 +103,22 @@ if truncate: seq = seq[:truncate] if max_len and len(seq) > max_len: - raise ValueError("Sequence %s is length %i, max length %i" - % (title.split()[0], len(seq), max_len)) + raise ValueError( + "Sequence %s is length %i, max length %i" + % (title.split()[0], len(seq), max_len) + ) yield title, seq raise StopIteration -def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): +def split_fasta( + input_filename, + output_filename_base, + n=500, + truncate=None, + keep_descr=False, + max_len=None, +): """Split FASTA file into sub-files each of at most n sequences. Returns a list of the filenames used (based on the input filename). @@ -122,7 +136,7 @@ records = [] for i in range(n): try: - records.append(iterator.next()) + records.append(next(iterator)) except StopIteration: break if not records: @@ -133,12 +147,12 @@ for title, seq in records: handle.write(">%s\n" % title) for i in range(0, len(seq), 60): - handle.write(seq[i:i + 60] + "\n") + handle.write(seq[i : i + 60] + "\n") else: for title, seq in records: handle.write(">%s\n" % title.split()[0]) for i in range(0, len(seq), 60): - handle.write(seq[i:i + 60] + "\n") + handle.write(seq[i : i + 60] + "\n") handle.close() files.append(new_filename) # print "%i records in %s" % (len(records), new_filename) @@ -158,7 +172,7 @@ def run_jobs(jobs, threads, pause=10, verbose=False, fast_fail=True): - """Takes list of cmd strings, returns dict with error levels.""" + """Take list of cmd strings, return dict with error levels.""" pending = jobs[:] running = [] results = {} @@ -177,11 +191,12 @@ results[cmd] = return_code if return_code: failed = True - running = [(cmd, process) for (cmd, process) in running - if cmd not in results] + running = [(cmd, process) for (cmd, process) in running if cmd not in results] if verbose: - print("%i jobs pending, %i running, %i completed" % - (len(pending), len(running), len(results))) + print( + "%i jobs pending, %i running, %i completed" + % (len(pending), len(running), len(results)) + ) # See if we can start any new threads if pending and failed and fast_fail: # Don't start any more jobs
--- a/tools/protein_analysis/signalp3.py Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/signalp3.py Thu Jun 17 08:21:06 2021 +0000 @@ -71,10 +71,12 @@ sys.exit(os.system("signalp -version")) if len(sys.argv) not in [6, 8]: - sys.exit("Require five (or 7) arguments, organism, truncate, threads, " - "input protein FASTA file & output tabular file (plus " - "optionally cut method and GFF3 output file). " - "Got %i arguments." % (len(sys.argv) - 1)) + sys.exit( + "Require five (or 7) arguments, organism, truncate, threads, " + "input protein FASTA file & output tabular file (plus " + "optionally cut method and GFF3 output file). " + "Got %i arguments." % (len(sys.argv) - 1) + ) organism = sys.argv[1] if organism not in ["euk", "gram+", "gram-"]: @@ -111,8 +113,9 @@ continue parts = line.rstrip("\r\n").split() assert len(parts) == 21, repr(line) - assert parts[14].startswith(parts[0]), \ + assert parts[14].startswith(parts[0]), ( "Bad entry in SignalP output, ID miss-match:\n%r" % line + ) # Remove redundant truncated name column (col 0) # and put full name at start (col 14) parts = parts[14:15] + parts[1:14] + parts[15:] @@ -121,11 +124,12 @@ def make_gff(fasta_file, tabular_file, gff_file, cut_method): """Make a GFF file.""" - cut_col, score_col = {"NN_Cmax": (2, 1), - "NN_Ymax": (5, 4), - "NN_Smax": (8, 7), - "HMM_Cmax": (16, 15), - }[cut_method] + cut_col, score_col = { + "NN_Cmax": (2, 1), + "NN_Ymax": (5, 4), + "NN_Smax": (8, 7), + "HMM_Cmax": (16, 15), + }[cut_method] source = "SignalP" strand = "." # not stranded @@ -153,30 +157,62 @@ cut = 1 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) score = parts[score_col] - gff_handle.write("##sequence-region %s %i %i\n" - % (seqid, 1, len(seq))) + gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, len(seq))) # If the cut is at the very begining, there is no signal peptide! if cut > 1: # signal_peptide = SO:0000418 - gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" - % (seqid, source, - "signal_peptide", 1, cut - 1, - score, strand, phase, tags)) + gff_handle.write( + "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" + % ( + seqid, + source, + "signal_peptide", + 1, + cut - 1, + score, + strand, + phase, + tags, + ) + ) # mature_protein_region = SO:0000419 - gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" - % (seqid, source, - "mature_protein_region", cut, len(seq), - score, strand, phase, tags)) + gff_handle.write( + "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" + % ( + seqid, + source, + "mature_protein_region", + cut, + len(seq), + score, + strand, + phase, + tags, + ) + ) tab_handle.close() gff_handle.close() -fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), - n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) +if num_threads == 1: + # Still want to call split_fasta to apply truncation, but + # no reason to make multiple files - and more chance of + # hitting file system glitches if we do. So, + FASTA_CHUNK = sys.maxsize + +fasta_files = split_fasta( + fasta_file, + os.path.join(tmp_dir, "signalp"), + n=FASTA_CHUNK, + truncate=truncate, + max_len=MAX_LEN, +) temp_files = [f + ".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) -jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) - for (fasta, temp) in zip(fasta_files, temp_files)] +jobs = [ + "signalp -short -t %s %s > %s" % (organism, fasta, temp) + for (fasta, temp) in zip(fasta_files, temp_files) +] assert len(fasta_files) == len(temp_files) == len(jobs) @@ -205,9 +241,15 @@ if error_level or output.lower().startswith("error running"): clean_up(fasta_files + temp_files) if output: - sys.stderr.write("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output)) + sys.stderr.write( + "One or more tasks failed, e.g. %i from %r gave:\n%s" + % (error_level, cmd, output) + ) else: - sys.stderr.write("One or more tasks failed, e.g. %i from %r with no output\n" % (error_level, cmd)) + sys.stderr.write( + "One or more tasks failed, e.g. %i from %r with no output\n" + % (error_level, cmd) + ) sys.exit(error_level) del results @@ -218,8 +260,16 @@ fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name]) fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) # HMM results: -fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", - "HMM_Sprob_score", "HMM_Sprob_pred"]) +fields.extend( + [ + "HMM_type", + "HMM_Cmax_score", + "HMM_Cmax_pos", + "HMM_Cmax_pred", + "HMM_Sprob_score", + "HMM_Sprob_pred", + ] +) out_handle.write("#" + "\t".join(fields) + "\n") for temp in temp_files: data_handle = open(temp)
--- a/tools/protein_analysis/signalp3.xml Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/signalp3.xml Thu Jun 17 08:21:06 2021 +0000 @@ -168,17 +168,17 @@ Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 Bendtsen, Nielsen, von Heijne, and Brunak (2004). Improved prediction of signal peptides: SignalP 3.0. J. Mol. Biol., 340:783-795. -http://dx.doi.org/10.1016/j.jmb.2004.05.028 +https://doi.org/10.1016/j.jmb.2004.05.028 Nielsen, Engelbrecht, Brunak and von Heijne (1997). Identification of prokaryotic and eukaryotic signal peptides and prediction of their cleavage sites. Protein Engineering, 10:1-6. -http://dx.doi.org/10.1093/protein/10.1.1 +https://doi.org/10.1093/protein/10.1.1 Nielsen and Krogh (1998). Prediction of signal peptides and signal anchors by a hidden Markov model.
--- a/tools/protein_analysis/tmhmm2.py Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/tmhmm2.py Thu Jun 17 08:21:06 2021 +0000 @@ -39,7 +39,7 @@ Also tmhmm2 can fail without returning an error code, for example if run on a 64 bit machine with only the 32 bit binaries installed. This script will spot when there is no output from tmhmm2, and raise an error. -""" +""" # noqa: E501 from __future__ import print_function @@ -55,7 +55,10 @@ sys.exit("TMHMM wrapper version 0.0.16") if len(sys.argv) != 4: - sys.exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") + sys.exit( + "Require three arguments, number of threads (int), input protein " + "FASTA file & output tabular file" + ) num_threads = thread_count(sys.argv[1], default=4) fasta_file = sys.argv[2] @@ -87,8 +90,10 @@ predhel = predhel[8:] assert topology.startswith("Topology="), line topology = topology[9:] - out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" - % (identifier, length, exp_aa, first60, predhel, topology)) + out_handle.write( + "%s\t%s\t%s\t%s\t%s\t%s\n" + % (identifier, length, exp_aa, first60, predhel, topology) + ) count += 1 return count @@ -97,8 +102,10 @@ # split_fasta returns an empty list (i.e. zero temp files). fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) temp_files = [f + ".out" for f in fasta_files] -jobs = ["tmhmm -short %s > %s" % (fasta, temp) - for fasta, temp in zip(fasta_files, temp_files)] +jobs = [ + "tmhmm -short %s > %s" % (fasta, temp) + for fasta, temp in zip(fasta_files, temp_files) +] def clean_up(file_list): @@ -124,8 +131,11 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), - error_level) + sys.exit( + "One or more tasks failed, e.g. %i from %r gave:\n%s" + % (error_level, cmd, output), + error_level, + ) del results del jobs
--- a/tools/protein_analysis/tmhmm2.xml Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/tmhmm2.xml Thu Jun 17 08:21:06 2021 +0000 @@ -95,12 +95,12 @@ Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 Krogh, Larsson, von Heijne, and Sonnhammer (2001). Predicting Transmembrane Protein Topology with a Hidden Markov Model: Application to Complete Genomes. J. Mol. Biol. 305:567-580. -http://dx.doi.org/10.1006/jmbi.2000.4315 +https://doi.org/10.1006/jmbi.2000.4315 Sonnhammer, von Heijne, and Krogh (1998). A hidden Markov model for predicting transmembrane helices in protein sequences.
--- a/tools/protein_analysis/wolf_psort.py Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/wolf_psort.py Thu Jun 17 08:21:06 2021 +0000 @@ -69,7 +69,10 @@ sys.exit("WoLF-PSORT wrapper version 0.0.11") if len(sys.argv) != 5: - sys.exit("Require four arguments, organism, threads, input protein FASTA file & output tabular file") + sys.exit( + "Require four arguments, organism, threads, input protein FASTA file, " + "and output tabular file" + ) organism = sys.argv[1] if organism not in ["animal", "plant", "fungi"]: @@ -88,15 +91,16 @@ name, data = line.rstrip("\r\n").split(None, 1) for rank, comp_data in enumerate(data.split(",")): comp, score = comp_data.split() - out_handle.write("%s\t%s\t%s\t%i\n" - % (name, comp, score, rank + 1)) + out_handle.write("%s\t%s\t%s\t%i\n" % (name, comp, score, rank + 1)) fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK) temp_files = [f + ".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) -jobs = ["%s %s < %s > %s" % (exe, organism, fasta, temp) - for (fasta, temp) in zip(fasta_files, temp_files)] +jobs = [ + "%s %s < %s > %s" % (exe, organism, fasta, temp) + for (fasta, temp) in zip(fasta_files, temp_files) +] assert len(fasta_files) == len(temp_files) == len(jobs) @@ -120,8 +124,11 @@ if error_level or output.lower().startswith("error running"): clean_up(fasta_files) clean_up(temp_files) - sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), - error_level) + sys.exit( + "One or more tasks failed, e.g. %i from %r gave:\n%s" + % (error_level, cmd, output), + error_level, + ) del results out_handle = open(tabular_file, "w")
--- a/tools/protein_analysis/wolf_psort.xml Thu Sep 21 11:35:20 2017 -0400 +++ b/tools/protein_analysis/wolf_psort.xml Thu Jun 17 08:21:06 2021 +0000 @@ -127,12 +127,12 @@ Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 Paul Horton, Keun-Joon Park, Takeshi Obayashi, Naoya Fujita, Hajime Harada, C.J. Adams-Collier, and Kenta Nakai (2007). WoLF PSORT: Protein Localization Predictor. Nucleic Acids Research, 35(S2), W585-W587. -http://dx.doi.org/10.1093/nar/gkm259 +https://doi.org/10.1093/nar/gkm259 Paul Horton, Keun-Joon Park, Takeshi Obayashi and Kenta Nakai (2006). Protein Subcellular Localization Prediction with WoLF PSORT.