Mercurial > repos > peterjc > tmhmm_and_signalp

diff tools/protein_analysis/rxlr_motifs.py @ 21:238eae32483c draft
"Check this is up to date with all 2020 changes (black etc)"
author: peterjc
date: Thu, 17 Jun 2021 08:21:06 +0000
parents: a19b3ded8f33
children: e1996f0f4e85
--- a/tools/protein_analysis/rxlr_motifs.py	Thu Sep 21 11:35:20 2017 -0400
+++ b/tools/protein_analysis/rxlr_motifs.py	Thu Jun 17 08:21:06 2021 +0000
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-"""Implements assorted RXLR motif methods from the literature
+"""Implements assorted RXLR motif methods from the literature.
 
 This script takes exactly four command line arguments:
  * Protein FASTA filename
@@ -8,21 +8,18 @@
  * Output tabular filename
 
 The model names are:
-
-Bhattacharjee2006: Simple regular expression search for RXLR
-with additional requirements for positioning and signal peptide.
-
-Win2007: Simple regular expression search for RXLR, but with
-different positional requirements.
-
-Whisson2007: As Bhattacharjee2006 but with a more complex regular
-expression to look for RXLR-EER domain, and additionally calls HMMER.
+ * Bhattacharjee2006: Simple regular expression search for RXLR
+   with additional requirements for positioning and signal peptide.
+ * Win2007: Simple regular expression search for RXLR, but with
+   different positional requirements.
+ * Whisson2007: As Bhattacharjee2006 but with a more complex regular
+   expression to look for RXLR-EER domain, and additionally calls HMMER.
 
 See the help text in the accompanying Galaxy tool XML file for more
 details including the full references.
 
-Note:
-
+Note
+----
 Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0,
 which is no longer available. The current release is SignalP v3.0
 (Mar 5, 2007). We have therefore opted to use the NN Ymax position for
@@ -55,11 +52,14 @@
 from seq_analysis_utils import fasta_iterator
 
 if "-v" in sys.argv:
-    print("RXLR Motifs v0.0.14")
+    print("RXLR Motifs v0.0.16")
     sys.exit(0)
 
 if len(sys.argv) != 5:
-    sys.exit("Requires four arguments: protein FASTA filename, threads, model, and output filename")
+    sys.exit(
+        "Requires four arguments: protein FASTA filename, threads, "
+        "model, and output filename"
+    )
 
 fasta_file, threads, model, tabular_file = sys.argv[1:]
 hmm_output_file = tabular_file + ".hmm.tmp"
@@ -98,15 +98,20 @@
     min_rxlr_start = 1
     max_rxlr_start = max_sp + max_sp_rxlr
 else:
-    sys.exit("Did not recognise the model name %r\n"
-             "Use Bhattacharjee2006, Win2007, or Whisson2007" % model)
+    sys.exit(
+        "Did not recognise the model name %r\n"
+        "Use Bhattacharjee2006, Win2007, or Whisson2007" % model
+    )
 
 
 def get_hmmer_version(exe, required=None):
     try:
-        child = subprocess.Popen([exe, "-h"],
-                                 universal_newlines=True,
-                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        child = subprocess.Popen(
+            [exe, "-h"],
+            universal_newlines=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
     except OSError:
         raise ValueError("Could not run %s" % exe)
     stdout, stderr = child.communicate()
@@ -122,8 +127,9 @@
 
 # Run hmmsearch for Whisson et al. (2007)
 if model == "Whisson2007":
-    hmm_file = os.path.join(os.path.split(sys.argv[0])[0],
-                            "whisson_et_al_rxlr_eer_cropped.hmm")
+    hmm_file = os.path.join(
+        os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm"
+    )
     if not os.path.isfile(hmm_file):
         sys.exit("Missing HMM file for Whisson et al. (2007)")
     if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
@@ -143,23 +149,32 @@
     else:
         # I've left the code to handle HMMER 3 in situ, in case
         # we revisit the choice to insist on HMMER 2.
-        hmmer3 = (3 == get_hmmer_version(hmmer_search))
+        hmmer3 = 3 == get_hmmer_version(hmmer_search)
         # Using zero (or 5.6?) for bitscore threshold
         if hmmer3:
             # The HMMER3 table output is easy to parse
             # In HMMER3 can't use both -T and -E
-            cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \
-                  % (hmmer_search, hmm_output_file, hmm_file, fasta_file)
+            cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" % (
+                hmmer_search,
+                hmm_output_file,
+                hmm_file,
+                fasta_file,
+            )
         else:
             # For HMMER2 we are stuck with parsing stdout
             # Put 1e6 to effectively have no expectation threshold (otherwise
             # HMMER defaults to 10 and the calculated e-value depends on the
             # input FASTA file, and we can loose hits of interest).
-            cmd = "%s -T 0 -E 1e6 %s %s > %s" \
-                  % (hmmer_search, hmm_file, fasta_file, hmm_output_file)
+            cmd = "%s -T 0 -E 1e6 %s %s > %s" % (
+                hmmer_search,
+                hmm_file,
+                fasta_file,
+                hmm_output_file,
+            )
         return_code = os.system(cmd)
         if return_code:
-            sys.exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code)
+            sys.stderr.write("Error %i from hmmsearch:\n%s\n" % (return_code, cmd))
+            sys.exit(return_code)
 
         handle = open(hmm_output_file)
         for line in handle:
@@ -195,7 +210,7 @@
 for title, seq in fasta_iterator(fasta_file):
     total += 1
     name = title.split(None, 1)[0]
-    match = re_rxlr.search(seq[min_rxlr_start - 1:].upper())
+    match = re_rxlr.search(seq[min_rxlr_start - 1 :].upper())
     if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
         # This is a potential RXLR, depending on the SignalP results.
         # Might as well truncate the sequence now, makes the temp file smaller
@@ -213,7 +228,13 @@
 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py")
 if not os.path.isfile(signalp_script):
     sys.exit("Error - missing signalp3.py script")
-cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file)
+cmd = "python '%s' 'euk' '%i' '%s' '%s' '%s'" % (
+    signalp_script,
+    signalp_trunc,
+    threads,
+    signalp_input_file,
+    signalp_output_file,
+)
 return_code = os.system(cmd)
 if return_code:
     sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd))
@@ -221,7 +242,10 @@
 
 
 def parse_signalp(filename):
-    """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length.
+    """Parse SignalP output, yield tuples of values.
+
+    Returns tuples of ID, HMM_Sprob_score and NN predicted signal
+    peptide length.
 
     For signal peptide length we use NN_Ymax_pos (minus one).
     """
@@ -237,7 +261,7 @@
 
 # Parse SignalP results and apply the strict RXLR criteria
 total = 0
-tally = dict()
+tally = {}
 handle = open(tabular_file, "w")
 handle.write("#ID\t%s\n" % model)
 signalp_results = parse_signalp(signalp_output_file)
@@ -245,12 +269,12 @@
     total += 1
     rxlr = "N"
     name = title.split(None, 1)[0]
-    match = re_rxlr.search(seq[min_rxlr_start - 1:].upper())
+    match = re_rxlr.search(seq[min_rxlr_start - 1 :].upper())
     if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
         del match
         # This was the criteria for calling SignalP,
         # so it will be in the SignalP results.
-        sp_id, sp_hmm_score, sp_nn_len = signalp_results.next()
+        sp_id, sp_hmm_score, sp_nn_len = next(signalp_results)
         assert name == sp_id, "%s vs %s" % (name, sp_id)
         if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp:
             match = re_rxlr.search(seq[sp_nn_len:].upper())
@@ -278,7 +302,7 @@
 
 # Check the iterator is finished
 try:
-    signalp_results.next()
+    next(signalp_results)
     assert False, "Unexpected data in SignalP output"
 except StopIteration:
     pass
@@ -289,4 +313,4 @@
 
 # Short summary to stdout for Galaxy's info display
 print("%s for %i sequences:" % (model, total))
-print(", ".join("%s = %i" % kv for kv in sorted(tally.iteritems())))
+print(", ".join("%s = %i" % kv for kv in sorted(tally.items())))
author	peterjc
date	Thu, 17 Jun 2021 08:21:06 +0000
parents	a19b3ded8f33
children	e1996f0f4e85