diff tools/protein_analysis/rxlr_motifs.py @ 6:a290c6d4e658

Migrated tool version 0.0.9 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:07:09 -0400
parents
children 7de64c8b258d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/protein_analysis/rxlr_motifs.py	Tue Jun 07 18:07:09 2011 -0400
@@ -0,0 +1,268 @@
+#!/usr/bin/env python
+"""Implements assorted RXLR motif methods from the literature
+
+This script takes exactly four command line arguments:
+ * Protein FASTA filename
+ * Number of threads
+ * Model name (Bhattacharjee2006, Win2007, Whisson2007)
+ * Output tabular filename
+
+The model names are:
+
+Bhattacharjee2006: Simple regular expression search for RXLR
+with additional requirements for positioning and signal peptide.
+
+Win2007: Simple regular expression search for RXLR, but with
+different positional requirements.
+
+Whisson2007: As Bhattacharjee2006 but with a more complex regular
+expression to look for RXLR-EER domain, and additionally calls HMMER.
+
+See the help text in the accompanying Galaxy tool XML file for more
+details including the full references.
+
+Note:
+
+Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0,
+which is no longer available. The current release is SignalP v3.0
+(Mar 5, 2007). We have therefore opted to use the NN Ymax position for
+the predicted cleavage site, as this is expected to be more accurate.
+Also note that the HMM score values have changed from v2.0 to v3.0.
+Whisson et al. (2007) used SignalP v3.0 anyway.
+
+Whisson et al. (2007) used HMMER 2.3.2, and althought their HMM model
+can still be used with hmmsearch from HMMER 3 this this does give
+slightly different results. We expect the hmmsearch from HMMER 2.3.2
+(the last stable release of HMMER 2) to be present on the path under
+the name hmmsearch2 (allowing it to co-exist with HMMER 3).
+"""
+import os
+import sys
+import re
+import subprocess
+from seq_analysis_utils import stop_err, fasta_iterator
+
+if len(sys.argv) != 5:
+   stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename")
+
+fasta_file, threads, model, tabular_file = sys.argv[1:]
+hmm_output_file = tabular_file + ".hmm.tmp"
+signalp_input_file = tabular_file + ".fasta.tmp"
+signalp_output_file = tabular_file + ".tabular.tmp"
+min_signalp_hmm = 0.9
+hmmer_search = "hmmsearch2"
+
+if model == "Bhattacharjee2006":
+   signalp_trunc = 70
+   re_rxlr = re.compile("R.LR")
+   min_sp = 10
+   max_sp = 40
+   max_sp_rxlr = 100
+   min_rxlr_start = 1
+   #Allow signal peptide to be at most 40aa, and want RXLR to be
+   #within 100aa, therefore for the prescreen the max start is 140:
+   max_rxlr_start = max_sp + max_sp_rxlr
+elif model == "Win2007":
+   signalp_trunc = 70
+   re_rxlr = re.compile("R.LR")
+   min_sp = 10
+   max_sp = 40
+   min_rxlr_start = 30
+   max_rxlr_start = 60
+   #No explicit limit on separation of signal peptide clevage
+   #and RXLR, but shortest signal peptide is 10, and furthest
+   #away RXLR is 60, so effectively limit is 50.
+   max_sp_rxlr = max_rxlr_start - min_sp + 1
+elif model == "Whisson2007":
+   signalp_trunc = 0 #zero for no truncation
+   re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]")
+   min_sp = 10
+   max_sp = 40
+   max_sp_rxlr = 100
+   min_rxlr_start = 1
+   max_rxlr_start = max_sp + max_sp_rxlr
+else:
+   stop_err("Did not recognise the model name %r\n"
+            "Use Bhattacharjee2006, Win2007, or Whisson2007" % model)
+
+
+def get_hmmer_version(exe, required=None):
+    cmd = "%s -h" % exe
+    try:
+        child = subprocess.Popen([exe, "-h"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except OSError:
+        raise ValueError("Could not run %s" % exe)
+    stdout, stderr = child.communicate()
+    if required:
+        return required in stdout
+    elif "HMMER 2" in stdout:
+        return 2
+    elif "HMMER 3" in stdout:
+        return 3
+    else:
+        raise ValueError("Could not determine version of %s" % exe)
+    
+
+#Run hmmsearch for Whisson et al. (2007)
+if model == "Whisson2007":
+    hmm_file = os.path.join(os.path.split(sys.argv[0])[0],
+                       "whisson_et_al_rxlr_eer_cropped.hmm")
+    if not os.path.isfile(hmm_file):
+        stop_err("Missing HMM file for Whisson et al. (2007)")
+    if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
+        stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher)
+    #I've left the code to handle HMMER 3 in situ, in case
+    #we revisit the choice to insist on HMMER 2.
+    hmmer3 = (3 == get_hmmer_version(hmmer_search))
+    #Using zero (or 5.6?) for bitscore threshold
+    if hmmer3:
+        #The HMMER3 table output is easy to parse
+        #In HMMER3 can't use both -T and -E
+        cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \
+              % (hmmer_search, hmm_output_file, hmm_file, fasta_file)
+    else:
+        #For HMMER2 we are stuck with parsing stdout
+        #Put 1e6 to effectively have no expectation threshold (otherwise
+        #HMMER defaults to 10 and the calculated e-value depends on the
+        #input FASTA file, and we can loose hits of interest).
+        cmd = "%s -T 0 -E 1e6 %s %s > %s" \
+              % (hmmer_search, hmm_file, fasta_file, hmm_output_file)
+    return_code = os.system(cmd)
+    if return_code:
+        stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd))
+    hmm_hits = set()
+    valid_ids = set()
+    for title, seq in fasta_iterator(fasta_file):
+        name = title.split(None,1)[0]
+        if name in valid_ids:
+            stop_err("Duplicated identifier %r" % name)
+        else:
+            valid_ids.add(name)
+    handle = open(hmm_output_file)
+    for line in handle:
+        if not line.strip():
+            #We expect blank lines in the HMMER2 stdout
+            continue
+        elif line.startswith("#"):
+            #Header
+            continue
+        else:
+            name = line.split(None,1)[0]
+            #Should be a sequence name in the HMMER3 table output.
+            #Could be anything in the HMMER2 stdout.
+            if name in valid_ids:
+                hmm_hits.add(name)
+            elif hmmer3:
+                stop_err("Unexpected identifer %r in hmmsearch output" % name)
+    handle.close()
+    #if hmmer3:
+    #    print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
+    #else:
+    #    print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids))  
+    #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids))
+    os.remove(hmm_output_file)
+    del valid_ids
+
+
+#Prepare short list of candidates containing RXLR to pass to SignalP
+assert min_rxlr_start > 0, "Min value one, since zero based counting"
+count = 0
+total = 0
+handle = open(signalp_input_file, "w")
+for title, seq in fasta_iterator(fasta_file):
+    total += 1
+    name = title.split(None,1)[0]
+    match = re_rxlr.search(seq[min_rxlr_start-1:].upper())
+    if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
+        #This is a potential RXLR, depending on the SignalP results.
+        #Might as well truncate the sequence now, makes the temp file smaller
+        if signalp_trunc:
+            handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc]))
+        else:
+            #Does it matter we don't line wrap?
+            handle.write(">%s\n%s\n" % (name, seq))
+        count += 1
+handle.close()
+#print "Running SignalP on %i/%i potentials." % (count, total)
+
+
+#Run SignalP (using our wrapper script to get multi-core support etc)
+signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py")
+if not os.path.isfile(signalp_script):
+    stop_err("Error - missing signalp3.py script")
+cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file)
+return_code = os.system(cmd)
+if return_code:
+    stop_err("Error %i from SignalP:\n%s" % (return_code, cmd))
+#print "SignalP done"
+
+def parse_signalp(filename):
+    """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length.
+
+    For signal peptide length we use NN_Ymax_pos (minus one).
+    """
+    handle = open(filename)
+    line = handle.readline()
+    assert line.startswith("#ID\t"), line
+    for line in handle:
+        parts = line.rstrip("\t").split("\t")
+        assert len(parts)==20, repr(line)
+        yield parts[0], float(parts[18]), int(parts[5])-1
+    handle.close()
+
+
+#Parse SignalP results and apply the strict RXLR criteria
+total = 0
+tally = dict()
+handle = open(tabular_file, "w")
+handle.write("#ID\t%s\n" % model)
+signalp_results = parse_signalp(signalp_output_file)
+for title, seq in fasta_iterator(fasta_file):
+    total += 1
+    rxlr = "N"
+    name = title.split(None,1)[0]
+    match = re_rxlr.search(seq[min_rxlr_start-1:].upper())
+    if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
+        del match
+        #This was the criteria for calling SignalP,
+        #so it will be in the SignalP results.
+        sp_id, sp_hmm_score, sp_nn_len = signalp_results.next()
+        assert name == sp_id, "%s vs %s" % (name, sp_id)
+        if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp:
+            match = re_rxlr.search(seq[sp_nn_len:].upper())
+            if match and match.start() + 1 <= max_sp_rxlr: #1-based counting
+                rxlr_start = sp_nn_len + match.start() + 1
+                if min_rxlr_start <= rxlr_start <= max_rxlr_start:
+                    rxlr = "Y"
+    if model == "Whisson2007":
+        #Combine the signalp with regular expression heuristic and the HMM
+        if name in hmm_hits and rxlr == "N":
+            rxlr = "hmm" #HMM only
+        elif rxlr == "N":
+            rxlr = "neither" #Don't use N (no)
+        elif name not in hmm_hits and rxlr == "Y":
+            rxlr = "re" #Heuristic only
+        #Now have a four way classifier: Y, hmm, re, neither
+        #and count is the number of Y results (both HMM and heuristic)
+    handle.write("%s\t%s\n" % (name, rxlr))
+    try:
+        tally[rxlr] += 1
+    except KeyError:
+        tally[rxlr] = 1
+handle.close()
+assert sum(tally.values()) == total
+
+#Check the iterator is finished
+try:
+    signalp_results.next()
+    assert False, "Unexpected data in SignalP output"
+except StopIteration:
+    pass
+
+#Cleanup
+os.remove(signalp_input_file)
+os.remove(signalp_output_file)
+
+#Short summary to stdout for Galaxy's info display
+print "%s for %i sequences:" % (model, total)
+print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))