diff tools/protein_analysis/signalp3.py @ 19:f3ecd80850e2 draft

v0.2.9 Python style improvements
author peterjc
date Wed, 01 Feb 2017 09:46:42 -0500
parents eb6ac44d4b8e
children a19b3ded8f33
line wrap: on
line diff
--- a/tools/protein_analysis/signalp3.py	Tue Sep 01 09:56:36 2015 -0400
+++ b/tools/protein_analysis/signalp3.py	Wed Feb 01 09:46:42 2017 -0500
@@ -21,9 +21,9 @@
 
 # SignalP-NN euk predictions                                   	                # SignalP-HMM euk predictions
 # name                Cmax  pos ?  Ymax  pos ?  Smax  pos ?  Smean ?  D     ? 	# name      !  Cmax  pos ?  Sprob ?
-gi|2781234|pdb|1JLY|  0.061  17 N  0.043  17 N  0.199   1 N  0.067 N  0.055 N	gi|2781234|pdb|1JLY|B  Q  0.000  17 N  0.000 N  
-gi|4959044|gb|AAD342  0.099 191 N  0.012  38 N  0.023  12 N  0.014 N  0.013 N	gi|4959044|gb|AAD34209.1|AF069992_1  Q  0.000   0 N  0.000 N  
-gi|671626|emb|CAA856  0.139 381 N  0.020   8 N  0.121   4 N  0.067 N  0.044 N	gi|671626|emb|CAA85685.1|  Q  0.000   0 N  0.000 N  
+gi|2781234|pdb|1JLY|  0.061  17 N  0.043  17 N  0.199   1 N  0.067 N  0.055 N	gi|2781234|pdb|1JLY|B  Q  0.000  17 N  0.000 N
+gi|4959044|gb|AAD342  0.099 191 N  0.012  38 N  0.023  12 N  0.014 N  0.013 N	gi|4959044|gb|AAD34209.1|AF069992_1  Q  0.000   0 N  0.000 N
+gi|671626|emb|CAA856  0.139 381 N  0.020   8 N  0.121   4 N  0.067 N  0.044 N	gi|671626|emb|CAA85685.1|  Q  0.000   0 N  0.000 N
 gi|3298468|dbj|BAA31  0.208  24 N  0.184  38 N  0.980  32 Y  0.613 Y  0.398 N	gi|3298468|dbj|BAA31520.1|  Q  0.066  24 N  0.139 N
 
 In order to make it easier to use in Galaxy, this wrapper script reformats
@@ -56,28 +56,28 @@
 import sys
 import os
 import tempfile
-from seq_analysis_utils import sys_exit, split_fasta, fasta_iterator
+from seq_analysis_utils import split_fasta, fasta_iterator
 from seq_analysis_utils import run_jobs, thread_count
 
 FASTA_CHUNK = 500
-MAX_LEN = 6000 #Found by trial and error
+MAX_LEN = 6000  # Found by trial and error
 
-if len(sys.argv) not in  [6,8]:
-    sys_exit("Require five (or 7) arguments, organism, truncate, threads, "
+if len(sys.argv) not in [6, 8]:
+    sys.exit("Require five (or 7) arguments, organism, truncate, threads, "
              "input protein FASTA file & output tabular file (plus "
              "optionally cut method and GFF3 output file). "
-             "Got %i arguments." % (len(sys.argv)-1))
+             "Got %i arguments." % (len(sys.argv) - 1))
 
 organism = sys.argv[1]
 if organism not in ["euk", "gram+", "gram-"]:
-    sys_exit("Organism argument %s is not one of euk, gram+ or gram-" % organism)
+    sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism)
 
 try:
     truncate = int(sys.argv[2])
-except:
+except ValueError:
     truncate = 0
 if truncate < 0:
-    sys_exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
+    sys.exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
 
 num_threads = thread_count(sys.argv[3], default=4)
 fasta_file = sys.argv[4]
@@ -86,7 +86,7 @@
 if len(sys.argv) == 8:
     cut_method = sys.argv[6]
     if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
-        sys_exit("Invalid cut method %r" % cut_method)
+        sys.exit("Invalid cut method %r" % cut_method)
     gff3_file = sys.argv[7]
 else:
     cut_method = None
@@ -95,39 +95,41 @@
 
 tmp_dir = tempfile.mkdtemp()
 
+
 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
     """Clean up SignalP output to make it tabular."""
     if cut_method:
-        cut_col = {"NN_Cmax" : 2,
-                   "NN_Ymax" : 5,
-                   "NN_Smax" : 8,
-                   "HMM_Cmax" : 16}[cut_method]
+        cut_col = {"NN_Cmax": 2,
+                   "NN_Ymax": 5,
+                   "NN_Smax": 8,
+                   "HMM_Cmax": 16}[cut_method]
     else:
         cut_col = None
     for line in raw_handle:
         if not line or line.startswith("#"):
             continue
         parts = line.rstrip("\r\n").split()
-        assert len(parts)==21, repr(line)
+        assert len(parts) == 21, repr(line)
         assert parts[14].startswith(parts[0]), \
             "Bad entry in SignalP output, ID miss-match:\n%r" % line
-        #Remove redundant truncated name column (col 0)
-        #and put full name at start (col 14)
+        # Remove redundant truncated name column (col 0)
+        # and put full name at start (col 14)
         parts = parts[14:15] + parts[1:14] + parts[15:]
         out_handle.write("\t".join(parts) + "\n")
 
+
 def make_gff(fasta_file, tabular_file, gff_file, cut_method):
-    cut_col, score_col = {"NN_Cmax" : (2,1),
-                          "NN_Ymax" : (5,4),
-                          "NN_Smax" : (8,7),
-                          "HMM_Cmax" : (16,15),
+    cut_col, score_col = {"NN_Cmax": (2, 1),
+                          "NN_Ymax": (5, 4),
+                          "NN_Smax": (8, 7),
+                          "HMM_Cmax": (16, 15),
                           }[cut_method]
 
     source = "SignalP"
-    strand = "." #not stranded
-    phase = "." #not phased
+    strand = "."  # not stranded
+    phase = "."  # not phased
     tags = "Note=%s" % cut_method
-    
+
     tab_handle = open(tabular_file)
     line = tab_handle.readline()
     assert line.startswith("#ID\t"), line
@@ -139,53 +141,55 @@
         parts = line.rstrip("\n").split("\t")
         seqid = parts[0]
         assert title.startswith(seqid), "%s vs %s" % (seqid, title)
-        if len(seq)==0:
-            #Is it possible to have a zero length reference in GFF3?
+        if not seq:
+            # Is it possible to have a zero length reference in GFF3?
             continue
         cut = int(parts[cut_col])
         if cut == 0:
             assert cut_method == "HMM_Cmax", cut_method
-            #TODO - Why does it do this?
+            # TODO - Why does it do this?
             cut = 1
         assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
         score = parts[score_col]
-        gff_handle.write("##sequence-region %s %i %i\n" \
+        gff_handle.write("##sequence-region %s %i %i\n"
                           % (seqid, 1, len(seq)))
-        #If the cut is at the very begining, there is no signal peptide!
+        # If the cut is at the very begining, there is no signal peptide!
         if cut > 1:
-            #signal_peptide = SO:0000418
-            gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+            # signal_peptide = SO:0000418
+            gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
                              % (seqid, source,
-                                "signal_peptide", 1, cut-1,
+                                "signal_peptide", 1, cut - 1,
                                 score, strand, phase, tags))
-        #mature_protein_region = SO:0000419
-        gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+        # mature_protein_region = SO:0000419
+        gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
                          % (seqid, source,
                             "mature_protein_region", cut, len(seq),
                             score, strand, phase, tags))
-        tab_handle.close()
+    tab_handle.close()
     gff_handle.close()
 
 
 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
                           n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
-temp_files = [f+".out" for f in fasta_files]
+temp_files = [f + ".out" for f in fasta_files]
 assert len(fasta_files) == len(temp_files)
 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
         for (fasta, temp) in zip(fasta_files, temp_files)]
 assert len(fasta_files) == len(temp_files) == len(jobs)
 
+
 def clean_up(file_list):
+    """Remove temp files, and if possible the temp directory."""
     for f in file_list:
         if os.path.isfile(f):
             os.remove(f)
     try:
         os.rmdir(tmp_dir)
-    except:
+    except Exception:
         pass
 
 if len(jobs) > 1 and num_threads > 1:
-    #A small "info" message for Galaxy to show the user.
+    # A small "info" message for Galaxy to show the user.
     print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
 results = run_jobs(jobs, num_threads)
 assert len(fasta_files) == len(temp_files) == len(jobs)
@@ -197,17 +201,17 @@
         output = "(no output)"
     if error_level or output.lower().startswith("error running"):
         clean_up(fasta_files + temp_files)
-        sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
+        sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
                  error_level)
 del results
 
 out_handle = open(tabular_file, "w")
 fields = ["ID"]
-#NN results:
+# NN results:
 for name in ["Cmax", "Ymax", "Smax"]:
-    fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name])
+    fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name])
 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"])
-#HMM results:
+# HMM results:
 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred",
                "HMM_Sprob_score", "HMM_Sprob_pred"])
 out_handle.write("#" + "\t".join(fields) + "\n")
@@ -217,7 +221,7 @@
     data_handle.close()
 out_handle.close()
 
-#GFF3:
+# GFF3:
 if cut_method:
     make_gff(fasta_file, tabular_file, gff3_file, cut_method)