tmhmm_and_signalp: tools/protein_analysis/signalp3.py comparison

comparison tools/protein_analysis/signalp3.py @ 19:f3ecd80850e2 draft

v0.2.9 Python style improvements

author	peterjc
date	Wed, 01 Feb 2017 09:46:42 -0500
parents	eb6ac44d4b8e
children	a19b3ded8f33

comparison

equal deleted inserted replaced

-:eb6ac44d4b8e
+:f3ecd80850e2
 First major feature is cleaning up the output. The raw output from SignalP
 v3.0 looks like this (21 columns space separated):
 # SignalP-NN euk predictions                                   	                # SignalP-HMM euk predictions
 # name                Cmax  pos ?  Ymax  pos ?  Smax  pos ?  Smean ?  D     ? 	# name      !  Cmax  pos ?  Sprob ?
 gi|2781234|pdb|1JLY|  0.061  17 N  0.043  17 N  0.199   1 N  0.067 N  0.055 N	gi|2781234|pdb|1JLY|B  Q  0.000  17 N  0.000 N
 gi|4959044|gb|AAD342  0.099 191 N  0.012  38 N  0.023  12 N  0.014 N  0.013 N	gi|4959044|gb|AAD34209.1|AF069992_1  Q  0.000   0 N  0.000 N
 gi|671626|emb|CAA856  0.139 381 N  0.020   8 N  0.121   4 N  0.067 N  0.044 N	gi|671626|emb|CAA85685.1|  Q  0.000   0 N  0.000 N
 gi|3298468|dbj|BAA31  0.208  24 N  0.184  38 N  0.980  32 Y  0.613 Y  0.398 N	gi|3298468|dbj|BAA31520.1|  Q  0.066  24 N  0.139 N
 In order to make it easier to use in Galaxy, this wrapper script reformats
 this to use tab separators. Also it removes the redundant truncated name
 column, and assigns unique column names in the header:
 the predictors which gives a cleavage site). *WORK IN PROGRESS*
 """
 import sys
 import os
 import tempfile
-from seq_analysis_utils import sys_exit, split_fasta, fasta_iterator
+from seq_analysis_utils import split_fasta, fasta_iterator
 from seq_analysis_utils import run_jobs, thread_count
 FASTA_CHUNK = 500
-MAX_LEN = 6000 #Found by trial and error
+MAX_LEN = 6000  # Found by trial and error
-if len(sys.argv) not in  [6,8]:
+if len(sys.argv) not in [6, 8]:
-sys_exit("Require five (or 7) arguments, organism, truncate, threads, "
+sys.exit("Require five (or 7) arguments, organism, truncate, threads, "
 "input protein FASTA file & output tabular file (plus "
 "optionally cut method and GFF3 output file). "
-"Got %i arguments." % (len(sys.argv)-1))
+"Got %i arguments." % (len(sys.argv) - 1))
 organism = sys.argv[1]
 if organism not in ["euk", "gram+", "gram-"]:
-sys_exit("Organism argument %s is not one of euk, gram+ or gram-" % organism)
+sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism)
 try:
 truncate = int(sys.argv[2])
-except:
+except ValueError:
 truncate = 0
 if truncate < 0:
-sys_exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
+sys.exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
 num_threads = thread_count(sys.argv[3], default=4)
 fasta_file = sys.argv[4]
 tabular_file = sys.argv[5]
 if len(sys.argv) == 8:
 cut_method = sys.argv[6]
 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
-sys_exit("Invalid cut method %r" % cut_method)
+sys.exit("Invalid cut method %r" % cut_method)
 gff3_file = sys.argv[7]
 else:
 cut_method = None
 gff3_file = None
 tmp_dir = tempfile.mkdtemp()
 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
 """Clean up SignalP output to make it tabular."""
 if cut_method:
-cut_col = {"NN_Cmax" : 2,
+cut_col = {"NN_Cmax": 2,
-"NN_Ymax" : 5,
+"NN_Ymax": 5,
-"NN_Smax" : 8,
+"NN_Smax": 8,
-"HMM_Cmax" : 16}[cut_method]
+"HMM_Cmax": 16}[cut_method]
 else:
 cut_col = None
 for line in raw_handle:
 if not line or line.startswith("#"):
 continue
 parts = line.rstrip("\r\n").split()
-assert len(parts)==21, repr(line)
+assert len(parts) == 21, repr(line)
 assert parts[14].startswith(parts[0]), \
 "Bad entry in SignalP output, ID miss-match:\n%r" % line
-#Remove redundant truncated name column (col 0)
+# Remove redundant truncated name column (col 0)
-#and put full name at start (col 14)
+# and put full name at start (col 14)
 parts = parts[14:15] + parts[1:14] + parts[15:]
 out_handle.write("\t".join(parts) + "\n")
 def make_gff(fasta_file, tabular_file, gff_file, cut_method):
-cut_col, score_col = {"NN_Cmax" : (2,1),
+cut_col, score_col = {"NN_Cmax": (2, 1),
-"NN_Ymax" : (5,4),
+"NN_Ymax": (5, 4),
-"NN_Smax" : (8,7),
+"NN_Smax": (8, 7),
-"HMM_Cmax" : (16,15),
+"HMM_Cmax": (16, 15),
 }[cut_method]
 source = "SignalP"
-strand = "." #not stranded
+strand = "."  # not stranded
-phase = "." #not phased
+phase = "."  # not phased
 tags = "Note=%s" % cut_method
 tab_handle = open(tabular_file)
 line = tab_handle.readline()
 assert line.startswith("#ID\t"), line
 gff_handle = open(gff_file, "w")
 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
 parts = line.rstrip("\n").split("\t")
 seqid = parts[0]
 assert title.startswith(seqid), "%s vs %s" % (seqid, title)
-if len(seq)==0:
+if not seq:
-#Is it possible to have a zero length reference in GFF3?
+# Is it possible to have a zero length reference in GFF3?
 continue
 cut = int(parts[cut_col])
 if cut == 0:
 assert cut_method == "HMM_Cmax", cut_method
-#TODO - Why does it do this?
+# TODO - Why does it do this?
 cut = 1
 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
 score = parts[score_col]
-gff_handle.write("##sequence-region %s %i %i\n" \
+gff_handle.write("##sequence-region %s %i %i\n"
 % (seqid, 1, len(seq)))
-#If the cut is at the very begining, there is no signal peptide!
+# If the cut is at the very begining, there is no signal peptide!
 if cut > 1:
-#signal_peptide = SO:0000418
+# signal_peptide = SO:0000418
-gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
 % (seqid, source,
-"signal_peptide", 1, cut-1,
+"signal_peptide", 1, cut - 1,
 score, strand, phase, tags))
-#mature_protein_region = SO:0000419
+# mature_protein_region = SO:0000419
-gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
 % (seqid, source,
 "mature_protein_region", cut, len(seq),
 score, strand, phase, tags))
 tab_handle.close()
 gff_handle.close()
 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
-temp_files = [f+".out" for f in fasta_files]
+temp_files = [f + ".out" for f in fasta_files]
 assert len(fasta_files) == len(temp_files)
 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
 for (fasta, temp) in zip(fasta_files, temp_files)]
 assert len(fasta_files) == len(temp_files) == len(jobs)
 def clean_up(file_list):
+"""Remove temp files, and if possible the temp directory."""
 for f in file_list:
 if os.path.isfile(f):
 os.remove(f)
 try:
 os.rmdir(tmp_dir)
-except:
+except Exception:
 pass
 if len(jobs) > 1 and num_threads > 1:
-#A small "info" message for Galaxy to show the user.
+# A small "info" message for Galaxy to show the user.
 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
 results = run_jobs(jobs, num_threads)
 assert len(fasta_files) == len(temp_files) == len(jobs)
 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
 error_level = results[cmd]
 output = open(temp).readline()
 except IOError:
 output = "(no output)"
 if error_level or output.lower().startswith("error running"):
 clean_up(fasta_files + temp_files)
-sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
+sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
 error_level)
 del results
 out_handle = open(tabular_file, "w")
 fields = ["ID"]
-#NN results:
+# NN results:
 for name in ["Cmax", "Ymax", "Smax"]:
-fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name])
+fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name])
 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"])
-#HMM results:
+# HMM results:
 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred",
 "HMM_Sprob_score", "HMM_Sprob_pred"])
 out_handle.write("#" + "\t".join(fields) + "\n")
 for temp in temp_files:
 data_handle = open(temp)
 clean_tabular(data_handle, out_handle)
 data_handle.close()
 out_handle.close()
-#GFF3:
+# GFF3:
 if cut_method:
 make_gff(fasta_file, tabular_file, gff3_file, cut_method)
 clean_up(fasta_files + temp_files)

Mercurial > repos > peterjc > tmhmm_and_signalp

comparison tools/protein_analysis/signalp3.py @ 19:f3ecd80850e2 draft