comparison tools/protein_analysis/signalp3.py @ 11:99b82a2b1272 draft

Uploaded v0.2.0 which added PSORTb wrapper (written with Konrad Paszkiewicz)
author peterjc
date Wed, 03 Apr 2013 10:49:10 -0400
parents e52220a9ddad
children eb6ac44d4b8e
comparison
equal deleted inserted replaced
10:09ff180d1615 11:99b82a2b1272
61 61
62 FASTA_CHUNK = 500 62 FASTA_CHUNK = 500
63 MAX_LEN = 6000 #Found by trial and error 63 MAX_LEN = 6000 #Found by trial and error
64 64
65 if len(sys.argv) not in [6,8]: 65 if len(sys.argv) not in [6,8]:
66 stop_err("Require five (or 7) arguments, organism, truncate, threads, " 66 stop_err("Require five (or 7) arguments, organism, truncate, threads, "
67 "input protein FASTA file & output tabular file (plus " 67 "input protein FASTA file & output tabular file (plus "
68 "optionally cut method and GFF3 output file). " 68 "optionally cut method and GFF3 output file). "
69 "Got %i arguments." % (len(sys.argv)-1)) 69 "Got %i arguments." % (len(sys.argv)-1))
70 70
71 organism = sys.argv[1] 71 organism = sys.argv[1]
72 if organism not in ["euk", "gram+", "gram-"]: 72 if organism not in ["euk", "gram+", "gram-"]:
73 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) 73 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism)
74 74
75 try: 75 try:
76 truncate = int(sys.argv[2]) 76 truncate = int(sys.argv[2])
77 except: 77 except:
78 truncate = 0 78 truncate = 0
79 if truncate < 0: 79 if truncate < 0:
80 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) 80 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
81 81
82 num_threads = thread_count(sys.argv[3], default=4) 82 num_threads = thread_count(sys.argv[3], default=4)
83 fasta_file = sys.argv[4] 83 fasta_file = sys.argv[4]
84 tabular_file = sys.argv[5] 84 tabular_file = sys.argv[5]
85 85
86 if len(sys.argv) == 8: 86 if len(sys.argv) == 8:
87 cut_method = sys.argv[6] 87 cut_method = sys.argv[6]
88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: 88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
89 stop_err("Invalid cut method %r" % cut_method) 89 stop_err("Invalid cut method %r" % cut_method)
90 gff3_file = sys.argv[7] 90 gff3_file = sys.argv[7]
91 else: 91 else:
92 cut_method = None 92 cut_method = None
93 gff3_file = None 93 gff3_file = None
94 94
95 95
96 tmp_dir = tempfile.mkdtemp() 96 tmp_dir = tempfile.mkdtemp()
97 97
98 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): 98 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
99 """Clean up SignalP output to make it tabular.""" 99 """Clean up SignalP output to make it tabular."""
100 if cut_method: 100 if cut_method:
101 cut_col = {"NN_Cmax" : 2, 101 cut_col = {"NN_Cmax" : 2,
102 "NN_Ymax" : 5, 102 "NN_Ymax" : 5,
103 "NN_Smax" : 8, 103 "NN_Smax" : 8,
104 "HMM_Cmax" : 16}[cut_method] 104 "HMM_Cmax" : 16}[cut_method]
105 else: 105 else:
106 cut_col = None 106 cut_col = None
107 for line in raw_handle: 107 for line in raw_handle:
108 if not line or line.startswith("#"): 108 if not line or line.startswith("#"):
109 continue 109 continue
110 parts = line.rstrip("\r\n").split() 110 parts = line.rstrip("\r\n").split()
111 assert len(parts)==21, repr(line) 111 assert len(parts)==21, repr(line)
112 assert parts[14].startswith(parts[0]) 112 assert parts[14].startswith(parts[0]), \
113 "Bad entry in SignalP output, ID miss-match:\n%r" % line
113 #Remove redundant truncated name column (col 0) 114 #Remove redundant truncated name column (col 0)
114 #and put full name at start (col 14) 115 #and put full name at start (col 14)
115 parts = parts[14:15] + parts[1:14] + parts[15:] 116 parts = parts[14:15] + parts[1:14] + parts[15:]
116 out_handle.write("\t".join(parts) + "\n") 117 out_handle.write("\t".join(parts) + "\n")
117 118
216 data_handle.close() 217 data_handle.close()
217 out_handle.close() 218 out_handle.close()
218 219
219 #GFF3: 220 #GFF3:
220 if cut_method: 221 if cut_method:
221 make_gff(fasta_file, tabular_file, gff3_file, cut_method) 222 make_gff(fasta_file, tabular_file, gff3_file, cut_method)
222 223
223 clean_up(fasta_files + temp_files) 224 clean_up(fasta_files + temp_files)