comparison tools/protein_analysis/psortb.py @ 11:99b82a2b1272 draft

Uploaded v0.2.0 which added PSORTb wrapper (written with Konrad Paszkiewicz)
author peterjc
date Wed, 03 Apr 2013 10:49:10 -0400
parents
children eb6ac44d4b8e
comparison
equal deleted inserted replaced
10:09ff180d1615 11:99b82a2b1272
1 #!/usr/bin/env python
2 """Wrapper for psortb for use in Galaxy.
3
4 This script takes exactly six command line arguments - which includes the
5 number of threads, and the input protein FASTA filename and output
6 tabular filename. It then splits up the FASTA input and calls multiple
7 copies of the standalone psortb v3 program, then collates the output.
8 e.g. Rather than this,
9
10 psort $type -c $cutoff -d $divergent -o long $sequence > $outfile
11
12 Call this:
13
14 psort $threads $type $cutoff $divergent $sequence $outfile
15
16 If ommitting -c or -d options, set $cutoff and $divergent to zero or blank.
17
18 Note that this is somewhat redundant with job-splitting available in Galaxy
19 itself (see the SignalP XML file for settings), but both can be applied.
20
21 Additionally it ensures the header line (with the column names) starts
22 with a # character as used elsewhere in Galaxy.
23 """
24 import sys
25 import os
26 import tempfile
27 from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
28
29 FASTA_CHUNK = 500
30
31 if "-v" in sys.argv or "--version" in sys.argv:
32 """Return underlying PSORTb's version"""
33 sys.exit(os.system("psort --version"))
34
35 if len(sys.argv) != 8:
36 stop_err("Require 7 arguments, number of threads (int), type (e.g. archaea), "
37 "output (e.g. terse/normal/long), cutoff, divergent, input protein "
38 "FASTA file & output tabular file")
39
40 num_threads = thread_count(sys.argv[1], default=4)
41 org_type = sys.argv[2]
42 out_type = sys.argv[3]
43 cutoff = sys.argv[4]
44 if cutoff.strip() and float(cutoff.strip()) != 0.0:
45 cutoff = "-c %s" % cutoff
46 else:
47 cutoff = ""
48 divergent = sys.argv[5]
49 if divergent.strip() and float(divergent.strip()) != 0.0:
50 divergent = "-d %s" % divergent
51 else:
52 divergent = ""
53 fasta_file = sys.argv[6]
54 tabular_file = sys.argv[7]
55
56 if out_type == "terse":
57 header = ['SeqID', 'Localization', 'Score']
58 elif out_type == "normal":
59 stop_err("Normal output not implemented yet, sorry.")
60 elif out_type == "long":
61 if org_type == "-n":
62 #Gram negative bacteria
63 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details',
64 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details',
65 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details',
66 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details',
67 'Profile-_Localization', 'Profile-_Details',
68 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details',
69 'Signal-_Localization', 'Signal-_Details',
70 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score',
71 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
72 'Secondary_Localization', 'PSortb_Version']
73 elif org_type == "-p":
74 #Gram positive bacteria
75 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details',
76 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details',
77 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details',
78 'Profile+_Localization', 'Profile+_Details',
79 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details',
80 'Signal+_Localization', 'Signal+_Details',
81 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
82 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
83 'Secondary_Localization', 'PSortb_Version']
84 elif org_type == "-a":
85 #Archaea
86 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details',
87 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details',
88 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details',
89 'Profile_a_Localization', 'Profile_a_Details',
90 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details',
91 'Signal_a_Localization', 'Signal_a_Details',
92 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
93 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
94 'Secondary_Localization', 'PSortb_Version']
95 else:
96 stop_err("Expected -n, -p or -a for the organism type, not %r" % org_type)
97 else:
98 stop_err("Expected terse, normal or long for the output type, not %r" % out_type)
99
100 tmp_dir = tempfile.mkdtemp()
101
102 def clean_tabular(raw_handle, out_handle):
103 """Clean up tabular TMHMM output, returns output line count."""
104 global header
105 count = 0
106 for line in raw_handle:
107 if not line.strip() or line.startswith("#"):
108 #Ignore any blank lines or comment lines
109 continue
110 parts = [x.strip() for x in line.rstrip("\r\n").split("\t")]
111 if parts == header:
112 #Ignore the header line
113 continue
114 if not parts[-1] and len(parts) == len(header) + 1:
115 #Ignore dummy blank extra column, e.g.
116 #"...2.0\t\tPSORTb version 3.0\t\n"
117 parts = parts[:-1]
118 assert len(parts) == len(header), \
119 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line)
120 out_handle.write(line)
121 count += 1
122 return count
123
124 #Note that if the input FASTA file contains no sequences,
125 #split_fasta returns an empty list (i.e. zero temp files).
126 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
127 temp_files = [f+".out" for f in fasta_files]
128 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp)
129 for fasta, temp in zip(fasta_files, temp_files)]
130
131 def clean_up(file_list):
132 for f in file_list:
133 if os.path.isfile(f):
134 os.remove(f)
135 try:
136 os.rmdir(tmp_dir)
137 except:
138 pass
139
140 if len(jobs) > 1 and num_threads > 1:
141 #A small "info" message for Galaxy to show the user.
142 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
143 results = run_jobs(jobs, num_threads)
144 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
145 error_level = results[cmd]
146 if error_level:
147 try:
148 output = open(temp).readline()
149 except IOError:
150 output = ""
151 clean_up(fasta_files + temp_files)
152 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
153 error_level)
154 del results
155 del jobs
156
157 out_handle = open(tabular_file, "w")
158 out_handle.write("#%s\n" % "\t".join(header))
159 count = 0
160 for temp in temp_files:
161 data_handle = open(temp)
162 count += clean_tabular(data_handle, out_handle)
163 data_handle.close()
164 if not count:
165 clean_up(fasta_files + temp_files)
166 stop_err("No output from psortb")
167 out_handle.close()
168 print "%i records" % count
169
170 clean_up(fasta_files + temp_files)