comparison tools/protein_analysis/wolf_psort.py @ 5:0f1c61998b22

Migrated tool version 0.0.8 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:06:27 -0400
parents
children a290c6d4e658
comparison
equal deleted inserted replaced
4:81caef04ce8b 5:0f1c61998b22
1 #!/usr/bin/env python
2 """Wrapper for WoLF PSORT v0.2 for use in Galaxy.
3
4 This script takes exactly four command line arguments:
5 * the organism type (animal, plant or fungi)
6 * number of threads to use (integer)
7 * an input protein FASTA filename
8 * output tabular filename.
9
10 It then calls the standalone WoLF PSORT v0.2 program runWolfPsortSummary
11 (not the webservice), and converts the output from something like this:
12
13 # k used for kNN is: 27
14 gi|301087619|ref|XP_002894699.1| extr 12, mito 4, E.R. 3, golg 3, mito_nucl 3
15 gi|301087623|ref|XP_002894700.1| extr 21, mito 2, cyto 2, cyto_mito 2
16
17 In order to make it easier to use in Galaxy, this wrapper script reformats
18 this to use tab separators, with one line per compartment prediction:
19
20 #ID Compartment Score Rank
21 gi|301087619|ref|XP_002894699.1| extr 12 1
22 gi|301087619|ref|XP_002894699.1| mito 4 2
23 gi|301087619|ref|XP_002894699.1| E.R. 3 3
24 gi|301087619|ref|XP_002894699.1| golg 3 4
25 gi|301087619|ref|XP_002894699.1| mito_nucl 3 5
26 gi|301087623|ref|XP_002894700.1| extr 21 1
27 gi|301087623|ref|XP_002894700.1| mito 2 2
28 gi|301087623|ref|XP_002894700.1| cyto 2 3
29 gi|301087623|ref|XP_002894700.1| cyto_mito 2 4
30
31 Additionally, in order to take full advantage of multiple cores, the input FASTA
32 file is subdivided and multiple copies of WoLF PSORT are run in parallel. I would
33 normally use Python's multiprocessing library in this situation but it requires
34 at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
35 """
36 import sys
37 import os
38 from seq_analysis_utils import stop_err, split_fasta, run_jobs
39
# Chunk size handed to split_fasta as n= when the input is subdivided
# (presumably the number of sequences per sub-file - confirm against
# seq_analysis_utils).
FASTA_CHUNK = 500
# Name of the WoLF PSORT program invoked for each chunk; it must be
# resolvable by the shell (see the note below for one way to arrange that).
exe = "runWolfPsortSummary"

"""
Note: I had trouble getting runWolfPsortSummary on the path, so used a wrapper
python script called runWolfPsortSummary as follows:

#!/usr/bin/env python
#Wrapper script to call WoLF PSORT from its own directory.
import os
import sys
import subprocess
saved_dir = os.path.abspath(os.curdir)
os.chdir("/opt/WoLFPSORT_package_v0.2/bin")
args = ["./runWolfPsortSummary"] + sys.argv[1:]
return_code = subprocess.call(args)
os.chdir(saved_dir)
sys.exit(return_code)
"""

# Command line: organism, threads, input FASTA filename, output tabular filename.
if len(sys.argv) != 5:
    stop_err("Require four arguments, organism, threads, input protein FASTA file & output tabular file")

organism = sys.argv[1]
if organism not in ["animal", "plant", "fungi"]:
    stop_err("Organism argument %s is not one of animal, plant, fungi" % organism)

# A non-numeric thread count is mapped to zero so that it is reported by
# the same "not a positive integer" check as zero or negative values.
try:
    num_threads = int(sys.argv[2])
except ValueError:
    num_threads = 0
if num_threads < 1:
    # Report the offending threads argument itself (argv[2]), not the
    # FASTA filename held in argv[3].
    stop_err("Threads argument %s is not a positive integer" % sys.argv[2])

fasta_file = sys.argv[3]

tabular_file = sys.argv[4]
77
def clean_tabular(raw_handle, out_handle):
    """Clean up WoLF PSORT output to make it tabular.

    Reads the raw summary output line by line from raw_handle and writes
    one tab separated row per compartment prediction to out_handle, with
    the columns: sequence ID, compartment, score, rank. The rank is the
    1-based position of the prediction within its input line.

    Lines starting with "#" and blank (whitespace only) lines are skipped.
    A data line without any predictions after the ID, or a prediction that
    is not exactly "compartment score", raises ValueError.
    """
    for line in raw_handle:
        # Lines read from a handle keep their line ending, so a blank line
        # is "\n" rather than "" - test the stripped text to skip it.
        if not line.strip() or line.startswith("#"):
            continue
        # First whitespace separated field is the ID, the rest is the
        # comma separated list of "compartment score" predictions.
        name, data = line.rstrip("\r\n").split(None, 1)
        for rank, comp_data in enumerate(data.split(",")):
            comp, score = comp_data.split()
            out_handle.write("%s\t%s\t%s\t%i\n"
                             % (name, comp, score, rank + 1))
88
# Subdivide the input FASTA file, then pair every chunk with its own raw
# output file (chunk name plus ".out") and the shell command which reads
# the chunk on stdin and redirects stdout to that output file.
fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK)
temp_files = []
jobs = []
for chunk_name in fasta_files:
    raw_name = chunk_name + ".out"
    temp_files.append(raw_name)
    jobs.append("%s %s < %s > %s" % (exe, organism, chunk_name, raw_name))
# Sanity check: exactly one output file and one command per chunk.
assert len(fasta_files) == len(temp_files)
assert len(fasta_files) == len(temp_files) == len(jobs)
95
def clean_up(file_list):
    """Delete each entry of file_list that currently exists as a regular file.

    Entries which are missing (or are not regular files, e.g. directories)
    are silently ignored, so the same name may safely appear more than once.
    """
    for path in file_list:
        if not os.path.isfile(path):
            continue
        os.remove(path)
100
if len(jobs) > 1 and num_threads > 1:
    #A small "info" message for Galaxy to show the user.
    #(Single argument in parentheses prints identically under Python 2.)
    print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)))

#Run every command; the result is indexed by command string below to get
#that command's return code.
results = run_jobs(jobs, num_threads)
assert len(fasta_files) == len(temp_files) == len(jobs)
for temp, cmd in zip(temp_files, jobs):
    error_level = results[cmd]
    #Peek at the first line of the raw output, since a failure may only be
    #reported there. The handle is closed explicitly rather than left to
    #the garbage collector.
    try:
        peek_handle = open(temp)
        try:
            output = peek_handle.readline()
        finally:
            peek_handle.close()
    except IOError:
        output = ""
    if error_level or output.lower().startswith("error running"):
        clean_up(fasta_files)
        clean_up(temp_files)
        #When the failure was only spotted from the output text the return
        #code is zero; pass 1 instead so we do not report success
        #(assumes the second stop_err argument is the exit status - confirm
        #against seq_analysis_utils).
        stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
                 error_level or 1)
del results

#Merge the per-chunk raw output into the single tabular file. The nested
#try/finally blocks (Python 2.4 compatible) make sure the handles are closed
#and the temporary files removed even if reformatting fails part way.
try:
    out_handle = open(tabular_file, "w")
    try:
        out_handle.write("#ID\tCompartment\tScore\tRank\n")
        for temp in temp_files:
            data_handle = open(temp)
            try:
                clean_tabular(data_handle, out_handle)
            finally:
                data_handle.close()
    finally:
        out_handle.close()
finally:
    clean_up(fasta_files)
    clean_up(temp_files)