tmhmm_and_signalp: tools/protein_analysis/wolf

comparison tools/protein_analysis/wolf_psort.py @ 5:0f1c61998b22

Migrated tool version 0.0.8 from old tool shed archive to new tool shed repository

author	peterjc
date	Tue, 07 Jun 2011 18:06:27 -0400
parents
children	a290c6d4e658

comparison

equal deleted inserted replaced

-:81caef04ce8b
+:0f1c61998b22
+#!/usr/bin/env python
+"""Wrapper for WoLF PSORT v0.2 for use in Galaxy.
+This script takes exactly four command line arguments:
+* the organism type (animal, plant or fungi)
+* number of threads to use (integer)
+* an input protein FASTA filename
+* output tabular filename.
+It then calls the standalone WoLF PSORT v0.2 program runWolfPsortSummary
+(not the web service), and converts the output from something like this:
+# k used for kNN is: 27
+gi|301087619|ref|XP_002894699.1| extr 12, mito 4, E.R. 3, golg 3, mito_nucl 3
+gi|301087623|ref|XP_002894700.1| extr 21, mito 2, cyto 2, cyto_mito 2
+In order to make it easier to use in Galaxy, this wrapper script reformats
+this to use tab separators, with one line per compartment prediction:
+#ID	Compartment	Score	Rank
+gi|301087619|ref|XP_002894699.1|	extr	12	1
+gi|301087619|ref|XP_002894699.1|	mito	4	2
+gi|301087619|ref|XP_002894699.1|	E.R.	3	3
+gi|301087619|ref|XP_002894699.1|	golg	3	4
+gi|301087619|ref|XP_002894699.1|	mito_nucl	3	5
+gi|301087623|ref|XP_002894700.1|	extr	21	1
+gi|301087623|ref|XP_002894700.1|	mito	2	2
+gi|301087623|ref|XP_002894700.1|	cyto	2	3
+gi|301087623|ref|XP_002894700.1|	cyto_mito	2	4
+Additionally, in order to take full advantage of multiple cores, the input FASTA
+file is subdivided and multiple copies of WoLF PSORT are run in parallel. I would
+normally use Python's multiprocessing library in this situation but it requires
+at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
+"""
+import sys
+import os
+from seq_analysis_utils import stop_err, split_fasta, run_jobs
# Chunk size handed to split_fasta as its n argument
# (presumably the number of sequences per chunk file -- confirm against
# seq_analysis_utils.split_fasta).
FASTA_CHUNK = 500

# Name of the WoLF PSORT summary program; it is interpolated into the
# command strings as-is, so it must be resolvable by whatever runs them.
exe = "runWolfPsortSummary"

# Note: I had trouble getting runWolfPsortSummary on the path, so used a wrapper
# python script called runWolfPsortSummary as follows:
#
#     #!/usr/bin/env python
#     #Wrapper script to call WoLF PSORT from its own directory.
#     import os
#     import sys
#     import subprocess
#     saved_dir = os.path.abspath(os.curdir)
#     os.chdir("/opt/WoLFPSORT_package_v0.2/bin")
#     args = ["./runWolfPsortSummary"] + sys.argv[1:]
#     return_code = subprocess.call(args)
#     os.chdir(saved_dir)
#     sys.exit(return_code)
# Command line: <script> organism threads input.fasta output.tabular
# stop_err comes from seq_analysis_utils; it is assumed to report the message
# and terminate the script -- confirm against that module.
if len(sys.argv) != 5:
    stop_err("Require four arguments, organism, threads, input protein FASTA file & output tabular file")

organism = sys.argv[1]
if organism not in ["animal", "plant", "fungi"]:
    stop_err("Organism argument %s is not one of animal, plant, fungi" % organism)

try:
    num_threads = int(sys.argv[2])
except ValueError:
    # Not an integer at all: use 0 so the range check below rejects it.
    num_threads = 0
if num_threads < 1:
    # Report the threads argument itself (index 2), not the FASTA filename.
    stop_err("Threads argument %s is not a positive integer" % sys.argv[2])

fasta_file = sys.argv[3]
tabular_file = sys.argv[4]
def clean_tabular(raw_handle, out_handle):
    """Clean up WoLF PSORT output to make it tabular.

    Reads lines of the form ``name comp score, comp score, ...`` from
    raw_handle and writes one tab separated row per compartment prediction
    to out_handle: name, compartment, score and 1-based rank (the position
    of the prediction within its line).

    Blank lines and lines starting with ``#`` are skipped. No header row is
    written here. A line holding a name but no predictions, or a prediction
    that is not exactly two whitespace separated fields, raises ValueError.
    """
    for line in raw_handle:
        # Lines read from a file keep their newline, so a blank line is
        # "\n" rather than ""; strip before testing so that it is skipped
        # instead of failing in the unpacking below.
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        name, data = line.split(None, 1)
        for rank, comp_data in enumerate(data.split(",")):
            comp, score = comp_data.split()
            out_handle.write("%s\t%s\t%s\t%i\n"
                             % (name, comp, score, rank + 1))
# Split the input FASTA file into chunk files. The output filename is passed
# as the second argument (presumably used as the base name for the chunk
# files -- confirm against seq_analysis_utils.split_fasta).
fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK)

# Each chunk gets a matching raw output file next to it.
temp_files = []
for chunk_name in fasta_files:
    temp_files.append(chunk_name + ".out")
assert len(fasta_files) == len(temp_files)

# One command line per chunk, using redirection for input and output.
# NOTE(review): the filenames are interpolated unquoted, so paths containing
# spaces or shell metacharacters would break the command -- confirm how
# run_jobs executes these strings before relying on arbitrary paths.
jobs = []
for fasta, temp in zip(fasta_files, temp_files):
    jobs.append("%s %s < %s > %s" % (exe, organism, fasta, temp))
assert len(fasta_files) == len(temp_files) == len(jobs)
def clean_up(file_list):
    """Delete each path in file_list that currently is a regular file.

    Paths that do not exist, or that are not regular files (for example
    directories), are silently left alone. Each path is checked immediately
    before it is removed, so a path listed twice is only removed once.
    """
    for path in file_list:
        if not os.path.isfile(path):
            continue
        os.remove(path)
if len(jobs) > 1 and num_threads > 1:
    # A small "info" message for Galaxy to show the user.
    # Written as one parenthesised expression so that the line prints the
    # same text under both Python 2 and Python 3.
    print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)))

# Run every command; the result is looked up by command string afterwards
# (presumably it maps each command to its return code -- confirm against
# seq_analysis_utils.run_jobs).
results = run_jobs(jobs, num_threads)
assert len(fasta_files) == len(temp_files) == len(jobs)

# Check every task: a non-zero error level, or a raw output file whose first
# line starts with "error running", counts as a failure.
for temp, cmd in zip(temp_files, jobs):
    error_level = results[cmd]
    try:
        handle = open(temp)
        try:
            # Only the first line is needed to spot an error message.
            output = handle.readline()
        finally:
            # Close explicitly rather than relying on garbage collection.
            handle.close()
    except IOError:
        # Missing or unreadable output is treated as empty output.
        output = ""
    if error_level or output.lower().startswith("error running"):
        # Remove all intermediate files before reporting the failure.
        clean_up(fasta_files)
        clean_up(temp_files)
        stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
                 error_level)
del results
# Merge the per-chunk raw outputs into the final tabular file. The nested
# try/finally blocks (rather than "with", to stay compatible with the old
# Python versions this script targets) make sure that the handles are closed
# and the intermediate files are removed even if a chunk fails to parse.
try:
    out_handle = open(tabular_file, "w")
    try:
        out_handle.write("#ID\tCompartment\tScore\tRank\n")
        for temp in temp_files:
            data_handle = open(temp)
            try:
                clean_tabular(data_handle, out_handle)
            finally:
                data_handle.close()
    finally:
        out_handle.close()
finally:
    clean_up(fasta_files)
    clean_up(temp_files)

Mercurial > repos > peterjc > tmhmm_and_signalp

comparison tools/protein_analysis/wolf_psort.py @ 5:0f1c61998b22