Mercurial > repos > peterjc > tmhmm_and_signalp
view tools/protein_analysis/wolf_psort.py @ 7:9b45a8743100 draft
Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.
author | peterjc |
---|---|
date | Mon, 30 Jul 2012 10:25:07 -0400 |
parents | a290c6d4e658 |
children | e52220a9ddad |
line wrap: on
line source
#!/usr/bin/env python """Wrapper for WoLF PSORT v0.2 for use in Galaxy. This script takes exactly four command line arguments: * the organism type (animal, plant or fungi) * number of threads to use (integer) * an input protein FASTA filename * output tabular filename. It then calls the standalone WoLF PSORT v0.2 program runWolfPsortSummary (not the webservice), and coverts the output from something like this: # k used for kNN is: 27 gi|301087619|ref|XP_002894699.1| extr 12, mito 4, E.R. 3, golg 3, mito_nucl 3 gi|301087623|ref|XP_002894700.1| extr 21, mito 2, cyto 2, cyto_mito 2 In order to make it easier to use in Galaxy, this wrapper script reformats this to use tab separators, with one line per compartment prediction: #ID Compartment Score Rank gi|301087619|ref|XP_002894699.1| extr 12 1 gi|301087619|ref|XP_002894699.1| mito 4 2 gi|301087619|ref|XP_002894699.1| E.R. 3 3 gi|301087619|ref|XP_002894699.1| golg 3 4 gi|301087619|ref|XP_002894699.1| mito_nucl 3 5 gi|301087623|ref|XP_002894700.1| extr 21 1 gi|301087623|ref|XP_002894700.1| mito 2 2 gi|301087623|ref|XP_002894700.1| cyto 2 3 gi|301087623|ref|XP_002894700.1| cyto_mito 2 4 Additionally in order to take full advantage of multiple cores, by subdividing the input FASTA file multiple copies of WoLF PSORT are run in parallel. I would normally use Python's multiprocessing library in this situation but it requires at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. """ import sys import os from seq_analysis_utils import stop_err, split_fasta, run_jobs FASTA_CHUNK = 500 exe = "runWolfPsortSummary" """ Note: I had trouble getting runWolfPsortSummary on the path (via a link, other than by including all of /opt/WoLFPSORT_package_v0.2/bin , so used a wrapper python script called runWolfPsortSummary as follows: #!/usr/bin/env python #Wrapper script to call WoLF PSORT from its own directory. import os import sys import subprocess saved_dir = os.path.abspath(os.curdir) os.chdir("/opt/WoLFPSORT_package_v0.2/bin") args = ["./runWolfPsortSummary"] + sys.argv[1:] return_code = subprocess.call(args) os.chdir(saved_dir) sys.exit(return_code) """ if len(sys.argv) != 5: stop_err("Require four arguments, organism, threads, input protein FASTA file & output tabular file") organism = sys.argv[1] if organism not in ["animal", "plant", "fungi"]: stop_err("Organism argument %s is not one of animal, plant, fungi" % organism) try: num_threads = int(sys.argv[2]) except: num_threads = 0 if num_threads < 1: stop_err("Threads argument %s is not a positive integer" % sys.argv[2]) fasta_file = sys.argv[3] tabular_file = sys.argv[4] def clean_tabular(raw_handle, out_handle): """Clean up WoLF PSORT output to make it tabular.""" for line in raw_handle: if not line or line.startswith("#"): continue name, data = line.rstrip("\r\n").split(None,1) for rank, comp_data in enumerate(data.split(",")): comp, score = comp_data.split() out_handle.write("%s\t%s\t%s\t%i\n" \ % (name, comp, score, rank+1)) fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK) temp_files = [f+".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) jobs = ["%s %s < %s > %s" % (exe, organism, fasta, temp) for (fasta, temp) in zip(fasta_files, temp_files)] assert len(fasta_files) == len(temp_files) == len(jobs) def clean_up(file_list): for f in file_list: if os.path.isfile(f): os.remove(f) if len(jobs) > 1 and num_threads > 1: #A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) assert len(fasta_files) == len(temp_files) == len(jobs) for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): error_level = results[cmd] try: output = open(temp).readline() except IOError: output = "" if error_level or output.lower().startswith("error running"): clean_up(fasta_files) clean_up(temp_files) stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results out_handle = open(tabular_file, "w") out_handle.write("#ID\tCompartment\tScore\tRank\n") for temp in temp_files: data_handle = open(temp) clean_tabular(data_handle, out_handle) data_handle.close() out_handle.close() clean_up(fasta_files) clean_up(temp_files)