Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/tmhmm2.py @ 7:9b45a8743100 draft
Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.
| author | peterjc |
|---|---|
| date | Mon, 30 Jul 2012 10:25:07 -0400 |
| parents | 6901298ac16c |
| children | e52220a9ddad |
comparison
equal
deleted
inserted
replaced
| 6:a290c6d4e658 | 7:9b45a8743100 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 """Wrapper for TMHMM v2.0 for use in Galaxy. | 2 """Wrapper for TMHMM v2.0 for use in Galaxy. |
| 3 | 3 |
| 4 This script takes exactly two command line arguments - an input protein FASTA | 4 This script takes exactly three command line arguments - number of threads, |
| 5 filename and an output tabular filename. It then calls the standalone TMHMM | 5 an input protein FASTA filename, and an output tabular filename. It then |
| 6 v2.0 program (not the webservice) requesting the short output (one line per | 6 calls the standalone TMHMM v2.0 program (not the webservice) requesting |
| 7 protein). | 7 the short output (one line per protein). |
| 8 | 8 |
| 9 The first major feature is cleaning up the tabular output. The short form raw | 9 The first major feature is cleaning up the tabular output. The short form raw |
| 10 output from TMHMM v2.0 looks like this (six columns tab separated): | 10 output from TMHMM v2.0 looks like this (six columns tab separated): |
| 11 | 11 |
| 12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o | 12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o |
| 31 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file | 31 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file |
| 32 into chunks and running multiple copies of TMHMM in parallel. I would normally | 32 into chunks and running multiple copies of TMHMM in parallel. I would normally |
| 33 use Python's multiprocessing library in this situation but it requires at | 33 use Python's multiprocessing library in this situation but it requires at |
| 34 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. | 34 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. |
| 35 | 35 |
| 36 Note that this is somewhat redundant with job-splitting available in Galaxy | |
| 37 itself (see the SignalP XML file for settings). | |
| 38 | |
| 36 Also tmhmm2 can fail without returning an error code, for example if run on a | 39 Also tmhmm2 can fail without returning an error code, for example if run on a |
| 37 64 bit machine with only the 32 bit binaries installed. This script will spot | 40 64 bit machine with only the 32 bit binaries installed. This script will spot |
| 38 when there is no output from tmhmm2, and raise an error. | 41 when there is no output from tmhmm2, and raise an error. |
| 39 """ | 42 """ |
| 40 import sys | 43 import sys |
| 41 import os | 44 import os |
| 45 import tempfile | |
| 42 from seq_analysis_utils import stop_err, split_fasta, run_jobs | 46 from seq_analysis_utils import stop_err, split_fasta, run_jobs |
| 43 | 47 |
| 44 FASTA_CHUNK = 500 | 48 FASTA_CHUNK = 500 |
| 45 | 49 |
| 46 if len(sys.argv) != 4: | 50 if len(sys.argv) != 4: |
| 47 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") | 51 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") |
| 48 try: | 52 try: |
| 49 num_threads = int(sys.argv[1]) | 53 num_threads = int(sys.argv[1]) |
| 50 except: | 54 except: |
| 51 num_threads = 0 | 55 num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined |
| 52 if num_threads < 1: | 56 if num_threads < 1: |
| 53 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) | 57 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) |
| 54 fasta_file = sys.argv[2] | 58 fasta_file = sys.argv[2] |
| 55 tabular_file = sys.argv[3] | 59 tabular_file = sys.argv[3] |
| 60 | |
| 61 tmp_dir = tempfile.mkdtemp() | |
| 56 | 62 |
| 57 def clean_tabular(raw_handle, out_handle): | 63 def clean_tabular(raw_handle, out_handle): |
| 58 """Clean up tabular TMHMM output, returns output line count.""" | 64 """Clean up tabular TMHMM output, returns output line count.""" |
| 59 count = 0 | 65 count = 0 |
| 60 for line in raw_handle: | 66 for line in raw_handle: |
| 82 count += 1 | 88 count += 1 |
| 83 return count | 89 return count |
| 84 | 90 |
| 85 #Note that if the input FASTA file contains no sequences, | 91 #Note that if the input FASTA file contains no sequences, |
| 86 #split_fasta returns an empty list (i.e. zero temp files). | 92 #split_fasta returns an empty list (i.e. zero temp files). |
| 87 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) | 93 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) |
| 88 temp_files = [f+".out" for f in fasta_files] | 94 temp_files = [f+".out" for f in fasta_files] |
| 89 jobs = ["tmhmm -short %s > %s" % (fasta, temp) | 95 jobs = ["tmhmm -short %s > %s" % (fasta, temp) |
| 90 for fasta, temp in zip(fasta_files, temp_files)] | 96 for fasta, temp in zip(fasta_files, temp_files)] |
| 91 | 97 |
| 92 def clean_up(file_list): | 98 def clean_up(file_list): |
| 93 for f in file_list: | 99 for f in file_list: |
| 94 if os.path.isfile(f): | 100 if os.path.isfile(f): |
| 95 os.remove(f) | 101 os.remove(f) |
| 102 try: | |
| 103 os.rmdir(tmp_dir) | |
| 104 except: | |
| 105 pass | |
| 96 | 106 |
| 97 if len(jobs) > 1 and num_threads > 1: | 107 if len(jobs) > 1 and num_threads > 1: |
| 98 #A small "info" message for Galaxy to show the user. | 108 #A small "info" message for Galaxy to show the user. |
| 99 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 109 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) |
| 100 results = run_jobs(jobs, num_threads) | 110 results = run_jobs(jobs, num_threads) |
| 103 if error_level: | 113 if error_level: |
| 104 try: | 114 try: |
| 105 output = open(temp).readline() | 115 output = open(temp).readline() |
| 106 except IOError: | 116 except IOError: |
| 107 output = "" | 117 output = "" |
| 108 clean_up(fasta_files) | 118 clean_up(fasta_files + temp_files) |
| 109 clean_up(temp_files) | |
| 110 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 119 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
| 111 error_level) | 120 error_level) |
| 112 del results | 121 del results |
| 113 del jobs | 122 del jobs |
| 114 | 123 |
| 117 for temp in temp_files: | 126 for temp in temp_files: |
| 118 data_handle = open(temp) | 127 data_handle = open(temp) |
| 119 count = clean_tabular(data_handle, out_handle) | 128 count = clean_tabular(data_handle, out_handle) |
| 120 data_handle.close() | 129 data_handle.close() |
| 121 if not count: | 130 if not count: |
| 122 clean_up(fasta_files) | 131 clean_up(fasta_files + temp_files) |
| 123 clean_up(temp_files) | |
| 124 stop_err("No output from tmhmm2") | 132 stop_err("No output from tmhmm2") |
| 125 out_handle.close() | 133 out_handle.close() |
| 126 | 134 |
| 127 clean_up(fasta_files) | 135 clean_up(fasta_files + temp_files) |
| 128 clean_up(temp_files) |
