Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/tmhmm2.py @ 7:9b45a8743100 draft
Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.
author | peterjc |
---|---|
date | Mon, 30 Jul 2012 10:25:07 -0400 |
parents | 6901298ac16c |
children | e52220a9ddad |
comparison
equal
deleted
inserted
replaced
6:a290c6d4e658 | 7:9b45a8743100 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """Wrapper for TMHMM v2.0 for use in Galaxy. | 2 """Wrapper for TMHMM v2.0 for use in Galaxy. |
3 | 3 |
4 This script takes exactly two command line arguments - an input protein FASTA | 4 This script takes exactly three command line arguments - number of threads, |
5 filename and an output tabular filename. It then calls the standalone TMHMM | 5 an input protein FASTA filename, and an output tabular filename. It then |
6 v2.0 program (not the webservice) requesting the short output (one line per | 6 calls the standalone TMHMM v2.0 program (not the webservice) requesting |
7 protein). | 7 the short output (one line per protein). |
8 | 8 |
9 The first major feature is cleaning up the tabular output. The short form raw | 9 The first major feature is cleaning up the tabular output. The short form raw |
10 output from TMHMM v2.0 looks like this (six columns tab separated): | 10 output from TMHMM v2.0 looks like this (six columns tab separated): |
11 | 11 |
12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o | 12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o |
31 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file | 31 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file |
32 into chunks and running multiple copies of TMHMM in parallel. I would normally | 32 into chunks and running multiple copies of TMHMM in parallel. I would normally |
33 use Python's multiprocessing library in this situation but it requires at | 33 use Python's multiprocessing library in this situation but it requires at |
34 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. | 34 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. |
35 | 35 |
36 Note that this is somewhat redundant with job-splitting available in Galaxy | |
37 itself (see the SignalP XML file for settings). | |
38 | |
36 Also tmhmm2 can fail without returning an error code, for example if run on a | 39 Also tmhmm2 can fail without returning an error code, for example if run on a |
37 64 bit machine with only the 32 bit binaries installed. This script will spot | 40 64 bit machine with only the 32 bit binaries installed. This script will spot |
38 when there is no output from tmhmm2, and raise an error. | 41 when there is no output from tmhmm2, and raise an error. |
39 """ | 42 """ |
40 import sys | 43 import sys |
41 import os | 44 import os |
45 import tempfile | |
42 from seq_analysis_utils import stop_err, split_fasta, run_jobs | 46 from seq_analysis_utils import stop_err, split_fasta, run_jobs |
43 | 47 |
44 FASTA_CHUNK = 500 | 48 FASTA_CHUNK = 500 |
45 | 49 |
46 if len(sys.argv) != 4: | 50 if len(sys.argv) != 4: |
47 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") | 51 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") |
48 try: | 52 try: |
49 num_threads = int(sys.argv[1]) | 53 num_threads = int(sys.argv[1]) |
50 except: | 54 except: |
51 num_threads = 0 | 55 num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined |
52 if num_threads < 1: | 56 if num_threads < 1: |
53 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) | 57 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) |
54 fasta_file = sys.argv[2] | 58 fasta_file = sys.argv[2] |
55 tabular_file = sys.argv[3] | 59 tabular_file = sys.argv[3] |
60 | |
61 tmp_dir = tempfile.mkdtemp() | |
56 | 62 |
57 def clean_tabular(raw_handle, out_handle): | 63 def clean_tabular(raw_handle, out_handle): |
58 """Clean up tabular TMHMM output, returns output line count.""" | 64 """Clean up tabular TMHMM output, returns output line count.""" |
59 count = 0 | 65 count = 0 |
60 for line in raw_handle: | 66 for line in raw_handle: |
82 count += 1 | 88 count += 1 |
83 return count | 89 return count |
84 | 90 |
85 #Note that if the input FASTA file contains no sequences, | 91 #Note that if the input FASTA file contains no sequences, |
86 #split_fasta returns an empty list (i.e. zero temp files). | 92 #split_fasta returns an empty list (i.e. zero temp files). |
87 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) | 93 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) |
88 temp_files = [f+".out" for f in fasta_files] | 94 temp_files = [f+".out" for f in fasta_files] |
89 jobs = ["tmhmm -short %s > %s" % (fasta, temp) | 95 jobs = ["tmhmm -short %s > %s" % (fasta, temp) |
90 for fasta, temp in zip(fasta_files, temp_files)] | 96 for fasta, temp in zip(fasta_files, temp_files)] |
91 | 97 |
92 def clean_up(file_list): | 98 def clean_up(file_list): |
93 for f in file_list: | 99 for f in file_list: |
94 if os.path.isfile(f): | 100 if os.path.isfile(f): |
95 os.remove(f) | 101 os.remove(f) |
102 try: | |
103 os.rmdir(tmp_dir) | |
104 except: | |
105 pass | |
96 | 106 |
97 if len(jobs) > 1 and num_threads > 1: | 107 if len(jobs) > 1 and num_threads > 1: |
98 #A small "info" message for Galaxy to show the user. | 108 #A small "info" message for Galaxy to show the user. |
99 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 109 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) |
100 results = run_jobs(jobs, num_threads) | 110 results = run_jobs(jobs, num_threads) |
103 if error_level: | 113 if error_level: |
104 try: | 114 try: |
105 output = open(temp).readline() | 115 output = open(temp).readline() |
106 except IOError: | 116 except IOError: |
107 output = "" | 117 output = "" |
108 clean_up(fasta_files) | 118 clean_up(fasta_files + temp_files) |
109 clean_up(temp_files) | |
110 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 119 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
111 error_level) | 120 error_level) |
112 del results | 121 del results |
113 del jobs | 122 del jobs |
114 | 123 |
117 for temp in temp_files: | 126 for temp in temp_files: |
118 data_handle = open(temp) | 127 data_handle = open(temp) |
119 count = clean_tabular(data_handle, out_handle) | 128 count = clean_tabular(data_handle, out_handle) |
120 data_handle.close() | 129 data_handle.close() |
121 if not count: | 130 if not count: |
122 clean_up(fasta_files) | 131 clean_up(fasta_files + temp_files) |
123 clean_up(temp_files) | |
124 stop_err("No output from tmhmm2") | 132 stop_err("No output from tmhmm2") |
125 out_handle.close() | 133 out_handle.close() |
126 | 134 |
127 clean_up(fasta_files) | 135 clean_up(fasta_files + temp_files) |
128 clean_up(temp_files) |