comparison tools/protein_analysis/tmhmm2.py @ 1:3ff1dcbb9440

Migrated tool version 0.0.3 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:04:05 -0400
parents bca9bc7fdaef
children 6901298ac16c
comparison
equal deleted inserted replaced
0:bca9bc7fdaef 1:3ff1dcbb9440
27 The second major potential feature is taking advantage of multiple cores 27 The second major potential feature is taking advantage of multiple cores
28 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file 28 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file
29 into chunks and running multiple copies of TMHMM in parallel. I would normally 29 into chunks and running multiple copies of TMHMM in parallel. I would normally
30 use Python's multiprocessing library in this situation but it requires at 30 use Python's multiprocessing library in this situation but it requires at
31 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. 31 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
32
33 Also tmhmm2 can fail without returning an error code, for example if run on a
34 64-bit machine with only the 32-bit binaries installed. This script will spot
35 when there is no output from tmhmm2 and raise an error.
32 """ 36 """
33 import sys 37 import sys
34 import os 38 import os
35 from seq_analysis_utils import stop_err, split_fasta, run_jobs 39 from seq_analysis_utils import stop_err, split_fasta, run_jobs
36 40
46 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) 50 stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
47 fasta_file = sys.argv[2] 51 fasta_file = sys.argv[2]
48 tabular_file = sys.argv[3] 52 tabular_file = sys.argv[3]
49 53
50 def clean_tabular(raw_handle, out_handle): 54 def clean_tabular(raw_handle, out_handle):
51 """Clean up tabular TMHMM output.""" 55 """Clean up tabular TMHMM output, returns output line count."""
56 count = 0
52 for line in raw_handle: 57 for line in raw_handle:
53 if not line: 58 if not line:
54 continue 59 continue
55 parts = line.rstrip("\r\n").split("\t") 60 parts = line.rstrip("\r\n").split("\t")
56 try: 61 try:
66 first60 = first60[8:] 71 first60 = first60[8:]
67 assert predhel.startswith("PredHel="), line 72 assert predhel.startswith("PredHel="), line
68 predhel = predhel[8:] 73 predhel = predhel[8:]
69 assert topology.startswith("Topology="), line 74 assert topology.startswith("Topology="), line
70 topology = topology[9:] 75 topology = topology[9:]
71 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \ 76 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \
72 % (identifier, length, expAA, first60, predhel, topology)) 77 % (identifier, length, expAA, first60, predhel, topology))
78 count += 1
79 return count
73 80
81 #Note that if the input FASTA file contains no sequences,
82 #split_fasta returns an empty list (i.e. zero temp files).
74 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) 83 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK)
75 temp_files = [f+".out" for f in fasta_files] 84 temp_files = [f+".out" for f in fasta_files]
76 jobs = ["tmhmm %s > %s" % (fasta, temp) 85 jobs = ["tmhmm %s > %s" % (fasta, temp)
77 for fasta, temp in zip(fasta_files, temp_files)] 86 for fasta, temp in zip(fasta_files, temp_files)]
78 87
101 110
102 out_handle = open(tabular_file, "w") 111 out_handle = open(tabular_file, "w")
103 out_handle.write("#ID\tlen\tExpAA\tFirst60\tPredHel\tTopology\n") 112 out_handle.write("#ID\tlen\tExpAA\tFirst60\tPredHel\tTopology\n")
104 for temp in temp_files: 113 for temp in temp_files:
105 data_handle = open(temp) 114 data_handle = open(temp)
106 clean_tabular(data_handle, out_handle) 115 count = clean_tabular(data_handle, out_handle)
107 data_handle.close() 116 data_handle.close()
117 if not count:
118 clean_up(fasta_files)
119 clean_up(temp_files)
120 stop_err("No output from tmhmm2")
108 out_handle.close() 121 out_handle.close()
109 122
110 clean_up(fasta_files) 123 clean_up(fasta_files)
111 clean_up(temp_files) 124 clean_up(temp_files)