comparison tools/protein_analysis/tmhmm2.py @ 19:f3ecd80850e2 draft

v0.2.9 Python style improvements
author peterjc
date Wed, 01 Feb 2017 09:46:42 -0500
parents eb6ac44d4b8e
children a19b3ded8f33
comparison
equal deleted inserted replaced
18:eb6ac44d4b8e 19:f3ecd80850e2
19 19
20 In order to make it easier to use in Galaxy, this wrapper script simplifies 20 In order to make it easier to use in Galaxy, this wrapper script simplifies
21 this to remove the redundant tags, and instead adds a comment line at the 21 this to remove the redundant tags, and instead adds a comment line at the
22 top with the column names: 22 top with the column names:
23 23
24 #ID len ExpAA First60 PredHel Topology 24 #ID len ExpAA First60 PredHel Topology
25 gi|2781234|pdb|1JLY|B 304 0.01 60 0.00 0 o 25 gi|2781234|pdb|1JLY|B 304 0.01 60 0.00 0 o
26 gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0 0.00 0 o 26 gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0 0.00 0 o
27 gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o 27 gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o
28 gi|3298468|dbj|BAA31520.1| 107 59.37 31.17 3 o23-45i52-74o89-106i 28 gi|3298468|dbj|BAA31520.1| 107 59.37 31.17 3 o23-45i52-74o89-106i
29 29
41 when there is no output from tmhmm2, and raise an error. 41 when there is no output from tmhmm2, and raise an error.
42 """ 42 """
43 import sys 43 import sys
44 import os 44 import os
45 import tempfile 45 import tempfile
46 from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count 46 from seq_analysis_utils import split_fasta, run_jobs, thread_count
47 47
48 FASTA_CHUNK = 500 48 FASTA_CHUNK = 500
49 49
50 if len(sys.argv) != 4: 50 if len(sys.argv) != 4:
51 sys_exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") 51 sys.exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file")
52 52
53 num_threads = thread_count(sys.argv[1], default=4) 53 num_threads = thread_count(sys.argv[1], default=4)
54 fasta_file = sys.argv[2] 54 fasta_file = sys.argv[2]
55 tabular_file = sys.argv[3] 55 tabular_file = sys.argv[3]
56 56
57 tmp_dir = tempfile.mkdtemp() 57 tmp_dir = tempfile.mkdtemp()
58 58
59
59 def clean_tabular(raw_handle, out_handle): 60 def clean_tabular(raw_handle, out_handle):
60 """Clean up tabular TMHMM output, returns output line count.""" 61 """Clean up tabular TMHMM output, returns output line count."""
61 count = 0 62 count = 0
62 for line in raw_handle: 63 for line in raw_handle:
63 if not line.strip() or line.startswith("#"): 64 if not line.strip() or line.startswith("#"):
64 #Ignore any blank lines or comment lines 65 # Ignore any blank lines or comment lines
65 continue 66 continue
66 parts = line.rstrip("\r\n").split("\t") 67 parts = line.rstrip("\r\n").split("\t")
67 try: 68 try:
68 identifier, length, expAA, first60, predhel, topology = parts 69 identifier, length, exp_aa, first60, predhel, topology = parts
69 except: 70 except ValueError:
70 assert len(parts)!=6 71 assert len(parts) != 6
71 sys_exit("Bad line: %r" % line) 72 sys.exit("Bad line: %r" % line)
72 assert length.startswith("len="), line 73 assert length.startswith("len="), line
73 length = length[4:] 74 length = length[4:]
74 assert expAA.startswith("ExpAA="), line 75 assert exp_aa.startswith("ExpAA="), line
75 expAA = expAA[6:] 76 exp_aa = exp_aa[6:]
76 assert first60.startswith("First60="), line 77 assert first60.startswith("First60="), line
77 first60 = first60[8:] 78 first60 = first60[8:]
78 assert predhel.startswith("PredHel="), line 79 assert predhel.startswith("PredHel="), line
79 predhel = predhel[8:] 80 predhel = predhel[8:]
80 assert topology.startswith("Topology="), line 81 assert topology.startswith("Topology="), line
81 topology = topology[9:] 82 topology = topology[9:]
82 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \ 83 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n"
83 % (identifier, length, expAA, first60, predhel, topology)) 84 % (identifier, length, exp_aa, first60, predhel, topology))
84 count += 1 85 count += 1
85 return count 86 return count
86 87
87 #Note that if the input FASTA file contains no sequences, 88 # Note that if the input FASTA file contains no sequences,
88 #split_fasta returns an empty list (i.e. zero temp files). 89 # split_fasta returns an empty list (i.e. zero temp files).
89 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) 90 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
90 temp_files = [f+".out" for f in fasta_files] 91 temp_files = [f + ".out" for f in fasta_files]
91 jobs = ["tmhmm -short %s > %s" % (fasta, temp) 92 jobs = ["tmhmm -short %s > %s" % (fasta, temp)
92 for fasta, temp in zip(fasta_files, temp_files)] 93 for fasta, temp in zip(fasta_files, temp_files)]
93 94
95
94 def clean_up(file_list): 96 def clean_up(file_list):
97 """Remove temp files, and if possible the temp directory."""
95 for f in file_list: 98 for f in file_list:
96 if os.path.isfile(f): 99 if os.path.isfile(f):
97 os.remove(f) 100 os.remove(f)
98 try: 101 try:
99 os.rmdir(tmp_dir) 102 os.rmdir(tmp_dir)
100 except: 103 except Exception:
101 pass 104 pass
102 105
103 if len(jobs) > 1 and num_threads > 1: 106 if len(jobs) > 1 and num_threads > 1:
104 #A small "info" message for Galaxy to show the user. 107 # A small "info" message for Galaxy to show the user.
105 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 108 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
106 results = run_jobs(jobs, num_threads) 109 results = run_jobs(jobs, num_threads)
107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): 110 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
108 error_level = results[cmd] 111 error_level = results[cmd]
109 if error_level: 112 if error_level:
110 try: 113 try:
111 output = open(temp).readline() 114 output = open(temp).readline()
112 except IOError: 115 except IOError:
113 output = "" 116 output = ""
114 clean_up(fasta_files + temp_files) 117 clean_up(fasta_files + temp_files)
115 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 118 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
116 error_level) 119 error_level)
117 del results 120 del results
118 del jobs 121 del jobs
119 122
120 out_handle = open(tabular_file, "w") 123 out_handle = open(tabular_file, "w")
123 data_handle = open(temp) 126 data_handle = open(temp)
124 count = clean_tabular(data_handle, out_handle) 127 count = clean_tabular(data_handle, out_handle)
125 data_handle.close() 128 data_handle.close()
126 if not count: 129 if not count:
127 clean_up(fasta_files + temp_files) 130 clean_up(fasta_files + temp_files)
128 sys_exit("No output from tmhmm2") 131 sys.exit("No output from tmhmm2")
129 out_handle.close() 132 out_handle.close()
130 133
131 clean_up(fasta_files + temp_files) 134 clean_up(fasta_files + temp_files)