Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/tmhmm2.py @ 19:f3ecd80850e2 draft
v0.2.9 Python style improvements
| author | peterjc | 
|---|---|
| date | Wed, 01 Feb 2017 09:46:42 -0500 | 
| parents | eb6ac44d4b8e | 
| children | a19b3ded8f33 | 
   comparison
  equal
  deleted
  inserted
  replaced
| 18:eb6ac44d4b8e | 19:f3ecd80850e2 | 
|---|---|
| 19 | 19 | 
| 20 In order to make it easier to use in Galaxy, this wrapper script simplifies | 20 In order to make it easier to use in Galaxy, this wrapper script simplifies | 
| 21 this to remove the redundant tags, and instead adds a comment line at the | 21 this to remove the redundant tags, and instead adds a comment line at the | 
| 22 top with the column names: | 22 top with the column names: | 
| 23 | 23 | 
| 24 #ID len ExpAA First60 PredHel Topology | 24 #ID len ExpAA First60 PredHel Topology | 
| 25 gi|2781234|pdb|1JLY|B 304 0.01 60 0.00 0 o | 25 gi|2781234|pdb|1JLY|B 304 0.01 60 0.00 0 o | 
| 26 gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0 0.00 0 o | 26 gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0 0.00 0 o | 
| 27 gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o | 27 gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o | 
| 28 gi|3298468|dbj|BAA31520.1| 107 59.37 31.17 3 o23-45i52-74o89-106i | 28 gi|3298468|dbj|BAA31520.1| 107 59.37 31.17 3 o23-45i52-74o89-106i | 
| 29 | 29 | 
| 41 when there is no output from tmhmm2, and raise an error. | 41 when there is no output from tmhmm2, and raise an error. | 
| 42 """ | 42 """ | 
| 43 import sys | 43 import sys | 
| 44 import os | 44 import os | 
| 45 import tempfile | 45 import tempfile | 
| 46 from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count | 46 from seq_analysis_utils import split_fasta, run_jobs, thread_count | 
| 47 | 47 | 
| 48 FASTA_CHUNK = 500 | 48 FASTA_CHUNK = 500 | 
| 49 | 49 | 
| 50 if len(sys.argv) != 4: | 50 if len(sys.argv) != 4: | 
| 51 sys_exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") | 51 sys.exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") | 
| 52 | 52 | 
| 53 num_threads = thread_count(sys.argv[1], default=4) | 53 num_threads = thread_count(sys.argv[1], default=4) | 
| 54 fasta_file = sys.argv[2] | 54 fasta_file = sys.argv[2] | 
| 55 tabular_file = sys.argv[3] | 55 tabular_file = sys.argv[3] | 
| 56 | 56 | 
| 57 tmp_dir = tempfile.mkdtemp() | 57 tmp_dir = tempfile.mkdtemp() | 
| 58 | 58 | 
| 59 | |
| 59 def clean_tabular(raw_handle, out_handle): | 60 def clean_tabular(raw_handle, out_handle): | 
| 60 """Clean up tabular TMHMM output, returns output line count.""" | 61 """Clean up tabular TMHMM output, returns output line count.""" | 
| 61 count = 0 | 62 count = 0 | 
| 62 for line in raw_handle: | 63 for line in raw_handle: | 
| 63 if not line.strip() or line.startswith("#"): | 64 if not line.strip() or line.startswith("#"): | 
| 64 #Ignore any blank lines or comment lines | 65 # Ignore any blank lines or comment lines | 
| 65 continue | 66 continue | 
| 66 parts = line.rstrip("\r\n").split("\t") | 67 parts = line.rstrip("\r\n").split("\t") | 
| 67 try: | 68 try: | 
| 68 identifier, length, expAA, first60, predhel, topology = parts | 69 identifier, length, exp_aa, first60, predhel, topology = parts | 
| 69 except: | 70 except ValueError: | 
| 70 assert len(parts)!=6 | 71 assert len(parts) != 6 | 
| 71 sys_exit("Bad line: %r" % line) | 72 sys.exit("Bad line: %r" % line) | 
| 72 assert length.startswith("len="), line | 73 assert length.startswith("len="), line | 
| 73 length = length[4:] | 74 length = length[4:] | 
| 74 assert expAA.startswith("ExpAA="), line | 75 assert exp_aa.startswith("ExpAA="), line | 
| 75 expAA = expAA[6:] | 76 exp_aa = exp_aa[6:] | 
| 76 assert first60.startswith("First60="), line | 77 assert first60.startswith("First60="), line | 
| 77 first60 = first60[8:] | 78 first60 = first60[8:] | 
| 78 assert predhel.startswith("PredHel="), line | 79 assert predhel.startswith("PredHel="), line | 
| 79 predhel = predhel[8:] | 80 predhel = predhel[8:] | 
| 80 assert topology.startswith("Topology="), line | 81 assert topology.startswith("Topology="), line | 
| 81 topology = topology[9:] | 82 topology = topology[9:] | 
| 82 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \ | 83 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" | 
| 83 % (identifier, length, expAA, first60, predhel, topology)) | 84 % (identifier, length, exp_aa, first60, predhel, topology)) | 
| 84 count += 1 | 85 count += 1 | 
| 85 return count | 86 return count | 
| 86 | 87 | 
| 87 #Note that if the input FASTA file contains no sequences, | 88 # Note that if the input FASTA file contains no sequences, | 
| 88 #split_fasta returns an empty list (i.e. zero temp files). | 89 # split_fasta returns an empty list (i.e. zero temp files). | 
| 89 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) | 90 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) | 
| 90 temp_files = [f+".out" for f in fasta_files] | 91 temp_files = [f + ".out" for f in fasta_files] | 
| 91 jobs = ["tmhmm -short %s > %s" % (fasta, temp) | 92 jobs = ["tmhmm -short %s > %s" % (fasta, temp) | 
| 92 for fasta, temp in zip(fasta_files, temp_files)] | 93 for fasta, temp in zip(fasta_files, temp_files)] | 
| 93 | 94 | 
| 95 | |
| 94 def clean_up(file_list): | 96 def clean_up(file_list): | 
| 97 """Remove temp files, and if possible the temp directory.""" | |
| 95 for f in file_list: | 98 for f in file_list: | 
| 96 if os.path.isfile(f): | 99 if os.path.isfile(f): | 
| 97 os.remove(f) | 100 os.remove(f) | 
| 98 try: | 101 try: | 
| 99 os.rmdir(tmp_dir) | 102 os.rmdir(tmp_dir) | 
| 100 except: | 103 except Exception: | 
| 101 pass | 104 pass | 
| 102 | 105 | 
| 103 if len(jobs) > 1 and num_threads > 1: | 106 if len(jobs) > 1 and num_threads > 1: | 
| 104 #A small "info" message for Galaxy to show the user. | 107 # A small "info" message for Galaxy to show the user. | 
| 105 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 108 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 
| 106 results = run_jobs(jobs, num_threads) | 109 results = run_jobs(jobs, num_threads) | 
| 107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 110 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 
| 108 error_level = results[cmd] | 111 error_level = results[cmd] | 
| 109 if error_level: | 112 if error_level: | 
| 110 try: | 113 try: | 
| 111 output = open(temp).readline() | 114 output = open(temp).readline() | 
| 112 except IOError: | 115 except IOError: | 
| 113 output = "" | 116 output = "" | 
| 114 clean_up(fasta_files + temp_files) | 117 clean_up(fasta_files + temp_files) | 
| 115 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 118 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 
| 116 error_level) | 119 error_level) | 
| 117 del results | 120 del results | 
| 118 del jobs | 121 del jobs | 
| 119 | 122 | 
| 120 out_handle = open(tabular_file, "w") | 123 out_handle = open(tabular_file, "w") | 
| 123 data_handle = open(temp) | 126 data_handle = open(temp) | 
| 124 count = clean_tabular(data_handle, out_handle) | 127 count = clean_tabular(data_handle, out_handle) | 
| 125 data_handle.close() | 128 data_handle.close() | 
| 126 if not count: | 129 if not count: | 
| 127 clean_up(fasta_files + temp_files) | 130 clean_up(fasta_files + temp_files) | 
| 128 sys_exit("No output from tmhmm2") | 131 sys.exit("No output from tmhmm2") | 
| 129 out_handle.close() | 132 out_handle.close() | 
| 130 | 133 | 
| 131 clean_up(fasta_files + temp_files) | 134 clean_up(fasta_files + temp_files) | 
