comparison tools/blast_rbh/blast_rbh.py @ 4:d8d9a9069586 draft

v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
author peterjc
date Wed, 19 Apr 2017 07:44:47 -0400
parents 14b2e159b310
children 8f4500f6f2aa
comparison
equal deleted inserted replaced
3:9ba8ebb636f4 4:d8d9a9069586
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. 2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
3 3
4 Run "blast_rbh.py -h" to see the help text, or read the associated 4 Run "blast_rbh.py -h" to see the help text, or read the associated
5 README.rst file which is also available on GitHub at: 5 blast_rbh.xml and README.rst files which are available on GitHub at:
6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh 6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
7 7
8 This requires Python and the NCBI BLAST+ tools to be installed 8 This requires Python and the NCBI BLAST+ tools to be installed
9 and on the $PATH. 9 and on the $PATH.
10 10
16 # TODO - Output more columns, e.g. pident, qcovs, descriptions? 16 # TODO - Output more columns, e.g. pident, qcovs, descriptions?
17 # TODO - Use new -qcov_hsp_perc option in BLAST+ 2.2.30 to filter 17 # TODO - Use new -qcov_hsp_perc option in BLAST+ 2.2.30 to filter
18 # results, rather than doing minimum HSP coverage in Python. 18 # results, rather than doing minimum HSP coverage in Python.
19 # [Not doing this right now as would break on older BLAST+] 19 # [Not doing this right now as would break on older BLAST+]
20 20
21 from __future__ import print_function
22
21 import os 23 import os
24 import shutil
22 import sys 25 import sys
23 import tempfile 26 import tempfile
24 import shutil 27
25 from optparse import OptionParser 28 from optparse import OptionParser
26 29
27 def stop_err( msg ):
28 sys.stderr.write("%s\n" % msg)
29 sys.exit(1)
30 30
31 def run(cmd): 31 def run(cmd):
32 return_code = os.system(cmd) 32 return_code = os.system(cmd)
33 if return_code: 33 if return_code:
34 stop_err("Error %i from: %s" % (return_code, cmd)) 34 sys.exit("Error %i from: %s" % (return_code, cmd))
35
35 36
36 if "--version" in sys.argv[1:]: 37 if "--version" in sys.argv[1:]:
37 #TODO - Capture version of BLAST+ binaries too? 38 # TODO - Capture version of BLAST+ binaries too?
38 print "BLAST RBH v0.1.6" 39 print("BLAST RBH v0.1.11")
39 sys.exit(0) 40 sys.exit(0)
40 41
41 #Parse Command Line 42 try:
43 threads = int(os.environ.get("GALAXY_SLOTS", "1"))
44 except ValueError:
45 threads = 1
46 assert 1 <= threads, threads
47
48 # Parse Command Line
42 usage = """Use as follows: 49 usage = """Use as follows:
43 50
44 $ python blast_rbh.py [options] A.fasta B.fasta 51 $ python blast_rbh.py [options] A.fasta B.fasta
52
53 Many of the options are required. Example with proteins and blastp:
54
55 $ python blast_rbh.py -a prot -t blasp -o output.tsv protA.fasta protB.fasta
56
57 There is additional guideance in the help text in the blast_rbh.xml file,
58 which is shown to the user via the Galaxy interface to this tool.
45 """ 59 """
46 60
47 parser = OptionParser(usage=usage) 61 parser = OptionParser(usage=usage)
48 parser.add_option("-a", "--alphabet", dest="dbtype", 62 parser.add_option("-a", "--alphabet", dest="dbtype",
49 default=None, 63 default=None,
50 help="Alphabet type (nucl or prot)") 64 help="Alphabet type (nucl or prot; required)")
51 parser.add_option("-t", "--task", dest="task", 65 parser.add_option("-t", "--task", dest="task",
52 default=None, 66 default=None,
53 help="BLAST task (e.g. blastp, blastn, megablast)") 67 help="BLAST task (e.g. blastp, blastn, megablast; required)")
54 parser.add_option("-i","--identity", dest="min_identity", 68 parser.add_option("-i", "--identity", dest="min_identity",
55 default="70", 69 default="70",
56 help="Minimum percentage identity (optional, default 70)") 70 help="Minimum percentage identity (optional, default 70)")
57 parser.add_option("-c", "--coverage", dest="min_coverage", 71 parser.add_option("-c", "--coverage", dest="min_coverage",
58 default="50", 72 default="50",
59 help="Minimum HSP coverage (optional, default 50)") 73 help="Minimum HSP coverage (optional, default 50)")
60 parser.add_option("--nr", dest="nr", default=False, action="store_true", 74 parser.add_option("--nr", dest="nr", default=False, action="store_true",
61 help="Preprocess FASTA files to collapse identifical " 75 help="Preprocess FASTA files to collapse identitical "
62 "entries (make sequences non-redundant)") 76 "entries (make sequences non-redundant)")
63 parser.add_option("-o", "--output", dest="output", 77 parser.add_option("-o", "--output", dest="output",
64 default=None, metavar="FILE", 78 default=None, metavar="FILE",
65 help="Output filename") 79 help="Output filename (required)")
80 parser.add_option("--threads", dest="threads",
81 default=threads,
82 help="Number of threads when running BLAST. Defaults to the "
83 "$GALAXY_SLOTS environment variable if set, or 1.")
66 options, args = parser.parse_args() 84 options, args = parser.parse_args()
67 85
68 if len(args) != 2: 86 if len(args) != 2:
69 stop_err("Expects two input FASTA filenames") 87 sys.exit("Expects two input FASTA filenames")
70 fasta_a, fasta_b = args 88 fasta_a, fasta_b = args
71 if not os.path.isfile(fasta_a): 89 if not os.path.isfile(fasta_a):
72 stop_err("Missing input file for species A: %r" % fasta_a) 90 sys.exit("Missing input file for species A: %r" % fasta_a)
73 if not os.path.isfile(fasta_b): 91 if not os.path.isfile(fasta_b):
74 stop_err("Missing input file for species B: %r" % fasta_b) 92 sys.exit("Missing input file for species B: %r" % fasta_b)
75 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b): 93 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b):
76 self_comparison = True 94 self_comparison = True
77 print("Doing self comparison; ignoring self matches.") 95 print("Doing self comparison; ignoring self matches.")
78 else: 96 else:
79 self_comparison = False 97 self_comparison = False
80 98
81 if not options.output: 99 if not options.output:
82 stop_err("Output filename required, e.g. -o example.tab") 100 sys.exit("Output filename required, e.g. -o example.tab")
83 out_file = options.output 101 out_file = options.output
84 102
85 try: 103 try:
86 min_identity = float(options.min_identity) 104 min_identity = float(options.min_identity)
87 except ValueError: 105 except ValueError:
88 stop_err("Expected number between 0 and 100 for minimum identity, not %r" % min_identity) 106 sys.exit("Expected number between 0 and 100 for minimum identity, not %r" % min_identity)
89 if not (0 <= min_identity <= 100): 107 if not (0 <= min_identity <= 100):
90 stop_err("Expected minimum identity between 0 and 100, not %0.2f" % min_identity) 108 sys.exit("Expected minimum identity between 0 and 100, not %0.2f" % min_identity)
91 try: 109 try:
92 min_coverage = float(options.min_coverage) 110 min_coverage = float(options.min_coverage)
93 except ValueError: 111 except ValueError:
94 stop_err("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage) 112 sys.exit("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage)
95 if not (0 <= min_coverage <= 100): 113 if not (0 <= min_coverage <= 100):
96 stop_err("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage) 114 sys.exit("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage)
97 115
98 if not options.task: 116 if not options.task:
99 stop_err("Missing BLAST task, e.g. -t blastp") 117 sys.exit("Missing BLAST task, e.g. -t blastp")
100 blast_type = options.task 118 blast_type = options.task
101 119
102 if not options.dbtype: 120 if not options.dbtype:
103 stop_err("Missing database type, -a nucl, or -a prot") 121 sys.exit("Missing database type, -a nucl, or -a prot")
104 dbtype = options.dbtype 122 dbtype = options.dbtype
105 if dbtype == "nucl": 123 if dbtype == "nucl":
106 if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]: 124 if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]:
107 blast_cmd = "blastn -task %s" % blast_type 125 blast_cmd = "blastn -task %s" % blast_type
108 elif blast_type == "tblastx": 126 elif blast_type == "tblastx":
109 blast_cmd = "tblastx" 127 blast_cmd = "tblastx"
110 else: 128 else:
111 stop_err("Invalid BLAST type for BLASTN: %r" % blast_type) 129 sys.exit("Invalid BLAST type for BLASTN: %r" % blast_type)
112 elif dbtype == "prot": 130 elif dbtype == "prot":
113 if blast_type not in ["blastp", "blastp-fast", "blastp-short"]: 131 if blast_type not in ["blastp", "blastp-fast", "blastp-short"]:
114 stop_err("Invalid BLAST type for BLASTP: %r" % blast_type) 132 sys.exit("Invalid BLAST type for BLASTP: %r" % blast_type)
115 blast_cmd = "blastp -task %s" % blast_type 133 blast_cmd = "blastp -task %s" % blast_type
116 else: 134 else:
117 stop_err("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type) 135 sys.exit("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type)
118 136
119 try: 137 try:
120 threads = int(os.environ.get("GALAXY_SLOTS", "1")) 138 threads = int(options.threads)
121 except: 139 except ValueError:
122 threads = 1 140 sys.exit("Expected positive integer for number of threads, not %r" % options.threads)
123 assert 1 <= threads, threads 141 if threads < 1:
142 sys.exit("Expected positive integer for number of threads, not %r" % threads)
124 143
125 makeblastdb_exe = "makeblastdb" 144 makeblastdb_exe = "makeblastdb"
126 145
127 base_path = tempfile.mkdtemp() 146 base_path = tempfile.mkdtemp()
128 tmp_a = os.path.join(base_path, "SpeciesA.fasta") 147 tmp_a = os.path.join(base_path, "SpeciesA.fasta")
136 tmp_b = os.path.join(base_path, "SpeciesB.fasta") 155 tmp_b = os.path.join(base_path, "SpeciesB.fasta")
137 db_b = os.path.join(base_path, "SpeciesB") 156 db_b = os.path.join(base_path, "SpeciesB")
138 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") 157 b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
139 log = os.path.join(base_path, "blast.log") 158 log = os.path.join(base_path, "blast.log")
140 159
141 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? 160 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" # Or qcovs?
142 c_query = 0 161 c_query = 0
143 c_match = 1 162 c_match = 1
144 c_score = 2 163 c_score = 2
145 c_identity = 3 164 c_identity = 3
146 c_coverage = 4 165 c_coverage = 4
147 c_qlen = 5 166 c_qlen = 5
148 c_length = 6 167 c_length = 6
149 168
150 tie_warning = 0 169 tie_warning = 0
151 170
171
152 def best_hits(blast_tabular, ignore_self=False): 172 def best_hits(blast_tabular, ignore_self=False):
153 """Iterate over BLAST tabular output, returns best hits as 2-tuples. 173 """Iterate over BLAST tabular output, returns best hits as 2-tuples.
154 174
155 Each return value is (query name, tuple of value for the best hit). 175 Each return value is (query name, tuple of value for the best hit).
156 176
161 """ 181 """
162 global tie_warning 182 global tie_warning
163 current = None 183 current = None
164 best_score = None 184 best_score = None
165 best = None 185 best = None
186 col_count = len(cols.split())
166 with open(blast_tabular) as h: 187 with open(blast_tabular) as h:
167 for line in h: 188 for line in h:
168 if line.startswith("#"): 189 if line.startswith("#"):
169 continue 190 continue
170 parts = line.rstrip("\n").split("\t") 191 parts = line.rstrip("\n").split("\t")
192 if len(parts) != col_count:
193 # Using NCBI BLAST+ 2.2.27 the undefined field is ignored
194 # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :(
195 sys.exit("Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n"
196 "Note the qcovhsp field was only added in version 2.2.28\n"
197 % (col_count, len(parts), line))
171 if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage: 198 if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
172 continue 199 continue
173 a = parts[c_query] 200 a = parts[c_query]
174 b = parts[c_match] 201 b = parts[c_match]
175 if ignore_self and a == b: 202 if ignore_self and a == b:
176 continue 203 continue
177 score = float(parts[c_score]) 204 score = float(parts[c_score])
178 qlen = int(parts[c_qlen]) 205 qlen = int(parts[c_qlen])
179 length = int(parts[c_length]) 206 length = int(parts[c_length])
180 #print("Considering hit for %s to %s with score %s..." % (a, b, score)) 207 # print("Considering hit for %s to %s with score %s..." % (a, b, score))
181 if current is None: 208 if current is None:
182 #First hit 209 # First hit
183 assert best is None 210 assert best is None
184 assert best_score is None 211 assert best_score is None
185 best = dict() 212 best = dict()
186 #Now append this hit... 213 # Now append this hit...
187 elif a != current: 214 elif a != current:
188 #New hit 215 # New hit
189 if len(best) == 1: 216 if len(best) == 1:
190 #Unambiguous (no tied matches) 217 # Unambiguous (no tied matches)
191 yield current, list(best.values())[0] 218 yield current, list(best.values())[0]
192 else: 219 else:
193 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) 220 # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
194 tie_warning += 1 221 tie_warning += 1
195 best = dict() 222 best = dict()
196 #Now append this hit... 223 # Now append this hit...
197 elif score < best_score: 224 elif score < best_score:
198 #print("No improvement for %s, %s < %s" % (a, score, best_score)) 225 # print("No improvement for %s, %s < %s" % (a, score, best_score))
199 continue 226 continue
200 elif score > best_score: 227 elif score > best_score:
201 #This is better, discard old best 228 # This is better, discard old best
202 best = dict() 229 best = dict()
203 #Now append this hit... 230 # Now append this hit...
204 else: 231 else:
205 #print("Tied best hits for %s" % a) 232 # print("Tied best hits for %s" % a)
206 assert best_score == score 233 assert best_score == score
207 #Now append this hit... 234 # Now append this hit...
208 current = a 235 current = a
209 best_score = score 236 best_score = score
210 #This will collapse two equally good hits to the same target (e.g. duplicated domain) 237 # This will collapse two equally good hits to the same target (e.g. duplicated domain)
211 best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) 238 best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
212 #Best hit for final query, if unambiguous: 239 # Best hit for final query, if unambiguous:
213 if current is not None: 240 if current is not None:
214 if len(best)==1: 241 if len(best) == 1:
215 yield current, list(best.values())[0] 242 yield current, list(best.values())[0]
216 else: 243 else:
217 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) 244 # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
218 tie_warning += 1 245 tie_warning += 1
246
219 247
220 def check_duplicate_ids(filename): 248 def check_duplicate_ids(filename):
221 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py 249 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
222 # TODO - just use Biopython's FASTA parser? 250 # TODO - just use Biopython's FASTA parser?
223 if not os.path.isfile(filename): 251 if not os.path.isfile(filename):
224 stop_err("Missing FASTA file %r" % filename, 2) 252 sys.stderr.write("Missing FASTA file %r\n" % filename)
253 sys.exit(2)
225 identifiers = set() 254 identifiers = set()
226 handle = open(filename) 255 handle = open(filename)
227 for line in handle: 256 for line in handle:
228 if line.startswith(">"): 257 if line.startswith(">"):
229 # The split will also take care of the new line character, 258 # The split will also take care of the new line character,
230 # e.g. ">test\n" and ">test description here\n" both give "test" 259 # e.g. ">test\n" and ">test description here\n" both give "test"
231 seq_id = line[1:].split(None, 1)[0] 260 seq_id = line[1:].split(None, 1)[0]
232 if seq_id in identifiers: 261 if seq_id in identifiers:
233 handle.close() 262 handle.close()
234 stop_err("Repeated identifiers, e.g. %r" % seq_id, 3) 263 sys.stderr.write("Repeated identifiers, e.g. %r\n" % seq_id)
264 sys.exit(3)
235 identifiers.add(seq_id) 265 identifiers.add(seq_id)
236 handle.close() 266 handle.close()
237 267
268
238 def make_nr(input_fasta, output_fasta, sep=";"): 269 def make_nr(input_fasta, output_fasta, sep=";"):
239 #TODO - seq-hash based to avoid loading everything into RAM? 270 # TODO - seq-hash based to avoid loading everything into RAM?
240 by_seq = dict() 271 by_seq = dict()
241 try: 272 try:
242 from Bio import SeqIO 273 from Bio import SeqIO
243 except KeyError: 274 except KeyError:
244 stop_err("Missing Biopython") 275 sys.exit("Missing Biopython")
245 for record in SeqIO.parse(input_fasta, "fasta"): 276 for record in SeqIO.parse(input_fasta, "fasta"):
246 s = str(record.seq).upper() 277 s = str(record.seq).upper()
247 try: 278 try:
248 by_seq[s].append(record.id) 279 by_seq[s].append(record.id)
249 except KeyError: 280 except KeyError:
257 duplicates.update(cluster[1:]) 288 duplicates.update(cluster[1:])
258 else: 289 else:
259 unique += 1 290 unique += 1
260 del by_seq 291 del by_seq
261 if duplicates: 292 if duplicates:
262 #TODO - refactor as a generator with single SeqIO.write(...) call 293 # TODO - refactor as a generator with single SeqIO.write(...) call
263 with open(output_fasta, "w") as handle: 294 with open(output_fasta, "w") as handle:
264 for record in SeqIO.parse(input_fasta, "fasta"): 295 for record in SeqIO.parse(input_fasta, "fasta"):
265 if record.id in representatives: 296 if record.id in representatives:
266 cluster = representatives[record.id] 297 cluster = representatives[record.id]
267 record.id = sep.join(cluster) 298 record.id = sep.join(cluster)
268 record.description = "representing %i records" % len(cluster) 299 record.description = "representing %i records" % len(cluster)
269 elif record.id in duplicates: 300 elif record.id in duplicates:
270 continue 301 continue
271 SeqIO.write(record, handle, "fasta") 302 SeqIO.write(record, handle, "fasta")
272 print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives))) 303 print("%i unique entries; removed %i duplicates leaving %i representative records"
304 % (unique, len(duplicates), len(representatives)))
273 else: 305 else:
274 os.symlink(os.path.abspath(input_fasta), output_fasta) 306 os.symlink(os.path.abspath(input_fasta), output_fasta)
275 print("No perfect duplicates in file, %i unique entries" % unique) 307 print("No perfect duplicates in file, %i unique entries" % unique)
276 308
277 #print("Starting...") 309
310 # print("Starting...")
278 check_duplicate_ids(fasta_a) 311 check_duplicate_ids(fasta_a)
279 if not self_comparison: 312 if not self_comparison:
280 check_duplicate_ids(fasta_b) 313 check_duplicate_ids(fasta_b)
281 314
282 if options.nr: 315 if options.nr:
284 if not self_comparison: 317 if not self_comparison:
285 make_nr(fasta_b, tmp_b) 318 make_nr(fasta_b, tmp_b)
286 fasta_a = tmp_a 319 fasta_a = tmp_a
287 fasta_b = tmp_b 320 fasta_b = tmp_b
288 321
289 #TODO - Report log in case of error? 322 # TODO - Report log in case of error?
290 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) 323 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
291 if not self_comparison: 324 if not self_comparison:
292 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) 325 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
293 #print("BLAST databases prepared.") 326 # print("BLAST databases prepared.")
294 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' 327 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
295 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) 328 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
296 #print("BLAST species A vs species B done.") 329 # print("BLAST species A vs species B done.")
297 if not self_comparison: 330 if not self_comparison:
298 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' 331 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
299 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) 332 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
300 #print("BLAST species B vs species A done.") 333 # print("BLAST species B vs species A done.")
301 334
302 335
303 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) 336 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
304 337
305 338
306 count = 0 339 count = 0
307 outfile = open(out_file, 'w') 340 outfile = open(out_file, 'w')
308 outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n") 341 outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n")
309 for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison): 342 for a, (b, a_score_float, a_score_str,
343 a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison):
310 if b not in best_b_vs_a: 344 if b not in best_b_vs_a:
311 #Match b has no best hit 345 # Match b has no best hit
312 continue 346 continue
313 a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b] 347 a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b]
314 if a != a2: 348 if a != a2:
315 #Not an RBH 349 # Not an RBH
316 continue 350 continue
317 #Start with IDs, lengths, coverage 351 # Start with IDs, lengths, coverage
318 values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str] 352 values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str]
319 #Alignment length was an integer so don't care about original string 353 # Alignment length was an integer so don't care about original string
320 values.append(min(a_length, b_length)) 354 values.append(min(a_length, b_length))
321 #Output the original string versions of the scores 355 # Output the original string versions of the scores
322 if float(a_identity_str) < float(b_identity_str): 356 if float(a_identity_str) < float(b_identity_str):
323 values.append(a_identity_str) 357 values.append(a_identity_str)
324 else: 358 else:
325 values.append(b_identity_str) 359 values.append(b_identity_str)
326 if a_score_float < b_score_float: 360 if a_score_float < b_score_float:
328 else: 362 else:
329 values.append(b_score_str) 363 values.append(b_score_str)
330 outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values)) 364 outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values))
331 count += 1 365 count += 1
332 outfile.close() 366 outfile.close()
333 print "Done, %i RBH found" % count 367 print("Done, %i RBH found" % count)
334 if tie_warning: 368 if tie_warning:
335 sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n") 369 sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n")
336 370
337 #Remove temp files... 371 # Remove temp files...
338 shutil.rmtree(base_path) 372 shutil.rmtree(base_path)