Mercurial > repos > peterjc > blast_rbh
annotate tools/blast_rbh/blast_rbh.py @ 4:d8d9a9069586 draft
v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
author | peterjc |
---|---|
date | Wed, 19 Apr 2017 07:44:47 -0400 |
parents | 14b2e159b310 |
children | 8f4500f6f2aa |
rev | line source |
---|---|
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
3 |
1 | 4 Run "blast_rbh.py -h" to see the help text, or read the associated |
4 | 5 blast_rbh.xml and README.rst files which are available on GitHub at: |
1 | 6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh |
7 | |
8 This requires Python and the NCBI BLAST+ tools to be installed | |
9 and on the $PATH. | |
10 | |
11 You can also run this tool via Galaxy using the "blast_rbh.xml" | |
12 definition file. This is available as a package on the Galaxy | |
13 Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
14 """ |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
15 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
16 # TODO - Output more columns, e.g. pident, qcovs, descriptions? |
2 | 17 # TODO - Use new -qcov_hsp_perc option in BLAST+ 2.2.30 to filter |
18 # results, rather than doing minimum HSP coverage in Python. | |
19 # [Not doing this right now as would break on older BLAST+] | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
20 |
4 | 21 from __future__ import print_function |
22 | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
23 import os |
4 | 24 import shutil |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
25 import sys |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
26 import tempfile |
4 | 27 |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
28 from optparse import OptionParser |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
29 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
30 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
31 def run(cmd): |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
32 return_code = os.system(cmd) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
33 if return_code: |
4 | 34 sys.exit("Error %i from: %s" % (return_code, cmd)) |
35 | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
36 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
37 if "--version" in sys.argv[1:]: |
4 | 38 # TODO - Capture version of BLAST+ binaries too? |
39 print("BLAST RBH v0.1.11") | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
40 sys.exit(0) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
41 |
4 | 42 try: |
43 threads = int(os.environ.get("GALAXY_SLOTS", "1")) | |
44 except ValueError: | |
45 threads = 1 | |
46 assert 1 <= threads, threads | |
47 | |
48 # Parse Command Line | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
49 usage = """Use as follows: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
50 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
51 $ python blast_rbh.py [options] A.fasta B.fasta |
4 | 52 |
53 Many of the options are required. Example with proteins and blastp: | |
54 | |
55 $ python blast_rbh.py -a prot -t blasp -o output.tsv protA.fasta protB.fasta | |
56 | |
57 There is additional guideance in the help text in the blast_rbh.xml file, | |
58 which is shown to the user via the Galaxy interface to this tool. | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
59 """ |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
60 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
61 parser = OptionParser(usage=usage) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
62 parser.add_option("-a", "--alphabet", dest="dbtype", |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
63 default=None, |
4 | 64 help="Alphabet type (nucl or prot; required)") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
65 parser.add_option("-t", "--task", dest="task", |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
66 default=None, |
4 | 67 help="BLAST task (e.g. blastp, blastn, megablast; required)") |
68 parser.add_option("-i", "--identity", dest="min_identity", | |
1 | 69 default="70", |
70 help="Minimum percentage identity (optional, default 70)") | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
71 parser.add_option("-c", "--coverage", dest="min_coverage", |
1 | 72 default="50", |
73 help="Minimum HSP coverage (optional, default 50)") | |
74 parser.add_option("--nr", dest="nr", default=False, action="store_true", | |
4 | 75 help="Preprocess FASTA files to collapse identitical " |
1 | 76 "entries (make sequences non-redundant)") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
77 parser.add_option("-o", "--output", dest="output", |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
78 default=None, metavar="FILE", |
4 | 79 help="Output filename (required)") |
80 parser.add_option("--threads", dest="threads", | |
81 default=threads, | |
82 help="Number of threads when running BLAST. Defaults to the " | |
83 "$GALAXY_SLOTS environment variable if set, or 1.") | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
84 options, args = parser.parse_args() |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
85 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
86 if len(args) != 2: |
4 | 87 sys.exit("Expects two input FASTA filenames") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
88 fasta_a, fasta_b = args |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
89 if not os.path.isfile(fasta_a): |
4 | 90 sys.exit("Missing input file for species A: %r" % fasta_a) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
91 if not os.path.isfile(fasta_b): |
4 | 92 sys.exit("Missing input file for species B: %r" % fasta_b) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
93 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b): |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
94 self_comparison = True |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
95 print("Doing self comparison; ignoring self matches.") |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
96 else: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
97 self_comparison = False |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
98 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
99 if not options.output: |
4 | 100 sys.exit("Output filename required, e.g. -o example.tab") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
101 out_file = options.output |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
102 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
103 try: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
104 min_identity = float(options.min_identity) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
105 except ValueError: |
4 | 106 sys.exit("Expected number between 0 and 100 for minimum identity, not %r" % min_identity) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
107 if not (0 <= min_identity <= 100): |
4 | 108 sys.exit("Expected minimum identity between 0 and 100, not %0.2f" % min_identity) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
109 try: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
110 min_coverage = float(options.min_coverage) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
111 except ValueError: |
4 | 112 sys.exit("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
113 if not (0 <= min_coverage <= 100): |
4 | 114 sys.exit("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
115 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
116 if not options.task: |
4 | 117 sys.exit("Missing BLAST task, e.g. -t blastp") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
118 blast_type = options.task |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
119 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
120 if not options.dbtype: |
4 | 121 sys.exit("Missing database type, -a nucl, or -a prot") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
122 dbtype = options.dbtype |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
123 if dbtype == "nucl": |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
124 if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]: |
4 | 125 blast_cmd = "blastn -task %s" % blast_type |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
126 elif blast_type == "tblastx": |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
127 blast_cmd = "tblastx" |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
128 else: |
4 | 129 sys.exit("Invalid BLAST type for BLASTN: %r" % blast_type) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
130 elif dbtype == "prot": |
2 | 131 if blast_type not in ["blastp", "blastp-fast", "blastp-short"]: |
4 | 132 sys.exit("Invalid BLAST type for BLASTP: %r" % blast_type) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
133 blast_cmd = "blastp -task %s" % blast_type |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
134 else: |
4 | 135 sys.exit("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
136 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
137 try: |
4 | 138 threads = int(options.threads) |
139 except ValueError: | |
140 sys.exit("Expected positive integer for number of threads, not %r" % options.threads) | |
141 if threads < 1: | |
142 sys.exit("Expected positive integer for number of threads, not %r" % threads) | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
143 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
144 makeblastdb_exe = "makeblastdb" |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
145 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
146 base_path = tempfile.mkdtemp() |
1 | 147 tmp_a = os.path.join(base_path, "SpeciesA.fasta") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
148 db_a = os.path.join(base_path, "SpeciesA") |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
149 a_vs_b = os.path.join(base_path, "A_vs_B.tabular") |
1 | 150 if self_comparison: |
151 tmp_b = tmp_a | |
152 db_b = db_a | |
153 b_vs_a = a_vs_b | |
154 else: | |
155 tmp_b = os.path.join(base_path, "SpeciesB.fasta") | |
156 db_b = os.path.join(base_path, "SpeciesB") | |
157 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
158 log = os.path.join(base_path, "blast.log") |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
159 |
4 | 160 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" # Or qcovs? |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
161 c_query = 0 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
162 c_match = 1 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
163 c_score = 2 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
164 c_identity = 3 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
165 c_coverage = 4 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
166 c_qlen = 5 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
167 c_length = 6 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
168 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
169 tie_warning = 0 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
170 |
4 | 171 |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
172 def best_hits(blast_tabular, ignore_self=False): |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
173 """Iterate over BLAST tabular output, returns best hits as 2-tuples. |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
174 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
175 Each return value is (query name, tuple of value for the best hit). |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
176 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
177 Tied best hits to different sequences are NOT returned. |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
178 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
179 One hit is returned for tied best hits to the same sequence |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
180 (e.g. repeated domains). |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
181 """ |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
182 global tie_warning |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
183 current = None |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
184 best_score = None |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
185 best = None |
4 | 186 col_count = len(cols.split()) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
187 with open(blast_tabular) as h: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
188 for line in h: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
189 if line.startswith("#"): |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
190 continue |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
191 parts = line.rstrip("\n").split("\t") |
4 | 192 if len(parts) != col_count: |
193 # Using NCBI BLAST+ 2.2.27 the undefined field is ignored | |
194 # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :( | |
195 sys.exit("Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n" | |
196 "Note the qcovhsp field was only added in version 2.2.28\n" | |
197 % (col_count, len(parts), line)) | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
198 if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
199 continue |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
200 a = parts[c_query] |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
201 b = parts[c_match] |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
202 if ignore_self and a == b: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
203 continue |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
204 score = float(parts[c_score]) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
205 qlen = int(parts[c_qlen]) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
206 length = int(parts[c_length]) |
4 | 207 # print("Considering hit for %s to %s with score %s..." % (a, b, score)) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
208 if current is None: |
4 | 209 # First hit |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
210 assert best is None |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
211 assert best_score is None |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
212 best = dict() |
4 | 213 # Now append this hit... |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
214 elif a != current: |
4 | 215 # New hit |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
216 if len(best) == 1: |
4 | 217 # Unambiguous (no tied matches) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
218 yield current, list(best.values())[0] |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
219 else: |
4 | 220 # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
221 tie_warning += 1 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
222 best = dict() |
4 | 223 # Now append this hit... |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
224 elif score < best_score: |
4 | 225 # print("No improvement for %s, %s < %s" % (a, score, best_score)) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
226 continue |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
227 elif score > best_score: |
4 | 228 # This is better, discard old best |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
229 best = dict() |
4 | 230 # Now append this hit... |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
231 else: |
4 | 232 # print("Tied best hits for %s" % a) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
233 assert best_score == score |
4 | 234 # Now append this hit... |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
235 current = a |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
236 best_score = score |
4 | 237 # This will collapse two equally good hits to the same target (e.g. duplicated domain) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
238 best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) |
4 | 239 # Best hit for final query, if unambiguous: |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
240 if current is not None: |
4 | 241 if len(best) == 1: |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
242 yield current, list(best.values())[0] |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
243 else: |
4 | 244 # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
245 tie_warning += 1 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
246 |
4 | 247 |
1 | 248 def check_duplicate_ids(filename): |
249 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py | |
250 # TODO - just use Biopython's FASTA parser? | |
251 if not os.path.isfile(filename): | |
4 | 252 sys.stderr.write("Missing FASTA file %r\n" % filename) |
253 sys.exit(2) | |
1 | 254 identifiers = set() |
255 handle = open(filename) | |
256 for line in handle: | |
257 if line.startswith(">"): | |
4 | 258 # The split will also take care of the new line character, |
259 # e.g. ">test\n" and ">test description here\n" both give "test" | |
1 | 260 seq_id = line[1:].split(None, 1)[0] |
261 if seq_id in identifiers: | |
262 handle.close() | |
4 | 263 sys.stderr.write("Repeated identifiers, e.g. %r\n" % seq_id) |
264 sys.exit(3) | |
1 | 265 identifiers.add(seq_id) |
266 handle.close() | |
267 | |
4 | 268 |
1 | 269 def make_nr(input_fasta, output_fasta, sep=";"): |
4 | 270 # TODO - seq-hash based to avoid loading everything into RAM? |
1 | 271 by_seq = dict() |
272 try: | |
273 from Bio import SeqIO | |
274 except KeyError: | |
4 | 275 sys.exit("Missing Biopython") |
1 | 276 for record in SeqIO.parse(input_fasta, "fasta"): |
277 s = str(record.seq).upper() | |
278 try: | |
279 by_seq[s].append(record.id) | |
280 except KeyError: | |
281 by_seq[s] = [record.id] | |
282 unique = 0 | |
283 representatives = dict() | |
284 duplicates = set() | |
285 for cluster in by_seq.values(): | |
286 if len(cluster) > 1: | |
287 representatives[cluster[0]] = cluster | |
288 duplicates.update(cluster[1:]) | |
289 else: | |
290 unique += 1 | |
291 del by_seq | |
292 if duplicates: | |
4 | 293 # TODO - refactor as a generator with single SeqIO.write(...) call |
1 | 294 with open(output_fasta, "w") as handle: |
295 for record in SeqIO.parse(input_fasta, "fasta"): | |
296 if record.id in representatives: | |
297 cluster = representatives[record.id] | |
298 record.id = sep.join(cluster) | |
299 record.description = "representing %i records" % len(cluster) | |
300 elif record.id in duplicates: | |
301 continue | |
302 SeqIO.write(record, handle, "fasta") | |
4 | 303 print("%i unique entries; removed %i duplicates leaving %i representative records" |
304 % (unique, len(duplicates), len(representatives))) | |
1 | 305 else: |
306 os.symlink(os.path.abspath(input_fasta), output_fasta) | |
307 print("No perfect duplicates in file, %i unique entries" % unique) | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
308 |
4 | 309 |
310 # print("Starting...") | |
1 | 311 check_duplicate_ids(fasta_a) |
312 if not self_comparison: | |
313 check_duplicate_ids(fasta_b) | |
314 | |
315 if options.nr: | |
316 make_nr(fasta_a, tmp_a) | |
317 if not self_comparison: | |
318 make_nr(fasta_b, tmp_b) | |
319 fasta_a = tmp_a | |
320 fasta_b = tmp_b | |
321 | |
4 | 322 # TODO - Report log in case of error? |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
323 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) |
1 | 324 if not self_comparison: |
325 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) | |
4 | 326 # print("BLAST databases prepared.") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
327 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
328 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) |
4 | 329 # print("BLAST species A vs species B done.") |
1 | 330 if not self_comparison: |
331 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' | |
332 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) | |
4 | 333 # print("BLAST species B vs species A done.") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
334 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
335 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
336 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
337 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
338 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
339 count = 0 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
340 outfile = open(out_file, 'w') |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
341 outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n") |
4 | 342 for a, (b, a_score_float, a_score_str, |
343 a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison): | |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
344 if b not in best_b_vs_a: |
4 | 345 # Match b has no best hit |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
346 continue |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
347 a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b] |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
348 if a != a2: |
4 | 349 # Not an RBH |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
350 continue |
4 | 351 # Start with IDs, lengths, coverage |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
352 values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str] |
4 | 353 # Alignment length was an integer so don't care about original string |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
354 values.append(min(a_length, b_length)) |
4 | 355 # Output the original string versions of the scores |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
356 if float(a_identity_str) < float(b_identity_str): |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
357 values.append(a_identity_str) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
358 else: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
359 values.append(b_identity_str) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
360 if a_score_float < b_score_float: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
361 values.append(a_score_str) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
362 else: |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
363 values.append(b_score_str) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
364 outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values)) |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
365 count += 1 |
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
366 outfile.close() |
4 | 367 print("Done, %i RBH found" % count) |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
368 if tie_warning: |
2 | 369 sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n") |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
370 |
4 | 371 # Remove temp files... |
0
b828ca44a313
Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff
changeset
|
372 shutil.rmtree(base_path) |