annotate tools/blast_rbh/blast_rbh.py @ 4:d8d9a9069586 draft

v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
author peterjc
date Wed, 19 Apr 2017 07:44:47 -0400
parents 14b2e159b310
children 8f4500f6f2aa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
3
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
4 Run "blast_rbh.py -h" to see the help text, or read the associated
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
5 blast_rbh.xml and README.rst files which are available on GitHub at:
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
7
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
8 This requires Python and the NCBI BLAST+ tools to be installed
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
9 and on the $PATH.
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
10
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
11 You can also run this tool via Galaxy using the "blast_rbh.xml"
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
12 definition file. This is available as a package on the Galaxy
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
13 Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
14 """
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
15
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
16 # TODO - Output more columns, e.g. pident, qcovs, descriptions?
2
14b2e159b310 v0.1.7 - Updated citation & misc internal changes
peterjc
parents: 1
diff changeset
17 # TODO - Use new -qcov_hsp_perc option in BLAST+ 2.2.30 to filter
14b2e159b310 v0.1.7 - Updated citation & misc internal changes
peterjc
parents: 1
diff changeset
18 # results, rather than doing minimum HSP coverage in Python.
14b2e159b310 v0.1.7 - Updated citation & misc internal changes
peterjc
parents: 1
diff changeset
19 # [Not doing this right now as would break on older BLAST+]
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
20
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
21 from __future__ import print_function
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
22
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
23 import os
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
24 import shutil
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
25 import sys
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
26 import tempfile
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
27
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
28 from optparse import OptionParser
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
29
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
30
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
31 def run(cmd):
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
32 return_code = os.system(cmd)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
33 if return_code:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
34 sys.exit("Error %i from: %s" % (return_code, cmd))
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
35
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
36
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
37 if "--version" in sys.argv[1:]:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
38 # TODO - Capture version of BLAST+ binaries too?
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
39 print("BLAST RBH v0.1.11")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
40 sys.exit(0)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
41
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
42 try:
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
43 threads = int(os.environ.get("GALAXY_SLOTS", "1"))
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
44 except ValueError:
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
45 threads = 1
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
46 assert 1 <= threads, threads
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
47
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
48 # Parse Command Line
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
49 usage = """Use as follows:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
50
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
51 $ python blast_rbh.py [options] A.fasta B.fasta
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
52
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
53 Many of the options are required. Example with proteins and blastp:
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
54
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
55 $ python blast_rbh.py -a prot -t blasp -o output.tsv protA.fasta protB.fasta
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
56
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
57 There is additional guideance in the help text in the blast_rbh.xml file,
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
58 which is shown to the user via the Galaxy interface to this tool.
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
59 """
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
60
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
61 parser = OptionParser(usage=usage)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
62 parser.add_option("-a", "--alphabet", dest="dbtype",
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
63 default=None,
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
64 help="Alphabet type (nucl or prot; required)")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
65 parser.add_option("-t", "--task", dest="task",
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
66 default=None,
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
67 help="BLAST task (e.g. blastp, blastn, megablast; required)")
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
68 parser.add_option("-i", "--identity", dest="min_identity",
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
69 default="70",
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
70 help="Minimum percentage identity (optional, default 70)")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
71 parser.add_option("-c", "--coverage", dest="min_coverage",
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
72 default="50",
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
73 help="Minimum HSP coverage (optional, default 50)")
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
74 parser.add_option("--nr", dest="nr", default=False, action="store_true",
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
75 help="Preprocess FASTA files to collapse identitical "
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
76 "entries (make sequences non-redundant)")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
77 parser.add_option("-o", "--output", dest="output",
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
78 default=None, metavar="FILE",
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
79 help="Output filename (required)")
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
80 parser.add_option("--threads", dest="threads",
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
81 default=threads,
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
82 help="Number of threads when running BLAST. Defaults to the "
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
83 "$GALAXY_SLOTS environment variable if set, or 1.")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
84 options, args = parser.parse_args()
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
85
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
86 if len(args) != 2:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
87 sys.exit("Expects two input FASTA filenames")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
88 fasta_a, fasta_b = args
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
89 if not os.path.isfile(fasta_a):
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
90 sys.exit("Missing input file for species A: %r" % fasta_a)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
91 if not os.path.isfile(fasta_b):
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
92 sys.exit("Missing input file for species B: %r" % fasta_b)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
93 if os.path.abspath(fasta_a) == os.path.abspath(fasta_b):
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
94 self_comparison = True
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
95 print("Doing self comparison; ignoring self matches.")
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
96 else:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
97 self_comparison = False
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
98
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
99 if not options.output:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
100 sys.exit("Output filename required, e.g. -o example.tab")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
101 out_file = options.output
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
102
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
103 try:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
104 min_identity = float(options.min_identity)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
105 except ValueError:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
106 sys.exit("Expected number between 0 and 100 for minimum identity, not %r" % min_identity)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
107 if not (0 <= min_identity <= 100):
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
108 sys.exit("Expected minimum identity between 0 and 100, not %0.2f" % min_identity)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
109 try:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
110 min_coverage = float(options.min_coverage)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
111 except ValueError:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
112 sys.exit("Expected number between 0 and 100 for minimum coverage, not %r" % min_coverage)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
113 if not (0 <= min_coverage <= 100):
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
114 sys.exit("Expected minimum coverage between 0 and 100, not %0.2f" % min_coverage)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
115
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
116 if not options.task:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
117 sys.exit("Missing BLAST task, e.g. -t blastp")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
118 blast_type = options.task
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
119
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
120 if not options.dbtype:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
121 sys.exit("Missing database type, -a nucl, or -a prot")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
122 dbtype = options.dbtype
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
123 if dbtype == "nucl":
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
124 if blast_type in ["megablast", "blastn", "blastn-short", "dc-megablast"]:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
125 blast_cmd = "blastn -task %s" % blast_type
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
126 elif blast_type == "tblastx":
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
127 blast_cmd = "tblastx"
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
128 else:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
129 sys.exit("Invalid BLAST type for BLASTN: %r" % blast_type)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
130 elif dbtype == "prot":
2
14b2e159b310 v0.1.7 - Updated citation & misc internal changes
peterjc
parents: 1
diff changeset
131 if blast_type not in ["blastp", "blastp-fast", "blastp-short"]:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
132 sys.exit("Invalid BLAST type for BLASTP: %r" % blast_type)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
133 blast_cmd = "blastp -task %s" % blast_type
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
134 else:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
135 sys.exit("Expected 'nucl' or 'prot' for BLAST database type, not %r" % blast_type)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
136
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
137 try:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
138 threads = int(options.threads)
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
139 except ValueError:
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
140 sys.exit("Expected positive integer for number of threads, not %r" % options.threads)
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
141 if threads < 1:
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
142 sys.exit("Expected positive integer for number of threads, not %r" % threads)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
143
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
144 makeblastdb_exe = "makeblastdb"
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
145
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
146 base_path = tempfile.mkdtemp()
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
147 tmp_a = os.path.join(base_path, "SpeciesA.fasta")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
148 db_a = os.path.join(base_path, "SpeciesA")
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
149 a_vs_b = os.path.join(base_path, "A_vs_B.tabular")
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
150 if self_comparison:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
151 tmp_b = tmp_a
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
152 db_b = db_a
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
153 b_vs_a = a_vs_b
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
154 else:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
155 tmp_b = os.path.join(base_path, "SpeciesB.fasta")
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
156 db_b = os.path.join(base_path, "SpeciesB")
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
157 b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
158 log = os.path.join(base_path, "blast.log")
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
159
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
160 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" # Or qcovs?
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
161 c_query = 0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
162 c_match = 1
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
163 c_score = 2
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
164 c_identity = 3
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
165 c_coverage = 4
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
166 c_qlen = 5
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
167 c_length = 6
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
168
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
169 tie_warning = 0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
170
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
171
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
172 def best_hits(blast_tabular, ignore_self=False):
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
173 """Iterate over BLAST tabular output, returns best hits as 2-tuples.
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
174
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
175 Each return value is (query name, tuple of value for the best hit).
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
176
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
177 Tied best hits to different sequences are NOT returned.
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
178
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
179 One hit is returned for tied best hits to the same sequence
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
180 (e.g. repeated domains).
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
181 """
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
182 global tie_warning
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
183 current = None
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
184 best_score = None
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
185 best = None
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
186 col_count = len(cols.split())
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
187 with open(blast_tabular) as h:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
188 for line in h:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
189 if line.startswith("#"):
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
190 continue
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
191 parts = line.rstrip("\n").split("\t")
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
192 if len(parts) != col_count:
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
193 # Using NCBI BLAST+ 2.2.27 the undefined field is ignored
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
194 # Even NCBI BLAST+ 2.5.0 silently ignores unknown fields :(
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
195 sys.exit("Old version of NCBI BLAST? Expected %i columns, got %i:\n%s\n"
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
196 "Note the qcovhsp field was only added in version 2.2.28\n"
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
197 % (col_count, len(parts), line))
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
198 if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
199 continue
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
200 a = parts[c_query]
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
201 b = parts[c_match]
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
202 if ignore_self and a == b:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
203 continue
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
204 score = float(parts[c_score])
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
205 qlen = int(parts[c_qlen])
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
206 length = int(parts[c_length])
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
207 # print("Considering hit for %s to %s with score %s..." % (a, b, score))
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
208 if current is None:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
209 # First hit
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
210 assert best is None
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
211 assert best_score is None
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
212 best = dict()
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
213 # Now append this hit...
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
214 elif a != current:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
215 # New hit
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
216 if len(best) == 1:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
217 # Unambiguous (no tied matches)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
218 yield current, list(best.values())[0]
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
219 else:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
220 # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
221 tie_warning += 1
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
222 best = dict()
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
223 # Now append this hit...
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
224 elif score < best_score:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
225 # print("No improvement for %s, %s < %s" % (a, score, best_score))
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
226 continue
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
227 elif score > best_score:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
228 # This is better, discard old best
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
229 best = dict()
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
230 # Now append this hit...
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
231 else:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
232 # print("Tied best hits for %s" % a)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
233 assert best_score == score
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
234 # Now append this hit...
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
235 current = a
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
236 best_score = score
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
237 # This will collapse two equally good hits to the same target (e.g. duplicated domain)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
238 best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
239 # Best hit for final query, if unambiguous:
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
240 if current is not None:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
241 if len(best) == 1:
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
242 yield current, list(best.values())[0]
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
243 else:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
244 # print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
245 tie_warning += 1
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
246
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
247
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
248 def check_duplicate_ids(filename):
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
249 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
250 # TODO - just use Biopython's FASTA parser?
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
251 if not os.path.isfile(filename):
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
252 sys.stderr.write("Missing FASTA file %r\n" % filename)
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
253 sys.exit(2)
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
254 identifiers = set()
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
255 handle = open(filename)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
256 for line in handle:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
257 if line.startswith(">"):
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
258 # The split will also take care of the new line character,
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
259 # e.g. ">test\n" and ">test description here\n" both give "test"
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
260 seq_id = line[1:].split(None, 1)[0]
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
261 if seq_id in identifiers:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
262 handle.close()
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
263 sys.stderr.write("Repeated identifiers, e.g. %r\n" % seq_id)
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
264 sys.exit(3)
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
265 identifiers.add(seq_id)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
266 handle.close()
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
267
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
268
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
269 def make_nr(input_fasta, output_fasta, sep=";"):
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
270 # TODO - seq-hash based to avoid loading everything into RAM?
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
271 by_seq = dict()
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
272 try:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
273 from Bio import SeqIO
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
274 except KeyError:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
275 sys.exit("Missing Biopython")
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
276 for record in SeqIO.parse(input_fasta, "fasta"):
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
277 s = str(record.seq).upper()
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
278 try:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
279 by_seq[s].append(record.id)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
280 except KeyError:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
281 by_seq[s] = [record.id]
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
282 unique = 0
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
283 representatives = dict()
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
284 duplicates = set()
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
285 for cluster in by_seq.values():
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
286 if len(cluster) > 1:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
287 representatives[cluster[0]] = cluster
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
288 duplicates.update(cluster[1:])
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
289 else:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
290 unique += 1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
291 del by_seq
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
292 if duplicates:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
293 # TODO - refactor as a generator with single SeqIO.write(...) call
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
294 with open(output_fasta, "w") as handle:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
295 for record in SeqIO.parse(input_fasta, "fasta"):
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
296 if record.id in representatives:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
297 cluster = representatives[record.id]
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
298 record.id = sep.join(cluster)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
299 record.description = "representing %i records" % len(cluster)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
300 elif record.id in duplicates:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
301 continue
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
302 SeqIO.write(record, handle, "fasta")
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
303 print("%i unique entries; removed %i duplicates leaving %i representative records"
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
304 % (unique, len(duplicates), len(representatives)))
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
305 else:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
306 os.symlink(os.path.abspath(input_fasta), output_fasta)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
307 print("No perfect duplicates in file, %i unique entries" % unique)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
308
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
309
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
310 # print("Starting...")
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
311 check_duplicate_ids(fasta_a)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
312 if not self_comparison:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
313 check_duplicate_ids(fasta_b)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
314
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
315 if options.nr:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
316 make_nr(fasta_a, tmp_a)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
317 if not self_comparison:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
318 make_nr(fasta_b, tmp_b)
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
319 fasta_a = tmp_a
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
320 fasta_b = tmp_b
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
321
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
322 # TODO - Report log in case of error?
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
323 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
324 if not self_comparison:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
325 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
326 # print("BLAST databases prepared.")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
327 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
328 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
329 # print("BLAST species A vs species B done.")
1
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
330 if not self_comparison:
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
331 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
ff0b814c1320 Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
peterjc
parents: 0
diff changeset
332 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
333 # print("BLAST species B vs species A done.")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
334
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
335
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
336 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
337
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
338
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
339 count = 0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
340 outfile = open(out_file, 'w')
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
341 outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n")
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
342 for a, (b, a_score_float, a_score_str,
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
343 a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b, self_comparison):
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
344 if b not in best_b_vs_a:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
345 # Match b has no best hit
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
346 continue
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
347 a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b]
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
348 if a != a2:
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
349 # Not an RBH
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
350 continue
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
351 # Start with IDs, lengths, coverage
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
352 values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str]
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
353 # Alignment length was an integer so don't care about original string
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
354 values.append(min(a_length, b_length))
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
355 # Output the original string versions of the scores
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
356 if float(a_identity_str) < float(b_identity_str):
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
357 values.append(a_identity_str)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
358 else:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
359 values.append(b_identity_str)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
360 if a_score_float < b_score_float:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
361 values.append(a_score_str)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
362 else:
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
363 values.append(b_score_str)
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
364 outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values))
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
365 count += 1
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
366 outfile.close()
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
367 print("Done, %i RBH found" % count)
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
368 if tie_warning:
2
14b2e159b310 v0.1.7 - Updated citation & misc internal changes
peterjc
parents: 1
diff changeset
369 sys.stderr.write("Warning: Sequences with tied best hits found, you may have duplicates/clusters\n")
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
370
4
d8d9a9069586 v0.1.11 using BLAST+ 2.5.0 and Biopython 1.67
peterjc
parents: 2
diff changeset
371 # Remove temp files...
0
b828ca44a313 Uploaded v0.1.2 (previously only on the Test Tool Shed)
peterjc
parents:
diff changeset
372 shutil.rmtree(base_path)