comparison ecoli_serotyping/blastFunctions.py @ 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:a202cc394af8 6:fe3ceb5c4214
1 #!/usr/bin/env python
2
3 """
4 Functions for setting up, running, and parsing blast
5 """
6 import logging
7 import os
8
9 from ectyper import subprocess_util
10
11 LOG = logging.getLogger(__name__)
12
13
14 def create_blast_db(filelist, temp_dir):
15 """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given
16 Creating a blast DB using the makeblastdb command.
17 The database is created in the temporary folder of the system.
18
19 Args:
20 filelist: list of genomes that was given by the user on the command line.
21 temp_dir: temporary directory to store the blastdb in.
22
23 Returns:
24 Full path to the DB
25 """
26 blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb')
27
28 LOG.debug("Generating the blast db at {0}".format(blast_db_path))
29 cmd = [
30 "makeblastdb",
31 "-in", ' '.join(filelist),
32 "-dbtype", "nucl",
33 "-title", "ectyper_blastdb",
34 "-out", blast_db_path]
35 subprocess_util.run_subprocess(cmd)
36
37 return blast_db_path
38
39
40 def run_blast(query_file, blast_db, args):
41 """
42 Execute a blastn run given the query files and blastdb
43
44 Args:
45 query_file (str): one or both of the VF / Serotype input files
46 blast_db (str): validated fasta files from the user, in DB form
47 args (Namespace object): parsed commadnline options from the user
48 chunck_size: number of genomes in the database
49
50 Returns:
51 The blast output file
52 """
53 percent_identity = args.percentIdentity
54 percent_length = args.percentLength
55
56 LOG.debug('Running blast query {0} against database {1} '.format(
57 query_file, blast_db))
58
59 blast_output_file = blast_db + '.output'
60
61 cmd = [
62 "blastn",
63 "-query", query_file,
64 "-db", blast_db,
65 "-out", blast_output_file,
66 '-perc_identity', str(percent_identity),
67 '-qcov_hsp_perc', str(percent_length),
68 '-max_hsps', '1', # each allele only need to hit once
69 # use default max_target_seqs=500
70 "-outfmt",
71 '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp',
72 "-word_size", "11"
73 ]
74 subprocess_util.run_subprocess(cmd)
75 with open(blast_output_file, mode='rb') as fh:
76 for line in fh:
77 LOG.debug(line.decode('ascii'))
78 return blast_output_file
79
80 def run_blast_for_identification(query_file, blast_db):
81 """
82 Execute a blastn run given the query files and blastdb
83 with special configuration for high performance identification
84
85 Args:
86 query_file: one or both of the VF / Serotype input files
87 blast_db: validated fasta files from the user, in DB form
88
89 Returns:
90 blast_output_file (str): path to the blast output file
91 """
92
93 LOG.debug('Running blast query {0} against database {1} '.format(
94 query_file, blast_db))
95
96 blast_output_file = blast_db + '.output'
97
98 cmd = [
99 "blastn",
100 "-query", query_file,
101 "-db", blast_db,
102 "-out", blast_output_file,
103 '-perc_identity', '90',
104 '-qcov_hsp_perc', '90',
105 '-max_target_seqs', '1', # we only want to know hit/no hit
106 # 10 query seq, we want at most 1 hit each
107 "-outfmt",
108 '6 qseqid qlen sseqid length pident sstart send sframe',
109 "-word_size", "11"
110 ]
111 subprocess_util.run_subprocess(cmd)
112
113 return blast_output_file