Mercurial > repos > jpetteng > ectyper
comparison ecoli_serotyping/blastFunctions.py @ 6:fe3ceb5c4214 draft
Uploaded
author | jpetteng |
---|---|
date | Fri, 05 Jan 2018 15:43:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:a202cc394af8 | 6:fe3ceb5c4214 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Functions for setting up, running, and parsing blast | |
5 """ | |
6 import logging | |
7 import os | |
8 | |
9 from ectyper import subprocess_util | |
10 | |
11 LOG = logging.getLogger(__name__) | |
12 | |
13 | |
14 def create_blast_db(filelist, temp_dir): | |
15 """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given | |
16 Creating a blast DB using the makeblastdb command. | |
17 The database is created in the temporary folder of the system. | |
18 | |
19 Args: | |
20 filelist: list of genomes that was given by the user on the command line. | |
21 temp_dir: temporary directory to store the blastdb in. | |
22 | |
23 Returns: | |
24 Full path to the DB | |
25 """ | |
26 blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb') | |
27 | |
28 LOG.debug("Generating the blast db at {0}".format(blast_db_path)) | |
29 cmd = [ | |
30 "makeblastdb", | |
31 "-in", ' '.join(filelist), | |
32 "-dbtype", "nucl", | |
33 "-title", "ectyper_blastdb", | |
34 "-out", blast_db_path] | |
35 subprocess_util.run_subprocess(cmd) | |
36 | |
37 return blast_db_path | |
38 | |
39 | |
40 def run_blast(query_file, blast_db, args): | |
41 """ | |
42 Execute a blastn run given the query files and blastdb | |
43 | |
44 Args: | |
45 query_file (str): one or both of the VF / Serotype input files | |
46 blast_db (str): validated fasta files from the user, in DB form | |
47 args (Namespace object): parsed commadnline options from the user | |
48 chunck_size: number of genomes in the database | |
49 | |
50 Returns: | |
51 The blast output file | |
52 """ | |
53 percent_identity = args.percentIdentity | |
54 percent_length = args.percentLength | |
55 | |
56 LOG.debug('Running blast query {0} against database {1} '.format( | |
57 query_file, blast_db)) | |
58 | |
59 blast_output_file = blast_db + '.output' | |
60 | |
61 cmd = [ | |
62 "blastn", | |
63 "-query", query_file, | |
64 "-db", blast_db, | |
65 "-out", blast_output_file, | |
66 '-perc_identity', str(percent_identity), | |
67 '-qcov_hsp_perc', str(percent_length), | |
68 '-max_hsps', '1', # each allele only need to hit once | |
69 # use default max_target_seqs=500 | |
70 "-outfmt", | |
71 '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp', | |
72 "-word_size", "11" | |
73 ] | |
74 subprocess_util.run_subprocess(cmd) | |
75 with open(blast_output_file, mode='rb') as fh: | |
76 for line in fh: | |
77 LOG.debug(line.decode('ascii')) | |
78 return blast_output_file | |
79 | |
80 def run_blast_for_identification(query_file, blast_db): | |
81 """ | |
82 Execute a blastn run given the query files and blastdb | |
83 with special configuration for high performance identification | |
84 | |
85 Args: | |
86 query_file: one or both of the VF / Serotype input files | |
87 blast_db: validated fasta files from the user, in DB form | |
88 | |
89 Returns: | |
90 blast_output_file (str): path to the blast output file | |
91 """ | |
92 | |
93 LOG.debug('Running blast query {0} against database {1} '.format( | |
94 query_file, blast_db)) | |
95 | |
96 blast_output_file = blast_db + '.output' | |
97 | |
98 cmd = [ | |
99 "blastn", | |
100 "-query", query_file, | |
101 "-db", blast_db, | |
102 "-out", blast_output_file, | |
103 '-perc_identity', '90', | |
104 '-qcov_hsp_perc', '90', | |
105 '-max_target_seqs', '1', # we only want to know hit/no hit | |
106 # 10 query seq, we want at most 1 hit each | |
107 "-outfmt", | |
108 '6 qseqid qlen sseqid length pident sstart send sframe', | |
109 "-word_size", "11" | |
110 ] | |
111 subprocess_util.run_subprocess(cmd) | |
112 | |
113 return blast_output_file |