comparison scripts/patho_typing.py @ 0:c6bab5103a14 draft

"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author iss
date Mon, 21 Mar 2022 15:23:09 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c6bab5103a14
1 #!/usr/bin/env python3
2
3 # -*- coding: utf-8 -*-
4
5 """
6 patho_typing.py - In silico pathogenic typing directly from raw
7 Illumina reads
8 <https://github.com/B-UMMI/patho_typing/>
9
10 Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>
11
12 Last modified: October 15, 2018
13
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
18
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
26
27 2020-01-13 ISS
28 In order to use the module within the EURL_WGS_Typer pipeline with a
29 different virulence database for E coli, mapping against the
30 typing_rules.tab was deactivated.
31 """
32
33 import argparse
34 import os
35 import time
36 import sys
37 from pkg_resources import resource_filename
38
39 try:
40 from __init__ import __version__
41
42 import modules.utils as utils
43 import modules.run_rematch as run_rematch
44 import modules.typing as typing
45 except ImportError:
46 from pathotyping.__init__ import __version__
47
48 from pathotyping.modules import utils as utils
49 from pathotyping.modules import run_rematch as run_rematch
50 from pathotyping.modules import typing as typing
51
52
53 def set_reference(species, outdir, script_path, trueCoverage):
54 reference_file = None
55 trueCoverage_file = None
56 trueCoverage_sequences = None
57 trueCoverage_headers = None
58 trueCoverage_config = None
59 typing_file = None
60 typing_sequences = None
61 typing_headers = None
62 typing_rules = None
63 typing_config = None
64
65 species_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf',
66 '_'.join([s.lower() for s in species]), '')
67
68 if os.path.isdir(species_folder):
69 typing_rules = os.path.join(species_folder, 'typing_rules.tab')
70 typing_file = os.path.join(species_folder, 'typing.fasta')
71 typing_sequences, ignore = utils.get_sequence_information(typing_file, 0)
72 typing_sequences, typing_headers = utils.clean_headers_sequences(typing_sequences)
73 typing_sequences = utils.simplify_sequence_dict(typing_sequences)
74 typing_config = os.path.join(species_folder, 'typing.config')
75 if trueCoverage:
76 if os.path.isfile(os.path.join(species_folder, 'trueCoverage.fasta')):
77 trueCoverage_file = os.path.join(species_folder, 'trueCoverage.fasta')
78 trueCoverage_sequences, ignore = utils.get_sequence_information(trueCoverage_file, 0)
79 trueCoverage_sequences, trueCoverage_headers = utils.clean_headers_sequences(trueCoverage_sequences)
80 trueCoverage_sequences = utils.simplify_sequence_dict(trueCoverage_sequences)
81 trueCoverage_config = os.path.join(species_folder, 'trueCoverage.config')
82
83 trueCoverage_typing_sequences = trueCoverage_sequences.copy()
84 for header in typing_sequences:
85 if header not in trueCoverage_sequences:
86 trueCoverage_typing_sequences[header] = typing_sequences[header]
87 else:
88 print('Sequence {header} of typing.fasta already present in'
89 ' trueCoverage.fasta'.format(header=header))
90
91 reference_file = os.path.join(outdir, 'trueCoverage_typing.fasta')
92 write_sequeces(reference_file, trueCoverage_typing_sequences)
93 else:
94 reference_file = os.path.join(outdir, 'typing.fasta')
95 write_sequeces(reference_file, typing_sequences)
96 else:
97 species_present = []
98 seq_conf_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '')
99 species_folder = [d for d in os.listdir(seq_conf_folder) if
100 not d.startswith('.') and os.path.isdir(os.path.join(seq_conf_folder, d, ''))]
101 for species in species_folder:
102 species = species.split('_')
103 species[0] = species[0][0].upper() + species[0][1:]
104 species_present.append(' '.join(species))
105 sys.exit('Only these species are available:' + '\n' + '\n'.join(species_present))
106
107 return reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, \
108 typing_file, typing_sequences, typing_headers, typing_rules, typing_config
109
110
111 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True):
112 command = ['samtools', 'faidx', fasta, '', '', '']
113 shell_true = False
114 if region_None is not None:
115 command[3] = region_None
116 if region_outfile_none is not None:
117 command[4] = '>'
118 command[5] = region_outfile_none
119 shell_true = True
120 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, shell_true, None, print_comand_True)
121 return run_successfully, stdout
122
123
124 def indexSequenceBowtie2(referenceFile, threads):
125 if os.path.isfile(str(referenceFile + '.1.bt2')):
126 run_successfully = True
127 else:
128 command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile]
129 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
130 return run_successfully
131
132
133 def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc):
134 sam_file = os.path.join(outdir, str('alignment.sam'))
135
136 run_successfully = indexSequenceBowtie2(referenceFile, threads)
137 if run_successfully:
138 command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '',
139 '--no-unal', '-S', sam_file]
140
141 if len(fastq_files) == 1:
142 command[9] = '-U ' + fastq_files[0]
143 elif len(fastq_files) == 2:
144 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
145 else:
146 return False, None
147
148 if conserved_True:
149 command[4] = '--sensitive'
150 else:
151 command[4] = '--very-sensitive-local'
152
153 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
154
155 if not run_successfully:
156 sam_file = None
157
158 return run_successfully, sam_file
159
160
161 def sortAlignment(alignment_file, output_file, sortByName_True, threads):
162 outFormat_string = os.path.splitext(output_file)[1][1:].lower()
163 command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file]
164 if sortByName_True:
165 command[6] = '-n'
166 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
167 if not run_successfully:
168 output_file = None
169 return run_successfully, output_file
170
171
172 def indexAlignment(alignment_file):
173 command = ['samtools', 'index', alignment_file]
174 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
175 return run_successfully
176
177
178 def mapping_reads(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc):
179 print('\n' + 'Mapping the reads' + '\n')
180 run_successfully, sam_file = run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc)
181 bam_file = None
182 if run_successfully:
183 run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False,
184 threads)
185 if run_successfully:
186 os.remove(sam_file)
187 run_successfully = indexAlignment(bam_file)
188 if run_successfully:
189 index_fasta_samtools(referenceFile, None, None, True)
190 return run_successfully, bam_file
191
192
193 def include_rematch_dependencies_path():
194 original_rematch = None
195 command = ['which', 'rematch.py']
196 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
197 if run_successfully:
198 original_rematch = stdout.splitlines()[0]
199
200 resource_rematch = None
201 try:
202 resource_rematch = resource_filename('ReMatCh', 'rematch.py')
203 except ModuleNotFoundError:
204 resource_rematch = original_rematch
205 else:
206 print('\n'
207 'Using ReMatCh "{resource_rematch}" via "{original_rematch}"\n'.format(resource_rematch=resource_rematch,
208 original_rematch=original_rematch))
209
210 if resource_rematch is not None:
211 utils.setPATHvariable(False, resource_rematch)
212 else:
213 sys.exit('ReMatCh not found in the PATH')
214
215 return resource_rematch
216
217
218 def split_bam(bam_file, list_sequences, outdir, threads):
219 new_bam = os.path.join(outdir, 'partial.bam')
220 command = ['samtools', 'view', '-b', '-u', '-h', '-o', new_bam, '-@', str(threads), bam_file,
221 ' '.join(list_sequences)]
222 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
223 return run_successfully, new_bam
224
225
226 def parse_config(config_file):
227 config = {'reference_file': None, 'length_extra_seq': None, 'maximum_number_absent_genes': None,
228 'maximum_number_genes_multiple_alleles': None, 'minimum_read_coverage': None,
229 'minimum_depth_presence': None, 'minimum_depth_call': None,
230 'minimum_depth_frequency_dominant_allele': None, 'minimum_gene_coverage': None,
231 'minimum_gene_identity': None}
232
233 with open(config_file, 'rt') as reader:
234 field = None
235 for line in reader:
236 line = line.splitlines()[0]
237 if len(line) > 0:
238 line = line.split(' ')[0]
239 if line.startswith('#'):
240 line = line[1:].split(' ')[0]
241 field = line
242 else:
243 if field is not None:
244 if field in ['length_extra_seq', 'maximum_number_absent_genes',
245 'maximum_number_genes_multiple_alleles', 'minimum_read_coverage',
246 'minimum_depth_presence', 'minimum_depth_call', 'minimum_gene_coverage',
247 'minimum_gene_identity']:
248 line = int(line)
249 if field in ['minimum_gene_coverage', 'minimum_gene_identity']:
250 if line < 0 or line > 100:
251 sys.exit('minimum_gene_coverage in trueCoverage_rematch config file must be an'
252 ' integer between 0 and 100')
253 elif field == 'minimum_depth_frequency_dominant_allele':
254 line = float(line)
255 if line < 0 or line > 1:
256 sys.exit('minimum_depth_frequency_dominant_allele in trueCoverage_rematch config file'
257 ' must be a double between 0 and 1')
258 config[field] = line
259 field = None
260
261 for field in config:
262 if config[field] is None:
263 sys.exit(field + ' in trueCoverage_rematch config file is missing')
264
265 return config
266
267
268 def clean_pathotyping_folder(outdir, reference_file, debug_mode_true):
269 if not debug_mode_true:
270 files = [f for f in os.listdir(outdir) if not f.startswith('.') and os.path.isfile(os.path.join(outdir, f))]
271 for file_found in files:
272 if file_found.startswith(('alignment.', os.path.splitext(os.path.basename(reference_file))[0])):
273 file_found = os.path.join(outdir, file_found)
274 os.remove(file_found)
275
276
277 def write_sequeces(out_file, sequences_dict):
278 with open(out_file, 'wt') as writer:
279 for header in sequences_dict:
280 writer.write('>' + header + '\n')
281 writer.write('\n'.join(utils.chunkstring(sequences_dict[header]['sequence'], 80)) + '\n')
282
283
284 def main():
285 parser = argparse.ArgumentParser(prog='patho_typing.py',
286 description='In silico pathogenic typing directly from raw Illumina reads',
287 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
288 parser.add_argument('--version', help='Version information', action='version',
289 version='{prog} v{version}'.format(prog=parser.prog, version=__version__))
290
291 parser_required = parser.add_argument_group('Required options')
292 parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'),
293 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'),
294 help='Path to single OR paired-end fastq files. If two files are passed, they will be'
295 ' assumed as being the paired fastq files', required=True)
296 parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'),
297 help='Species name', required=True)
298
299 parser_optional_general = parser.add_argument_group('General facultative options')
300 parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
301 help='Path to the directory where the information will be stored',
302 required=False, default='.')
303 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
304 required=False, default=1)
305 parser_optional_general.add_argument('--trueCoverage', action='store_true',
306 help='Assess true coverage before continue typing')
307 parser_optional_general.add_argument('--noCheckPoint', action='store_true',
308 help='Ignore the true coverage checking point')
309 parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N',
310 help='Minimum typing percentage of target reference gene sequence covered to'
311 ' consider a gene to be present (value between [0, 100])', required=False)
312 parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N',
313 help='Minimum typing percentage of identity of reference gene sequence covered'
314 ' to consider a gene to be present (value between [0, 100]). One INDEL'
315 ' will be considered as one difference', required=False)
316 parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N',
317 help='Minimum typing gene average coverage depth of present positions to'
318 ' consider a gene to be present (default is 1/3 of average sample'
319 ' coverage or 15x)', required=False)
320 parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true',
321 help='Do not remove ReMatCh consensus sequences')
322 parser_optional_general.add_argument('--debug', action='store_true',
323 help='DeBug Mode: do not remove temporary files')
324
325 args = parser.parse_args()
326
327 if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100):
328 parser.error('--minGeneCoverage should be a value between [0, 100]')
329 if args.minGeneIdentity is not None and (args.minGeneIdentity < 0 or args.minGeneIdentity > 100):
330 parser.error('--minGeneIdentity should be a value between [0, 100]')
331
332 start_time = time.time()
333
334 args.outdir = os.path.abspath(args.outdir)
335 if not os.path.isdir(args.outdir):
336 os.makedirs(args.outdir)
337
338 # Start logger
339 logfile, time_str = utils.start_logger(args.outdir)
340
341 script_path = utils.general_information(logfile, __version__, args.outdir, time_str)
342 print('\n')
343
344 rematch = include_rematch_dependencies_path()
345
346 args.fastq = [fastq.name for fastq in args.fastq]
347
348 reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \
349 typing_sequences, typing_headers, typing_rules, typing_config = \
350 set_reference(args.species, args.outdir, script_path, args.trueCoverage)
351 original_reference_file = str(reference_file)
352
353 run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1)
354 if run_successfully:
355 rematch_dir = os.path.join(args.outdir, 'rematch', '')
356 if not os.path.isdir(rematch_dir):
357 os.makedirs(rematch_dir)
358
359 if args.trueCoverage:
360 if trueCoverage_file is not None:
361 trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '')
362 if not os.path.isdir(trueCoverage_dir):
363 os.makedirs(trueCoverage_dir)
364
365 print('\n')
366 run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir,
367 args.threads)
368 if run_successfully:
369 run_successfully = indexAlignment(trueCoverage_bam)
370 if run_successfully:
371 reference_file = os.path.join(trueCoverage_dir, 'reference.fasta')
372 write_sequeces(reference_file, trueCoverage_sequences)
373 index_fasta_samtools(reference_file, None, None, True)
374 config = parse_config(trueCoverage_config)
375 runtime, run_successfully, sample_data_general, data_by_gene = \
376 run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam,
377 args.threads, config['length_extra_seq'],
378 config['minimum_depth_presence'], config['minimum_depth_call'],
379 config['minimum_depth_frequency_dominant_allele'],
380 config['minimum_gene_coverage'], config['minimum_gene_identity'],
381 args.debug, args.doNotRemoveConsensus)
382
383 if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \
384 sample_data_general['number_absent_genes'] is not None and \
385 sample_data_general['number_genes_multiple_alleles'] is not None:
386 if args.minGeneDepth is None:
387 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
388 sample_data_general['mean_sample_coverage'] / 3 > 15 else \
389 15
390
391 exit_info = []
392 if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']:
393 exit_info.append('Sample coverage ({mean}) lower than the minimum'
394 ' required ({minimum})'
395 ''.format(mean=sample_data_general['mean_sample_coverage'],
396 minimum=config['minimum_read_coverage']))
397 if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']:
398 exit_info.append('Number of absent genes ({number}) higher than the'
399 ' maximum allowed ({maximum})'
400 ''.format(number=sample_data_general['number_absent_genes'],
401 maximum=config['maximum_number_absent_genes']))
402 if sample_data_general['number_genes_multiple_alleles'] > \
403 config['maximum_number_genes_multiple_alleles']:
404 exit_info.append('Number of genes with multiple alleles'
405 ' ({number}) higher than the maximum'
406 ' allowed ({maximum})'
407 ''.format(number=sample_data_general['number_genes_multiple_alleles'],
408 maximum=config['maximum_number_genes_multiple_alleles']))
409
410 if len(exit_info) > 0:
411 print('\n' + '\n'.join(exit_info) + '\n')
412 e = 'TrueCoverage requirements not fulfilled'
413 print('\n' + e + '\n')
414 if not args.noCheckPoint:
415 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
416 _ = utils.runTime(start_time)
417 sys.exit(e)
418 else:
419 e = 'TrueCoverage module did not run successfully'
420 print('\n' + e + '\n')
421 if not args.noCheckPoint:
422 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
423 _ = utils.runTime(start_time)
424 sys.exit(e)
425
426 print('\n')
427 typing_dir = os.path.join(rematch_dir, 'typing', '')
428 if not os.path.isdir(typing_dir):
429 os.makedirs(typing_dir)
430 run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
431 if run_successfully:
432 run_successfully = indexAlignment(bam_file)
433 if run_successfully:
434 reference_file = os.path.join(typing_dir, 'reference.fasta')
435 write_sequeces(reference_file, typing_sequences)
436 index_fasta_samtools(reference_file, None, None, True)
437 rematch_dir = str(typing_dir)
438 if not run_successfully:
439 if args.noCheckPoint:
440 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
441 _ = utils.runTime(start_time)
442 sys.exit('Something in the required TrueCoverage analysis went wrong')
443 else:
444 print('\n'
445 'WARNING: it was not found trueCoverage target files. trueCoverage will not run.'
446 '\n')
447
448 if run_successfully:
449 config = parse_config(typing_config)
450 if args.minGeneCoverage is not None:
451 config['minimum_gene_coverage'] = args.minGeneCoverage
452 if args.minGeneIdentity is not None:
453 config['minimum_gene_identity'] = args.minGeneIdentity
454
455 runtime, run_successfully, sample_data_general, data_by_gene = \
456 run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads,
457 config['length_extra_seq'], config['minimum_depth_presence'],
458 config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'],
459 config['minimum_gene_coverage'], config['minimum_gene_identity'],
460 args.debug, args.doNotRemoveConsensus)
461 if run_successfully and data_by_gene is not None:
462 if args.minGeneDepth is None:
463 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
464 sample_data_general['mean_sample_coverage'] / 3 > 15 else \
465 15
466 else:
467 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
468 _ = utils.runTime(start_time)
469 sys.exit('ReMatCh run for pathotyping did not run successfully')
470 else:
471 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
472 _ = utils.runTime(start_time)
473 sys.exit('Something did not run successfully')
474
475 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
476
477 print('\n')
478 _ = utils.runTime(start_time)
479
480
481 if __name__ == "__main__":
482 main()