comparison scripts/ReMatCh/modules/rematch_module.py @ 0:c6bab5103a14 draft

"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author iss
date Mon, 21 Mar 2022 15:23:09 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c6bab5103a14
1 import os.path
2 import multiprocessing
3 import functools
4 import sys
5 import pickle
6
7 # https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change
8 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
9 import utils
10
11
12 def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true):
13 command = ['samtools', 'faidx', fasta, '', '', '']
14 shell_true = False
15 if region_none is not None:
16 command[3] = region_none
17 if region_outfile_none is not None:
18 command[4] = '>'
19 command[5] = region_outfile_none
20 shell_true = True
21 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, shell_true, None, print_comand_true)
22 return run_successfully, stdout
23
24
25 # Indexing reference file using Bowtie2
26 def index_sequence_bowtie2(reference_file, threads):
27 if os.path.isfile(str(reference_file + '.1.bt2')):
28 run_successfully = True
29 else:
30 command = ['bowtie2-build', '--threads', str(threads), reference_file, reference_file]
31 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
32 return run_successfully
33
34
35 # Mapping with Bowtie2
36 def mapping_bowtie2(fastq_files, reference_file, threads, outdir, num_map_loc,
37 bowtie_algorithm='--very-sensitive-local', bowtie_opt=None):
38 sam_file = os.path.join(outdir, str('alignment.sam'))
39
40 # Index reference file
41 run_successfully = index_sequence_bowtie2(reference_file, threads)
42
43 if run_successfully:
44 command = ['bowtie2', '', '', '-q', bowtie_algorithm, '--threads', str(threads), '-x',
45 reference_file, '', '--no-unal', '', '-S', sam_file]
46
47 if num_map_loc is not None and num_map_loc > 1:
48 command[1] = '-k'
49 command[2] = str(num_map_loc)
50
51 if len(fastq_files) == 1:
52 command[9] = '-U ' + fastq_files[0]
53 elif len(fastq_files) == 2:
54 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
55 else:
56 return False, None
57
58 if bowtie_opt is not None:
59 command[11] = bowtie_opt
60
61 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
62
63 if not run_successfully:
64 sam_file = None
65
66 return run_successfully, sam_file
67
68
69 def split_cigar(cigar):
70 cigars = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']
71
72 splited_cigars = []
73 numbers = ''
74 for char in cigar:
75 if char not in cigars:
76 numbers += char
77 else:
78 splited_cigars.append([int(numbers), char])
79 numbers = ''
80
81 return splited_cigars
82
83
84 def recode_cigar_based_on_base_quality(cigar, bases_quality, soft_clip_base_quality, mapping_position,
85 direct_strand_true, soft_clip_cigar_flag_recode):
86 cigar = split_cigar(cigar)
87 soft_left = []
88 soft_right = []
89 cigar_flags_for_reads_length = ('M', 'I', 'S', '=', 'X')
90 read_length_without_right_s = sum([cigar_part[0] for cigar_part in cigar if
91 cigar_part[1] in cigar_flags_for_reads_length]) - \
92 (cigar[len(cigar) - 1][0] if cigar[len(cigar) - 1][1] == 'S' else 0)
93 for x, base in enumerate(bases_quality):
94 if ord(base) - 33 >= soft_clip_base_quality:
95 if x <= cigar[0][0] - 1:
96 if cigar[0][1] == 'S':
97 soft_left.append(x)
98 elif x > read_length_without_right_s - 1:
99 if cigar[len(cigar) - 1][1] == 'S':
100 soft_right.append(x)
101
102 left_changed = (False, 0)
103 if len(soft_left) > 0:
104 soft_left = min(soft_left) + 1
105 if soft_left == 1:
106 cigar = [[cigar[0][0], soft_clip_cigar_flag_recode]] + cigar[1:]
107 left_changed = (True, cigar[0][0])
108 elif cigar[0][0] - soft_left > 0:
109 cigar = [[soft_left, 'S']] + [[cigar[0][0] - soft_left, soft_clip_cigar_flag_recode]] + cigar[1:]
110 left_changed = (True, cigar[0][0] - soft_left)
111
112 right_changed = (False, 0)
113 if len(soft_right) > 0:
114 soft_right = max(soft_right) + 1
115 cigar = cigar[:-1]
116 if soft_right - read_length_without_right_s > 0:
117 cigar.append([soft_right - read_length_without_right_s, soft_clip_cigar_flag_recode])
118 right_changed = (True, soft_right - read_length_without_right_s)
119 if len(bases_quality) - soft_right > 0:
120 cigar.append([len(bases_quality) - soft_right, 'S'])
121
122 if left_changed[0]:
123 # if direct_strand_true:
124 mapping_position = mapping_position - left_changed[1]
125 # if right_changed[0]:
126 # if not direct_strand_true:
127 # mapping_position = mapping_position + right_changed[1]
128
129 return ''.join([str(cigar_part[0]) + cigar_part[1] for cigar_part in cigar]), str(mapping_position)
130
131
132 def verify_is_forward_read(sam_flag_bit):
133 # 64 = 1000000
134 forward_read = False
135 bit = format(sam_flag_bit, 'b').zfill(7)
136 if bit[-7] == '1':
137 forward_read = True
138 return forward_read
139
140
141 def verify_mapped_direct_strand(sam_flag_bit):
142 # 16 = 10000 -> mapped in reverse strand
143 direct_strand = False
144 bit = format(sam_flag_bit, 'b').zfill(5)
145 if bit[-5] == '0':
146 direct_strand = True
147 return direct_strand
148
149
150 def verify_mapped_tip(reference_length, mapping_position, cigar):
151 tip = False
152 if 'S' in cigar:
153 cigar = split_cigar(cigar)
154 if cigar[0][1] == 'S':
155 if mapping_position - cigar[0][0] < 0:
156 tip = True
157 if cigar[len(cigar) - 1][1] == 'S':
158 if mapping_position + cigar[len(cigar) - 1][0] > reference_length:
159 tip = True
160 return tip
161
162
163 def change_sam_flag_bit_mapped_reverse_strand_2_direct_strand(sam_flag_bit):
164 bit = list(format(sam_flag_bit, 'b').zfill(5))
165 bit[-5] = '0'
166 return int(''.join(bit), 2)
167
168
169 def change_sam_flag_bit_mate_reverse_strand_2_direct_strand(sam_flag_bit):
170 bit = list(format(sam_flag_bit, 'b').zfill(6))
171 bit[-6] = '0'
172 return int(''.join(bit), 2)
173
174
175 def move_read_mapped_reverse_strand_2_direct_strand(seq, bases_quality, sam_flag_bit, cigar):
176 seq = utils.reverse_complement(seq)
177 bases_quality = ''.join(reversed(list(bases_quality)))
178 sam_flag_bit = change_sam_flag_bit_mapped_reverse_strand_2_direct_strand(sam_flag_bit)
179 cigar = ''.join([str(cigar_part[0]) + cigar_part[1] for cigar_part in reversed(split_cigar(cigar))])
180 return seq, bases_quality, str(sam_flag_bit), cigar
181
182
183 @utils.trace_unhandled_exceptions
184 def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_base_quality, sequences_length,
185 soft_clip_cigar_flag_recode):
186 lines_sam = []
187 for line in line_collection:
188 line = line.rstrip('\r\n')
189 if len(line) > 0:
190 if line.startswith('@'):
191 lines_sam.append(line)
192 else:
193 line = line.split('\t')
194 if not verify_mapped_tip(sequences_length[line[2]], int(line[3]), line[5]):
195 line[5], line[3] = recode_cigar_based_on_base_quality(line[5], line[10], soft_clip_base_quality,
196 int(line[3]),
197 verify_mapped_direct_strand(int(line[1])),
198 soft_clip_cigar_flag_recode)
199 lines_sam.append('\t'.join(line))
200 with open(pickle_file, 'wb') as writer:
201 pickle.dump(lines_sam, writer)
202
203
204 def recode_soft_clipping_from_sam(sam_file, outdir, threads, soft_clip_base_quality, reference_dict,
205 soft_clip_cigar_flag_recode):
206 pickle_files = []
207 sequences_length = {}
208 for x, seq_info in list(reference_dict.items()):
209 sequences_length[seq_info['header']] = seq_info['length']
210
211 with open(sam_file, 'rtU') as reader:
212 pool = multiprocessing.Pool(processes=threads)
213 line_collection = []
214 x = 0
215 for x, line in enumerate(reader):
216 line_collection.append(line)
217 if x % 10000 == 0:
218 pickle_file = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl')
219 pickle_files.append(pickle_file)
220 pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickle_file,
221 soft_clip_base_quality, sequences_length,
222 soft_clip_cigar_flag_recode,))
223 line_collection = []
224 if len(line_collection) > 0:
225 pickle_file = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl')
226 pickle_files.append(pickle_file)
227 pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickle_file,
228 soft_clip_base_quality, sequences_length,
229 soft_clip_cigar_flag_recode,))
230 pool.close()
231 pool.join()
232
233 os.remove(sam_file)
234
235 new_sam_file = os.path.join(outdir, 'alignment_with_soft_clipping_recoded.sam')
236 with open(new_sam_file, 'wt') as writer:
237 for pickle_file in pickle_files:
238 if os.path.isfile(pickle_file):
239 lines_sam = None
240 with open(pickle_file, 'rb') as reader:
241 lines_sam = pickle.load(reader)
242 if lines_sam is not None:
243 for line in lines_sam:
244 writer.write(line + '\n')
245 os.remove(pickle_file)
246
247 return new_sam_file
248
249
250 # Sort alignment file
251 def sort_alignment(alignment_file, output_file, sort_by_name_true, threads):
252 out_format_string = os.path.splitext(output_file)[1][1:].lower()
253 command = ['samtools', 'sort', '-o', output_file, '-O', out_format_string, '', '-@', str(threads), alignment_file]
254 if sort_by_name_true:
255 command[6] = '-n'
256 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
257 if not run_successfully:
258 output_file = None
259 return run_successfully, output_file
260
261
262 # Index alignment file
263 def index_alignment(alignment_file):
264 command = ['samtools', 'index', alignment_file]
265 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
266 return run_successfully
267
268
269 def mapping_reads(fastq_files, reference_file, threads, outdir, num_map_loc, rematch_run,
270 soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode,
271 bowtie_algorithm, bowtie_opt, clean_run=True):
272 # Create a symbolic link to the reference_file
273 if clean_run:
274 reference_link = os.path.join(outdir, os.path.basename(reference_file))
275 if os.path.islink(reference_link):
276 os.unlink(reference_link)
277 os.symlink(reference_file, reference_link)
278 reference_file = reference_link
279
280 bam_file = None
281 # Mapping reads using Bowtie2
282 run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=reference_file,
283 threads=threads, outdir=outdir, num_map_loc=num_map_loc,
284 bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt)
285
286 if run_successfully:
287 # Remove soft clipping
288 if rematch_run == soft_clip_recode_run or soft_clip_recode_run == 'both':
289 print('Recoding soft clipped regions')
290 sam_file = recode_soft_clipping_from_sam(sam_file, outdir, threads, soft_clip_base_quality, reference_dict,
291 soft_clip_cigar_flag_recode)
292
293 # Convert sam to bam and sort bam
294 run_successfully, bam_file = sort_alignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False,
295 threads)
296
297 if run_successfully:
298 os.remove(sam_file)
299 # Index bam
300 run_successfully = index_alignment(bam_file)
301
302 return run_successfully, bam_file, reference_file
303
304
305 def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file):
306 gene_vcf = os.path.join(outdir, 'samtools_mpileup.sequence_' + str(counter) + '.vcf')
307
308 command = ['samtools', 'mpileup', '--count-orphans', '--no-BAQ', '--min-BQ', '0', '--min-MQ', str(7), '--fasta-ref',
309 reference_file, '--region', sequence_to_analyse, '--output', gene_vcf, '--VCF', '--uncompressed',
310 '--output-tags', 'INFO/AD,AD,DP', bam_file]
311
312 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
313 if not run_successfully:
314 gene_vcf = None
315 return run_successfully, gene_vcf
316
317
318 # Read vcf file
319 class Vcf:
320 def __init__(self, vcf_file, encoding=None, newline=None):
321 try:
322 self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline)
323 except TypeError:
324 self.vcf = open(vcf_file, 'rt')
325 self.line_read = self.vcf.readline()
326 self.contigs_info_dict = {}
327 while self.line_read.startswith('#'):
328 if self.line_read.startswith('##contig=<ID='):
329 seq = self.line_read.split('=')[2].split(',')[0]
330 seq_len = self.line_read.split('=')[3].split('>')[0]
331 self.contigs_info_dict[seq] = int(seq_len)
332 self.line_read = self.vcf.readline()
333 self.line = self.line_read
334
335 def readline(self):
336 line_stored = self.line
337 self.line = self.vcf.readline()
338 return line_stored
339
340 def close(self):
341 self.vcf.close()
342
343 def get_contig_legth(self, contig):
344 return self.contigs_info_dict[contig]
345
346
347 def get_variants(gene_vcf, seq_name, encoding=None, newline=None):
348 variants = {}
349
350 vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline)
351 line = vfc_file.readline()
352 counter = 1
353 while len(line) > 0:
354 fields = line.rstrip('\r\n').split('\t')
355 if len(fields) > 0:
356 fields[1] = int(fields[1])
357
358 info_field = {}
359 try:
360 for i in fields[7].split(';'):
361 i = i.split('=')
362 if len(i) > 1:
363 info_field[i[0]] = i[1]
364 else:
365 info_field[i[0]] = None
366 except IndexError:
367 if counter > vfc_file.get_contig_legth(contig=seq_name):
368 break
369 else:
370 raise IndexError
371
372 format_field = {}
373 format_field_name = fields[8].split(':')
374 format_data = fields[9].split(':')
375
376 for i in range(0, len(format_data)):
377 format_field[format_field_name[i]] = format_data[i].split(',')
378
379 fields_to_store = {'REF': fields[3], 'ALT': fields[4].split(','), 'info': info_field,
380 'format': format_field}
381 if fields[1] in variants:
382 variants[fields[1]][len(variants[fields[1]])] = fields_to_store
383 else:
384 variants[fields[1]] = {0: fields_to_store}
385
386 try:
387 line = vfc_file.readline()
388 except UnicodeDecodeError:
389 if counter + 1 > vfc_file.get_contig_legth(contig=seq_name):
390 break
391 else:
392 raise UnicodeDecodeError
393
394 counter += 1
395 vfc_file.close()
396
397 return variants
398
399
400 def indel_entry(variant_position):
401 entry_with_indel = []
402 entry_with_snp = None
403 for i in variant_position:
404 keys = list(variant_position[i]['info'].keys())
405 if 'INDEL' in keys:
406 entry_with_indel.append(i)
407 else:
408 entry_with_snp = i
409
410 return entry_with_indel, entry_with_snp
411
412
413 def get_alt_no_matter(variant_position, indel_true):
414 dp = sum(map(int, variant_position['format']['AD']))
415 index_alleles_sorted_position = sorted(zip(list(map(int, variant_position['format']['AD'])),
416 list(range(0, len(variant_position['format']['AD'])))),
417 reverse=True)
418 index_dominant_allele = None
419 if not indel_true:
420 ad_idv = index_alleles_sorted_position[0][0]
421
422 if len([x for x in index_alleles_sorted_position if x[0] == ad_idv]) > 1:
423 index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if x[0] == ad_idv])
424
425 index_dominant_allele = index_alleles_sorted_position[0][1]
426 if index_dominant_allele == 0:
427 alt = '.'
428 else:
429 alt = variant_position['ALT'][index_dominant_allele - 1]
430
431 else:
432 ad_idv = int(variant_position['info']['IDV'])
433
434 if float(ad_idv) / float(dp) >= 0.5:
435 if len([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) > 1:
436 index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if
437 x[0] == index_alleles_sorted_position[0][0]])
438
439 index_dominant_allele = index_alleles_sorted_position[0][1]
440 if index_dominant_allele == 0:
441 alt = '.'
442 else:
443 alt = variant_position['ALT'][index_dominant_allele - 1]
444 else:
445 ad_idv = int(variant_position['format']['AD'][0])
446 alt = '.'
447
448 return alt, dp, ad_idv, index_dominant_allele
449
450
451 def count_number_diferences(ref, alt):
452 number_diferences = 0
453
454 if len(ref) != len(alt):
455 number_diferences += 1
456
457 for i in range(0, min(len(ref), len(alt))):
458 if alt[i] != 'N' and ref[i] != alt[i]:
459 number_diferences += 1
460
461 return number_diferences
462
463
464 def get_alt_correct(variant_position, alt_no_matter, dp, ad_idv, index_dominant_allele, minimum_depth_presence,
465 minimum_depth_call, minimum_depth_frequency_dominant_allele):
466 alt = None
467 low_coverage = False
468 multiple_alleles = False
469
470 if dp >= minimum_depth_presence:
471 if dp < minimum_depth_call:
472 alt = 'N' * len(variant_position['REF'])
473 low_coverage = True
474 else:
475 if ad_idv < minimum_depth_call:
476 alt = 'N' * len(variant_position['REF'])
477 low_coverage = True
478 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele:
479 multiple_alleles = True
480 else:
481 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele:
482 alt = 'N' * len(variant_position['REF'])
483 if index_dominant_allele is not None:
484 variants_coverage = [int(variant_position['format']['AD'][i]) for i in
485 range(0, len(variant_position['ALT']) + 1) if i != index_dominant_allele]
486 if sum(variants_coverage) > 0:
487 if float(max(variants_coverage)) / float(sum(variants_coverage)) > 0.5:
488 multiple_alleles = True
489 elif float(max(variants_coverage)) / float(sum(variants_coverage)) == 0.5 and \
490 len(variants_coverage) > 2:
491 multiple_alleles = True
492 else:
493 multiple_alleles = True
494 else:
495 alt = alt_no_matter
496 else:
497 low_coverage = True
498
499 return alt, low_coverage, multiple_alleles
500
501
502 def get_alt_alignment(ref, alt):
503 if alt is None:
504 alt = 'N' * len(ref)
505 else:
506 if len(ref) != len(alt):
507 if len(alt) < len(ref):
508 if alt == '.':
509 alt = ref
510 alt += 'N' * (len(ref) - len(alt))
511 else:
512 if alt[:len(ref)] == ref:
513 alt = '.'
514 else:
515 alt = alt[:len(ref)]
516
517 return alt
518
519
520 def get_indel_more_likely(variant_position, indels_entry):
521 indel_coverage = {}
522 for i in indels_entry:
523 indel_coverage[i] = int(variant_position['info']['IDV'])
524 return indel_coverage.index(str(max(indel_coverage.values())))
525
526
527 def determine_variant(variant_position, minimum_depth_presence, minimum_depth_call,
528 minimum_depth_frequency_dominant_allele, indel_true):
529 alt_no_matter, dp, ad_idv, index_dominant_allele = get_alt_no_matter(variant_position, indel_true)
530
531 alt_correct, low_coverage, multiple_alleles = get_alt_correct(variant_position, alt_no_matter, dp, ad_idv,
532 index_dominant_allele, minimum_depth_presence,
533 minimum_depth_call,
534 minimum_depth_frequency_dominant_allele)
535
536 alt_alignment = get_alt_alignment(variant_position['REF'], alt_correct)
537
538 return variant_position['REF'], alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment
539
540
541 def confirm_nucleotides_indel(ref, alt, variants, position_start_indel, minimum_depth_presence, minimum_depth_call,
542 minimum_depth_frequency_dominant_allele, alignment_true):
543 alt = list(alt)
544
545 for i in range(0, len(alt) - 1):
546 if len(alt) < len(ref):
547 new_position = position_start_indel + len(alt) - i - 1
548 alt_position = len(alt) - i - 1
549 else:
550 if i + 1 > len(ref):
551 break
552 new_position = position_start_indel + 1 + i
553 alt_position = 1 + i
554
555 if alt[alt_position] != 'N':
556 if new_position not in variants:
557 if alignment_true:
558 alt[alt_position] = 'N'
559 else:
560 alt = alt[: alt_position]
561 break
562
563 entry_with_indel, entry_with_snp = indel_entry(variants[new_position])
564 new_ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
565 determine_variant(variants[new_position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
566 minimum_depth_frequency_dominant_allele, False)
567 if alt_no_matter != '.' and alt[alt_position] != alt_no_matter:
568 alt[alt_position] = alt_no_matter
569
570 return ''.join(alt)
571
572
573 def snp_indel(variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
574 entry_with_indel, entry_with_snp = indel_entry(variants[position])
575
576 if len(entry_with_indel) == 0:
577 ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
578 determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
579 minimum_depth_frequency_dominant_allele, False)
580 else:
581 ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_no_matter_snp, alt_alignment_snp = \
582 determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
583 minimum_depth_frequency_dominant_allele, False)
584
585 indel_more_likely = entry_with_indel[0]
586 if len(entry_with_indel) > 1:
587 indel_more_likely = get_indel_more_likely(variants[position], entry_with_indel)
588
589 ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
590 determine_variant(variants[position][indel_more_likely], minimum_depth_presence, minimum_depth_call,
591 minimum_depth_frequency_dominant_allele, True)
592
593 if alt_no_matter == '.':
594 ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
595 ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_no_matter_snp, alt_alignment_snp
596 else:
597 if alt_correct is None and alt_correct_snp is not None:
598 alt_correct = alt_correct_snp
599 elif alt_correct is not None and alt_correct_snp is not None:
600 if alt_correct_snp != '.' and alt_correct[0] != alt_correct_snp:
601 alt_correct = alt_correct_snp + alt_correct[1:] if len(alt_correct) > 1 else alt_correct_snp
602 if alt_no_matter_snp != '.' and alt_no_matter[0] != alt_no_matter_snp:
603 alt_no_matter = alt_no_matter_snp + alt_no_matter[1:] if len(alt_no_matter) > 1 else alt_no_matter_snp
604 if alt_alignment_snp != '.' and alt_alignment[0] != alt_alignment_snp:
605 alt_alignment = alt_alignment_snp + alt_alignment[1:] if len(alt_alignment) > 1 else alt_alignment_snp
606
607 # if alt_no_matter != '.':
608 # alt_no_matter = confirm_nucleotides_indel(ref, alt_no_matter, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
609 # if alt_correct is not None and alt_correct != '.':
610 # alt_correct = confirm_nucleotides_indel(ref, alt_correct, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
611 # if alt_alignment != '.':
612 # alt_alignment = confirm_nucleotides_indel(ref, alt_alignment, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, True)
613
614 return ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment
615
616
617 def get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
618 sequence):
619 variants_correct = {}
620 variants_no_matter = {}
621 variants_alignment = {}
622
623 correct_absent_positions = {}
624 correct_last_absent_position = ''
625
626 no_matter_absent_positions = {}
627 no_matter_last_absent_position = ''
628
629 multiple_alleles_found = []
630
631 counter = 1
632 while counter <= len(sequence):
633 if counter in variants:
634 no_matter_last_absent_position = ''
635
636 ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
637 snp_indel(variants, counter, minimum_depth_presence, minimum_depth_call,
638 minimum_depth_frequency_dominant_allele)
639
640 if alt_alignment != '.':
641 variants_alignment[counter] = {'REF': ref, 'ALT': alt_alignment}
642
643 if alt_no_matter != '.':
644 variants_no_matter[counter] = {'REF': ref, 'ALT': alt_no_matter}
645
646 if alt_correct is None:
647 if counter - len(correct_last_absent_position) in correct_absent_positions:
648 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += ref
649 else:
650 correct_absent_positions[counter] = {'REF': ref, 'ALT': ''}
651 correct_last_absent_position += ref
652 else:
653 if alt_correct != '.':
654 if len(alt_correct) < len(ref):
655 if len(alt_correct) == 1:
656 correct_absent_positions[counter + 1] = {'REF': ref[1:], 'ALT': ''}
657 else:
658 correct_absent_positions[counter + 1] = {'REF': ref[1:], 'ALT': alt_correct[1:]}
659
660 correct_last_absent_position = ref[1:]
661 else:
662 variants_correct[counter] = {'REF': ref, 'ALT': alt_correct}
663 correct_last_absent_position = ''
664 else:
665 correct_last_absent_position = ''
666
667 if multiple_alleles:
668 multiple_alleles_found.append(counter)
669
670 counter += len(ref)
671 else:
672 variants_alignment[counter] = {'REF': sequence[counter - 1], 'ALT': 'N'}
673
674 if counter - len(correct_last_absent_position) in correct_absent_positions:
675 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += sequence[counter - 1]
676 else:
677 correct_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''}
678 correct_last_absent_position += sequence[counter - 1]
679
680 if counter - len(no_matter_last_absent_position) in no_matter_absent_positions:
681 no_matter_absent_positions[counter - len(no_matter_last_absent_position)]['REF'] += \
682 sequence[counter - 1]
683 else:
684 no_matter_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''}
685 no_matter_last_absent_position += sequence[counter - 1]
686
687 counter += 1
688
689 for position in correct_absent_positions:
690 if position == 1:
691 variants_correct[position] = {'REF': correct_absent_positions[position]['REF'], 'ALT': 'N'}
692 else:
693 if position - 1 not in variants_correct:
694 variants_correct[position - 1] = \
695 {'REF': sequence[position - 2] + correct_absent_positions[position]['REF'],
696 'ALT': sequence[position - 2] + correct_absent_positions[position]['ALT']}
697 else:
698 variants_correct[position - 1] = \
699 {'REF': variants_correct[position - 1]['REF'] +
700 correct_absent_positions[position]['REF'][len(variants_correct[position - 1]['REF']) - 1:],
701 'ALT': variants_correct[position - 1]['ALT'] +
702 correct_absent_positions[position]['ALT'][len(variants_correct[position - 1]['ALT']) - 1 if
703 len(variants_correct[position - 1]['ALT']) > 0
704 else 0:]}
705
706 for position in no_matter_absent_positions:
707 if position == 1:
708 variants_no_matter[position] = {'REF': no_matter_absent_positions[position]['REF'], 'ALT': 'N'}
709 else:
710 if position - 1 not in variants_no_matter:
711 variants_no_matter[position - 1] = \
712 {'REF': sequence[position - 2] + no_matter_absent_positions[position]['REF'],
713 'ALT': sequence[position - 2] + no_matter_absent_positions[position]['ALT']}
714 else:
715 variants_no_matter[position - 1] = \
716 {'REF': variants_no_matter[position - 1]['REF'] +
717 no_matter_absent_positions[position]['REF'][len(variants_no_matter[position - 1]['REF']) -
718 1:],
719 'ALT': variants_no_matter[position - 1]['ALT'] +
720 no_matter_absent_positions[position]['ALT'][len(variants_no_matter[position - 1]['ALT']) -
721 1 if
722 len(variants_no_matter[position - 1]['ALT']) > 0
723 else 0:]}
724
725 return variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found
726
727
728 def clean_variant_in_extra_seq_left(variant_dict, position, length_extra_seq, multiple_alleles_found,
729 number_multi_alleles):
730 number_diferences = 0
731
732 if position + len(variant_dict[position]['REF']) - 1 > length_extra_seq:
733 if multiple_alleles_found is not None and position in multiple_alleles_found:
734 number_multi_alleles += 1
735
736 temp_variant = variant_dict[position]
737 del variant_dict[position]
738 variant_dict[length_extra_seq] = {}
739 variant_dict[length_extra_seq]['REF'] = temp_variant['REF'][length_extra_seq - position:]
740 variant_dict[length_extra_seq]['ALT'] = temp_variant['ALT'][length_extra_seq - position:] if \
741 len(temp_variant['ALT']) > length_extra_seq - position else \
742 temp_variant['REF'][length_extra_seq - position]
743 number_diferences = count_number_diferences(variant_dict[length_extra_seq]['REF'],
744 variant_dict[length_extra_seq]['ALT'])
745 else:
746 del variant_dict[position]
747
748 return variant_dict, number_multi_alleles, number_diferences
749
750
751 def clean_variant_in_extra_seq_rigth(variant_dict, position, sequence_length, length_extra_seq):
752 if position + len(variant_dict[position]['REF']) - 1 > sequence_length - length_extra_seq:
753 variant_dict[position]['REF'] = \
754 variant_dict[position]['REF'][: - (position - (sequence_length - length_extra_seq)) + 1]
755 variant_dict[position]['ALT'] = \
756 variant_dict[position]['ALT'][: - (position - (sequence_length - length_extra_seq)) + 1] if \
757 len(variant_dict[position]['ALT']) >= - (position - (sequence_length - length_extra_seq)) + 1 else \
758 variant_dict[position]['ALT']
759
760 number_diferences = count_number_diferences(variant_dict[position]['REF'], variant_dict[position]['ALT'])
761
762 return variant_dict, number_diferences
763
764
765 def cleanning_variants_extra_seq(variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found,
766 length_extra_seq, sequence_length):
767 number_multi_alleles = 0
768 number_diferences = 0
769
770 counter = 1
771 while counter <= sequence_length:
772 if counter <= length_extra_seq:
773 if counter in variants_correct:
774 variants_correct, number_multi_alleles, number_diferences = \
775 clean_variant_in_extra_seq_left(variants_correct, counter, length_extra_seq, multiple_alleles_found,
776 number_multi_alleles)
777 if counter in variants_no_matter:
778 variants_no_matter, ignore, ignore = \
779 clean_variant_in_extra_seq_left(variants_no_matter, counter, length_extra_seq, None, None)
780 if counter in variants_alignment:
781 variants_alignment, ignore, ignore = \
782 clean_variant_in_extra_seq_left(variants_alignment, counter, length_extra_seq, None, None)
783 elif sequence_length - length_extra_seq >= counter > length_extra_seq:
784 if counter in variants_correct:
785 if counter in multiple_alleles_found:
786 number_multi_alleles += 1
787 variants_correct, number_diferences_found = \
788 clean_variant_in_extra_seq_rigth(variants_correct, counter, sequence_length, length_extra_seq)
789 number_diferences += number_diferences_found
790 if counter in variants_no_matter:
791 variants_no_matter, ignore = \
792 clean_variant_in_extra_seq_rigth(variants_no_matter, counter, sequence_length, length_extra_seq)
793 if counter in variants_alignment:
794 variants_alignment, ignore = \
795 clean_variant_in_extra_seq_rigth(variants_alignment, counter, sequence_length, length_extra_seq)
796 else:
797 if counter in variants_correct:
798 del variants_correct[counter]
799 if counter in variants_no_matter:
800 del variants_no_matter[counter]
801 if counter in variants_alignment:
802 del variants_alignment[counter]
803
804 counter += 1
805
806 return variants_correct, variants_no_matter, variants_alignment, number_multi_alleles, number_diferences
807
808
809 def get_coverage(gene_coverage):
810 coverage = {}
811
812 with open(gene_coverage, 'rtU') as reader:
813 for line in reader:
814 line = line.rstrip('\r\n')
815 if len(line) > 0:
816 line = line.split('\t')
817 coverage[int(line[1])] = int(line[2])
818
819 return coverage
820
821
822 def get_coverage_report(coverage, sequence_length, minimum_depth_presence, minimum_depth_call, length_extra_seq):
823 if len(coverage) == 0:
824 return sequence_length - 2 * length_extra_seq, 100.0, 0.0
825
826 count_absent = 0
827 count_low_coverage = 0
828 sum_coverage = 0
829
830 counter = 1
831 while counter <= sequence_length:
832 if sequence_length - length_extra_seq >= counter > length_extra_seq:
833 if coverage[counter] < minimum_depth_presence:
834 count_absent += 1
835 else:
836 if coverage[counter] < minimum_depth_call:
837 count_low_coverage += 1
838 sum_coverage += coverage[counter]
839 counter += 1
840
841 mean_coverage = 0
842 percentage_low_coverage = 0
843 if sequence_length - 2 * length_extra_seq - count_absent > 0:
844 mean_coverage = float(sum_coverage) / float(sequence_length - 2 * length_extra_seq - count_absent)
845 percentage_low_coverage = \
846 float(count_low_coverage) / float(sequence_length - 2 * length_extra_seq - count_absent) * 100
847
848 return count_absent, percentage_low_coverage, mean_coverage
849
850
851 # Get genome coverage data
852 def compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter):
853 genome_coverage_data_file = os.path.join(outdir, 'samtools_depth.sequence_' + str(counter) + '.tab')
854 command = ['samtools', 'depth', '-a', '-q', '0', '-r', sequence_to_analyse, alignment_file, '>',
855 genome_coverage_data_file]
856 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, True, None, False)
857 return run_successfully, genome_coverage_data_file
858
859
860 def write_variants_vcf(variants, outdir, sequence_to_analyse, sufix):
861 vcf_file = os.path.join(outdir, str(sequence_to_analyse + '.' + sufix + '.vcf'))
862 with open(vcf_file, 'wt') as writer:
863 writer.write('##fileformat=VCFv4.2' + '\n')
864 writer.write('#' + '\t'.join(['SEQUENCE', 'POSITION', 'ID_unused', 'REFERENCE_sequence', 'ALTERNATIVE_sequence',
865 'QUALITY_unused', 'FILTER_unused', 'INFO_unused', 'FORMAT_unused']) + '\n')
866 for i in sorted(variants.keys()):
867 writer.write('\t'.join([sequence_to_analyse, str(i), '.', variants[i]['REF'], variants[i]['ALT'], '.', '.',
868 '.', '.']) + '\n')
869
870 compressed_vcf_file = vcf_file + '.gz'
871 command = ['bcftools', 'convert', '-o', compressed_vcf_file, '-O', 'z', vcf_file]
872 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
873 if run_successfully:
874 command = ['bcftools', 'index', compressed_vcf_file]
875 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
876
877 if not run_successfully:
878 compressed_vcf_file = None
879
880 return run_successfully, compressed_vcf_file
881
882
883 def parse_fasta_in_memory(fasta_memory):
884 fasta_memory = fasta_memory.splitlines()
885 sequence_dict = {}
886 for line in fasta_memory:
887 if len(line) > 0:
888 if line.startswith('>'):
889 sequence_dict = {'header': line[1:], 'sequence': ''}
890 else:
891 sequence_dict['sequence'] += line
892
893 return sequence_dict
894
895
896 def compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir):
897 sequence_dict = None
898
899 gene_fasta = os.path.join(outdir, str(sequence_to_analyse + '.fasta'))
900
901 run_successfully, stdout = index_fasta_samtools(reference_file, sequence_to_analyse, gene_fasta, False)
902 if run_successfully:
903 command = ['bcftools', 'norm', '-c', 's', '-f', gene_fasta, '-Ov', compressed_vcf_file]
904 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
905 if run_successfully:
906 command = ['bcftools', 'consensus', '-f', gene_fasta, compressed_vcf_file, '-H', '1']
907 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
908 if run_successfully:
909 sequence_dict = parse_fasta_in_memory(stdout)
910
911 return run_successfully, sequence_dict
912
913
914 def create_sample_consensus_sequence(outdir, sequence_to_analyse, reference_file, variants, minimum_depth_presence,
915 minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence,
916 length_extra_seq):
917 variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found = \
918 get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
919 sequence)
920
921 variants_correct, variants_noMatter, variants_alignment, number_multi_alleles, number_diferences = \
922 cleanning_variants_extra_seq(variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found,
923 length_extra_seq, len(sequence))
924
925 run_successfully = False
926 consensus = {'correct': {}, 'noMatter': {}, 'alignment': {}}
927 for variant_type in ['variants_correct', 'variants_noMatter', 'variants_alignment']:
928 run_successfully, compressed_vcf_file = \
929 write_variants_vcf(eval(variant_type), outdir, sequence_to_analyse, variant_type.split('_', 1)[1])
930 if run_successfully:
931 run_successfully, sequence_dict = \
932 compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir)
933 if run_successfully:
934 consensus[variant_type.split('_', 1)[1]] = \
935 {'header': sequence_dict['header'],
936 'sequence': sequence_dict['sequence'][length_extra_seq:len(sequence_dict['sequence']) -
937 length_extra_seq]}
938
939 return run_successfully, number_multi_alleles, consensus, number_diferences
940
941
942 @utils.trace_unhandled_exceptions
943 def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq,
944 minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
945 count_absent = None
946 percentage_low_coverage = None
947 mean_coverage = None
948 number_diferences = 0
949 number_multi_alleles = 0
950 consensus_sequence = {'correct': {}, 'noMatter': {}, 'alignment': {}}
951
952 # Create vcf file (for multiple alleles check)
953 run_successfully, gene_vcf = create_vcf(bam_file, sequence_information['header'], outdir, counter, reference_file)
954 if run_successfully:
955 # Create coverage tab file
956 run_successfully, gene_coverage = \
957 compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter)
958
959 if run_successfully:
960 try:
961 variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
962 encoding=sys.getdefaultencoding())
963 except UnicodeDecodeError:
964 try:
965 print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
966 ' "utf_8" encoding: {}'.format(gene_vcf))
967 variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
968 encoding='utf_8')
969 except UnicodeDecodeError:
970 print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
971 ' "latin_1" encoding: {}'.format(gene_vcf))
972 variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
973 encoding='latin_1')
974
975 coverage = get_coverage(gene_coverage)
976
977 run_successfully, number_multi_alleles, consensus_sequence, number_diferences = \
978 create_sample_consensus_sequence(outdir, sequence_information['header'], reference_file, variants,
979 minimum_depth_presence, minimum_depth_call,
980 minimum_depth_frequency_dominant_allele,
981 sequence_information['sequence'], length_extra_seq)
982
983 try:
984 count_absent, percentage_low_coverage, mean_coverage = \
985 get_coverage_report(coverage, sequence_information['length'], minimum_depth_presence,
986 minimum_depth_call, length_extra_seq)
987 except KeyError:
988 print('ERROR: KeyError')
989 print(sequence_information)
990 raise KeyError
991
992 utils.save_variable_to_pickle([run_successfully, counter, number_multi_alleles, count_absent,
993 percentage_low_coverage, mean_coverage, consensus_sequence, number_diferences],
994 outdir, str('coverage_info.' + str(counter)))
995
996
997 def clean_header(header):
998 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]
999 new_header = str(header)
1000 if any(x in header for x in problematic_characters):
1001 for x in problematic_characters:
1002 new_header = new_header.replace(x, '_')
1003 return header, new_header
1004
1005
1006 def get_sequence_information(fasta_file, length_extra_seq):
1007 sequence_dict = {}
1008 headers = {}
1009 headers_changed = False
1010
1011 with open(fasta_file, 'rtU') as reader:
1012 blank_line_found = False
1013 sequence_counter = 0
1014 temp_sequence_dict = {}
1015 for line in reader:
1016 line = line.rstrip('\r\n')
1017 if len(line) > 0:
1018 if not blank_line_found:
1019 if line.startswith('>'):
1020 if len(temp_sequence_dict) > 0:
1021 if list(temp_sequence_dict.values())[0]['length'] - 2 * length_extra_seq > 0:
1022 sequence_dict[list(temp_sequence_dict.keys())[0]] = list(temp_sequence_dict.values())[0]
1023 else:
1024 print('{header} sequence ignored due to'
1025 ' length = 0'.format(header=list(temp_sequence_dict.values())[0]['header']))
1026 del headers[list(temp_sequence_dict.values())[0]['header']]
1027 temp_sequence_dict = {}
1028
1029 original_header, new_header = clean_header(line[1:])
1030 if new_header in headers:
1031 sys.exit('Found duplicated sequence'
1032 ' headers: {original_header}'.format(original_header=original_header))
1033
1034 sequence_counter += 1
1035 temp_sequence_dict[sequence_counter] = {'header': new_header, 'sequence': '', 'length': 0}
1036 headers[new_header] = str(original_header)
1037 if new_header != original_header:
1038 headers_changed = True
1039 else:
1040 temp_sequence_dict[sequence_counter]['sequence'] += line.replace(' ', '')
1041 temp_sequence_dict[sequence_counter]['length'] += len(line.replace(' ', ''))
1042 else:
1043 sys.exit('It was found a blank line between the fasta file above line ' + line)
1044 else:
1045 blank_line_found = True
1046
1047 if len(temp_sequence_dict) > 0:
1048 if list(temp_sequence_dict.values())[0]['length'] - 2 * length_extra_seq > 0:
1049 sequence_dict[list(temp_sequence_dict.keys())[0]] = list(temp_sequence_dict.values())[0]
1050 else:
1051 print('{header} sequence ignored due to'
1052 ' length <= 0'.format(header=list(temp_sequence_dict.values())[0]['header']))
1053 del headers[list(temp_sequence_dict.values())[0]['header']]
1054
1055 return sequence_dict, headers, headers_changed
1056
1057
1058 def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence,
1059 minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, not_write_consensus):
1060 sequence_data_outdir = os.path.join(outdir, 'sequence_data', '')
1061 utils.remove_directory(sequence_data_outdir)
1062 os.mkdir(sequence_data_outdir)
1063
1064 sequences, headers, headers_changed = get_sequence_information(reference_file, length_extra_seq)
1065
1066 pool = multiprocessing.Pool(processes=threads)
1067 for sequence_counter in sequences:
1068 sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '')
1069 utils.remove_directory(sequence_dir)
1070 os.makedirs(sequence_dir)
1071 pool.apply_async(analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir,
1072 sequence_counter, reference_file, length_extra_seq,
1073 minimum_depth_presence, minimum_depth_call,
1074 minimum_depth_frequency_dominant_allele,))
1075 pool.close()
1076 pool.join()
1077
1078 run_successfully, sample_data, consensus_files, consensus_sequences = \
1079 gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true,
1080 length_extra_seq, not_write_consensus)
1081
1082 return run_successfully, sample_data, consensus_files, consensus_sequences
1083
1084
1085 def chunkstring(string, length):
1086 return (string[0 + i:length + i] for i in range(0, len(string), length))
1087
1088
1089 def write_consensus(outdir, sample, consensus_sequence):
1090 consensus_files = {}
1091 for consensus_type in ['correct', 'noMatter', 'alignment']:
1092 consensus_files[consensus_type] = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta'))
1093 with open(consensus_files[consensus_type], 'at') as writer:
1094 writer.write('>' + consensus_sequence[consensus_type]['header'] + '\n')
1095 fasta_sequence_lines = chunkstring(consensus_sequence[consensus_type]['sequence'], 80)
1096 for line in fasta_sequence_lines:
1097 writer.write(line + '\n')
1098 return consensus_files
1099
1100
1101 def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true, length_extra_seq,
1102 not_write_consensus):
1103 run_successfully = True
1104 counter = 0
1105 sample_data = {}
1106
1107 consensus_files = None
1108 consensus_sequences_together = {'correct': {}, 'noMatter': {}, 'alignment': {}}
1109
1110 write_consensus_first_time = True
1111
1112 genes_directories = [d for d in os.listdir(data_directory) if
1113 not d.startswith('.') and
1114 os.path.isdir(os.path.join(data_directory, d, ''))]
1115 for gene_dir in genes_directories:
1116 gene_dir_path = os.path.join(data_directory, gene_dir, '')
1117
1118 files = [f for f in os.listdir(gene_dir_path) if
1119 not f.startswith('.') and
1120 os.path.isfile(os.path.join(gene_dir_path, f))]
1121 for file_found in files:
1122 if file_found.startswith('coverage_info.') and file_found.endswith('.pkl'):
1123 file_path = os.path.join(gene_dir_path, file_found)
1124
1125 if run_successfully:
1126 run_successfully, sequence_counter, multiple_alleles_found, count_absent, percentage_low_coverage, \
1127 mean_coverage, consensus_sequence, \
1128 number_diferences = utils.extract_variable_from_pickle(file_path)
1129
1130 if not not_write_consensus:
1131 for consensus_type in consensus_sequence:
1132 consensus_sequences_together[consensus_type][sequence_counter] = \
1133 {'header': consensus_sequence[consensus_type]['header'],
1134 'sequence': consensus_sequence[consensus_type]['sequence']}
1135
1136 if write_consensus_first_time:
1137 for consensus_type in ['correct', 'noMatter', 'alignment']:
1138 file_to_remove = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta'))
1139 if os.path.isfile(file_to_remove):
1140 os.remove(file_to_remove)
1141 write_consensus_first_time = False
1142 consensus_files = write_consensus(outdir, sample, consensus_sequence)
1143
1144 gene_identity = 0
1145 if sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - count_absent > 0:
1146 gene_identity = 100 - \
1147 (float(number_diferences) /
1148 (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq -
1149 count_absent)) * 100
1150
1151 sample_data[sequence_counter] = \
1152 {'header': sequences_information[sequence_counter]['header'],
1153 'gene_coverage': 100 - (float(count_absent) /
1154 (sequences_information[sequence_counter]['length'] - 2 *
1155 length_extra_seq)) * 100,
1156 'gene_low_coverage': percentage_low_coverage,
1157 'gene_number_positions_multiple_alleles': multiple_alleles_found,
1158 'gene_mean_read_coverage': mean_coverage,
1159 'gene_identity': gene_identity}
1160 counter += 1
1161
1162 if not debug_mode_true:
1163 utils.remove_directory(gene_dir_path)
1164
1165 if counter != len(sequences_information):
1166 run_successfully = False
1167
1168 return run_successfully, sample_data, consensus_files, consensus_sequences_together
1169
1170
1171 rematch_timer = functools.partial(utils.timer, name='ReMatCh module')
1172
1173
1174 @rematch_timer
1175 def run_rematch_module(sample, fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence,
1176 minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage,
1177 debug_mode_true, num_map_loc, minimum_gene_identity, rematch_run,
1178 soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode,
1179 bowtie_algorithm, bowtie_opt, gene_list_reference, not_write_consensus, clean_run=True):
1180 rematch_folder = os.path.join(outdir, 'rematch_module', '')
1181
1182 utils.remove_directory(rematch_folder)
1183 os.mkdir(rematch_folder)
1184
1185 # Map reads
1186 run_successfully, bam_file, reference_file = mapping_reads(fastq_files=fastq_files, reference_file=reference_file,
1187 threads=threads, outdir=rematch_folder,
1188 num_map_loc=num_map_loc, rematch_run=rematch_run,
1189 soft_clip_base_quality=soft_clip_base_quality,
1190 soft_clip_recode_run=soft_clip_recode_run,
1191 reference_dict=reference_dict,
1192 soft_clip_cigar_flag_recode=soft_clip_cigar_flag_recode,
1193 bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt,
1194 clean_run=clean_run)
1195 if run_successfully:
1196 # Index reference file
1197 run_successfully, stdout = index_fasta_samtools(reference_file, None, None, True)
1198 if run_successfully:
1199 print('Analysing alignment data')
1200 run_successfully, sample_data, consensus_files, consensus_sequences = \
1201 sequence_data(sample, reference_file, bam_file, rematch_folder, threads, length_extra_seq,
1202 minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
1203 debug_mode_true, not_write_consensus)
1204
1205 if run_successfully:
1206 print('Writing report file')
1207 number_absent_genes = 0
1208 number_genes_multiple_alleles = 0
1209 mean_sample_coverage = 0
1210 with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer:
1211 print('\n'.join(outdir, 'rematchModule_report.txt'))
1212 writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage',
1213 'percentage_gene_low_coverage', 'number_positions_multiple_alleles',
1214 'percentage_gene_identity']) + '\n')
1215 for i in range(1, len(sample_data) + 1):
1216 writer.write('\t'.join([gene_list_reference[sample_data[i]['header']],
1217 str(round(sample_data[i]['gene_coverage'], 2)),
1218 str(round(sample_data[i]['gene_mean_read_coverage'], 2)),
1219 str(round(sample_data[i]['gene_low_coverage'], 2)),
1220 str(sample_data[i]['gene_number_positions_multiple_alleles']),
1221 str(round(sample_data[i]['gene_identity'], 2))]) + '\n')
1222
1223 if sample_data[i]['gene_coverage'] < minimum_gene_coverage or \
1224 sample_data[i]['gene_identity'] < minimum_gene_identity:
1225 number_absent_genes += 1
1226 else:
1227 mean_sample_coverage += sample_data[i]['gene_mean_read_coverage']
1228 if sample_data[i]['gene_number_positions_multiple_alleles'] > 0:
1229 number_genes_multiple_alleles += 1
1230
1231 if len(sample_data) - number_absent_genes > 0:
1232 mean_sample_coverage = \
1233 float(mean_sample_coverage) / float(len(sample_data) - number_absent_genes)
1234 else:
1235 mean_sample_coverage = 0
1236
1237 writer.write('\n'.join(['#general',
1238 '>number_absent_genes', str(number_absent_genes),
1239 '>number_genes_multiple_alleles', str(number_genes_multiple_alleles),
1240 '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n')
1241
1242 print('\n'.join([str('number_absent_genes: ' + str(number_absent_genes)),
1243 str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)),
1244 str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]))
1245
1246 if not debug_mode_true:
1247 utils.remove_directory(rematch_folder)
1248
1249 return run_successfully, sample_data if 'sample_data' in locals() else None, \
1250 {'number_absent_genes': number_absent_genes if 'number_absent_genes' in locals() else None,
1251 'number_genes_multiple_alleles': number_genes_multiple_alleles if
1252 'number_genes_multiple_alleles' in locals() else None,
1253 'mean_sample_coverage': round(mean_sample_coverage, 2) if 'mean_sample_coverage' in locals() else None}, \
1254 consensus_files if 'consensus_files' in locals() else None,\
1255 consensus_sequences if 'consensus_sequences' in locals() else None