comparison varscan.py @ 7:7c5e867820cf draft

planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit f5e4d8903b6d69b0507f6a8fc4c2533fc7a15dfa
author iuc
date Mon, 21 Jan 2019 12:05:47 -0500
parents ab2f908f0b08
children 45288fb1f3f5
comparison
equal deleted inserted replaced
6:ab2f908f0b08 7:7c5e867820cf
441 self._standardize_format_fields(common_header) 441 self._standardize_format_fields(common_header)
442 # take the remaining metadata from the template header produced by 442 # take the remaining metadata from the template header produced by
443 # VarScan 443 # VarScan
444 with pysam.VariantFile(varcall_template, 'r') as original_data: 444 with pysam.VariantFile(varcall_template, 'r') as original_data:
445 varscan_header = original_data.header 445 varscan_header = original_data.header
446 # don't propagate non-standard and redundant FORMAT keys written
447 # by VarScan
448 varscan_header.formats.remove_header('AD')
449 varscan_header.formats.remove_header('FREQ')
450 varscan_header.formats.remove_header('RD')
451 varscan_header.formats.remove_header('DP4')
446 for sample in varscan_header.samples: 452 for sample in varscan_header.samples:
447 common_header.samples.add(sample) 453 common_header.samples.add(sample)
448 common_header.merge(varscan_header) 454 common_header.merge(varscan_header)
449 # don't propagate non-standard and redundant FORMAT keys written
450 # by VarScan
451 common_header.formats.remove_header('FREQ')
452 common_header.formats.remove_header('RD')
453 common_header.formats.remove_header('DP4')
454 return common_header 455 return common_header
455 456
456 def _fix_read_gt_fields(self, read): 457 def _fix_record_gt_fields(self, record):
457 """Migrate non-standard genotype fields to standard ones.""" 458 """Migrate non-standard genotype fields to standard ones."""
458 459
459 for gt_field in read.samples.values(): 460 # The key issue to consider here is that we need to modify
461 # genotype field values on a per-sample basis, but htslib
462 # reserves memory for the values of all samples upon the first
463 # modification of the field in the variant record.
464 # => We need to calculate all values first, then fill each
465 # genotype field starting with the sample with the largest value.
466 new_gt_fields = {'AD': [], 'ADF': [], 'ADR': []}
467 # store the current genotype field contents for each sample
468 per_sample_gts = record.samples.values()
469 # calculate and store the new genotype field values
470 for gt_field in per_sample_gts:
460 # generate a standard AD field by combining VarScan's 471 # generate a standard AD field by combining VarScan's
461 # RD and non-standard AD fields 472 # RD and non-standard AD fields
462 gt_field['AD'] = (gt_field['RD'], gt_field['AD'][0]) 473 new_gt_fields['AD'].append((gt_field['RD'], gt_field['AD'][0]))
463 # split VarScan's DP4 field into the standard fields 474 # split VarScan's DP4 field into the standard fields
464 # ADF and ADR 475 # ADF and ADR
465 gt_field['ADF'] = ( 476 new_gt_fields['ADF'].append(
466 int(gt_field['DP4'][0]), int(gt_field['DP4'][2]) 477 (int(gt_field['DP4'][0]), int(gt_field['DP4'][2]))
467 ) 478 )
468 gt_field['ADR'] = ( 479 new_gt_fields['ADR'].append(
469 int(gt_field['DP4'][1]), int(gt_field['DP4'][3]) 480 (int(gt_field['DP4'][1]), int(gt_field['DP4'][3]))
470 ) 481 )
482 # Modify the record's genotype fields.
483 # For each field, modify the sample containing the largest
484 # value for the field first.
485 # Without this precaution we could trigger:
486 # "bcf_update_format: Assertion `!fmt->p_free' failed."
487 # in vcf.c of htslib resulting in a core dump.
488 for field in sorted(new_gt_fields):
489 for new_field, sample_fields in sorted(
490 zip(new_gt_fields[field], per_sample_gts),
491 key=lambda x: max(x[0]),
492 reverse=True
493 ):
494 sample_fields[field] = new_field
471 # remove redundant fields 495 # remove redundant fields
472 # FREQ is easy to calculate from AD 496 # FREQ is easy to calculate from AD
473 del read.format['FREQ'] 497 del record.format['FREQ']
474 del read.format['RD'] 498 del record.format['RD']
475 del read.format['DP4'] 499 del record.format['DP4']
476 500
477 def pileup_masker(self, mask): 501 def pileup_masker(self, mask):
478 def apply_mask_on_pileup(piled_items): 502 def apply_mask_on_pileup(piled_items):
479 for item, status in zip(piled_items, mask): 503 for item, status in zip(piled_items, mask):
480 if status: 504 if status:
843 snp_invcf, 867 snp_invcf,
844 self._indel_flagged_records(indel_invcf) 868 self._indel_flagged_records(indel_invcf)
845 ), 869 ),
846 **filter_args 870 **filter_args
847 ): 871 ):
848 self._fix_read_gt_fields(record) 872 self._fix_record_gt_fields(record)
849 o.write(str(record)) 873 o.write(str(record))
850 try: 874 try:
851 os.remove(snp_f) 875 os.remove(snp_f)
852 except Exception: 876 except Exception:
853 pass 877 pass
869 self._add_filters_to_header(invcf.header) 893 self._add_filters_to_header(invcf.header)
870 self._standardize_format_fields(invcf.header) 894 self._standardize_format_fields(invcf.header)
871 for record in apply_filters( 895 for record in apply_filters(
872 invcf, **filter_args 896 invcf, **filter_args
873 ): 897 ):
874 self._fix_read_gt_fields(record) 898 self._fix_record_gt_fields(record)
875 o.write(str(record)) 899 o.write(str(record))
876 try: 900 try:
877 os.remove(f) 901 os.remove(f)
878 except Exception: 902 except Exception:
879 pass 903 pass
890 self._add_indel_info_flag_to_header(invcf.header) 914 self._add_indel_info_flag_to_header(invcf.header)
891 self._standardize_format_fields(invcf.header) 915 self._standardize_format_fields(invcf.header)
892 for record in apply_filters( 916 for record in apply_filters(
893 self._indel_flagged_records(invcf), **filter_args 917 self._indel_flagged_records(invcf), **filter_args
894 ): 918 ):
895 self._fix_read_gt_fields(record) 919 self._fix_record_gt_fields(record)
896 o.write(str(record)) 920 o.write(str(record))
897 try: 921 try:
898 os.remove(f) 922 os.remove(f)
899 except Exception: 923 except Exception:
900 pass 924 pass