Mercurial > repos > iuc > varscan_somatic
comparison varscan.py @ 6:2c66c4025db2 draft
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 4b7660df2384d8c8c6b5a6f02a00d13211e6ff6c
author | iuc |
---|---|
date | Fri, 18 Jan 2019 19:41:10 -0500 |
parents | 2fe9ebb98aad |
children | 2657ab48e16a |
comparison
equal
deleted
inserted
replaced
5:d37adcc2ec03 | 6:2c66c4025db2 |
---|---|
396 def _add_indel_info_flag_to_header(self, header): | 396 def _add_indel_info_flag_to_header(self, header): |
397 header.info.add( | 397 header.info.add( |
398 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' | 398 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' |
399 ) | 399 ) |
400 | 400 |
401 def _standardize_format_fields(self, header): | |
402 """Add standard FORMAT key declarations to a VCF header.""" | |
403 | |
404 format_field_specs = [ | |
405 ('GT', '1', 'String', 'Genotype'), | |
406 ('GQ', '1', 'Integer', 'Genotype quality'), | |
407 ('DP', '1', 'Integer', 'Read depth'), | |
408 ('AD', 'R', 'Integer', 'Read depth for each allele'), | |
409 ('ADF', 'R', 'Integer', | |
410 'Read depth for each allele on the forward strand'), | |
411 ('ADR', 'R', 'Integer', | |
412 'Read depth for each allele on the reverse strand') | |
413 ] | |
414 # Existing FORMAT declarations cannot be overwritten. | |
415 # The only viable strategy is to mark them as removed, | |
416 # then merge the header with another one containing the | |
417 # correct declarations. This is what is implemented below. | |
418 temp_header = pysam.VariantHeader() | |
419 for spec in format_field_specs: | |
420 temp_header.formats.add(*spec) | |
421 if spec[0] in header.formats: | |
422 header.formats.remove_header(spec[0]) | |
423 header.merge(temp_header) | |
424 | |
401 def _compile_common_header(self, varcall_template, no_filters=False): | 425 def _compile_common_header(self, varcall_template, no_filters=False): |
402 # fix the header generated by VarScan | 426 # fix the header generated by VarScan |
403 # by adding reference and contig information | 427 # by adding reference and contig information |
404 common_header = pysam.VariantHeader() | 428 common_header = pysam.VariantHeader() |
405 common_header.add_meta('reference', value=self.ref_genome) | 429 common_header.add_meta('reference', value=self.ref_genome) |
409 self._add_filters_to_header(common_header) | 433 self._add_filters_to_header(common_header) |
410 # change the source information | 434 # change the source information |
411 common_header.add_meta('source', value='varscan.py') | 435 common_header.add_meta('source', value='varscan.py') |
412 # declare an INDEL flag for record INFO fields | 436 # declare an INDEL flag for record INFO fields |
413 self._add_indel_info_flag_to_header(common_header) | 437 self._add_indel_info_flag_to_header(common_header) |
438 # generate standard FORMAT field declarations | |
439 # including a correct AD declaration to prevent VarScan's | |
440 # non-standard one from getting propagated | |
441 self._standardize_format_fields(common_header) | |
414 # take the remaining metadata from the template header produced by | 442 # take the remaining metadata from the template header produced by |
415 # VarScan | 443 # VarScan |
416 with pysam.VariantFile(varcall_template, 'r') as original_data: | 444 with pysam.VariantFile(varcall_template, 'r') as original_data: |
417 varscan_header = original_data.header | 445 varscan_header = original_data.header |
418 for sample in varscan_header.samples: | 446 for sample in varscan_header.samples: |
419 common_header.samples.add(sample) | 447 common_header.samples.add(sample) |
420 common_header.merge(varscan_header) | 448 common_header.merge(varscan_header) |
449 # don't propagate non-standard and redundant FORMAT keys written | |
450 # by VarScan | |
451 common_header.formats.remove_header('FREQ') | |
452 common_header.formats.remove_header('RD') | |
453 common_header.formats.remove_header('DP4') | |
421 return common_header | 454 return common_header |
455 | |
456 def _fix_read_gt_fields(self, read): | |
457 """Migrate non-standard genotype fields to standard ones.""" | |
458 | |
459 for gt_field in read.samples.values(): | |
460 # generate a standard AD field by combining VarScan's | |
461 # RD and non-standard AD fields | |
462 gt_field['AD'] = (gt_field['RD'], gt_field['AD'][0]) | |
463 # split VarScan's DP4 field into the standard fields | |
464 # ADF and ADR | |
465 gt_field['ADF'] = ( | |
466 int(gt_field['DP4'][0]), int(gt_field['DP4'][2]) | |
467 ) | |
468 gt_field['ADR'] = ( | |
469 int(gt_field['DP4'][1]), int(gt_field['DP4'][3]) | |
470 ) | |
471 # remove redundant fields | |
472 # FREQ is easy to calculate from AD | |
473 del read.format['FREQ'] | |
474 del read.format['RD'] | |
475 del read.format['DP4'] | |
422 | 476 |
423 def pileup_masker(self, mask): | 477 def pileup_masker(self, mask): |
424 def apply_mask_on_pileup(piled_items): | 478 def apply_mask_on_pileup(piled_items): |
425 for item, status in zip(piled_items, mask): | 479 for item, status in zip(piled_items, mask): |
426 if status: | 480 if status: |
771 # to avoid Warnings from htslib about missing contig | 825 # to avoid Warnings from htslib about missing contig |
772 # info | 826 # info |
773 self._add_ref_contigs_to_header(snp_invcf.header) | 827 self._add_ref_contigs_to_header(snp_invcf.header) |
774 self._add_filters_to_header(snp_invcf.header) | 828 self._add_filters_to_header(snp_invcf.header) |
775 self._add_indel_info_flag_to_header(snp_invcf.header) | 829 self._add_indel_info_flag_to_header(snp_invcf.header) |
830 self._standardize_format_fields(snp_invcf.header) | |
776 with pysam.VariantFile(indel_f, 'r') as indel_invcf: | 831 with pysam.VariantFile(indel_f, 'r') as indel_invcf: |
777 # fix the input header on the fly | 832 # fix the input header on the fly |
778 # to avoid Warnings from htslib about missing | 833 # to avoid Warnings from htslib about missing |
779 # contig info | 834 # contig info |
780 self._add_ref_contigs_to_header(indel_invcf.header) | 835 self._add_ref_contigs_to_header(indel_invcf.header) |
781 self._add_filters_to_header(indel_invcf.header) | 836 self._add_filters_to_header(indel_invcf.header) |
782 self._add_indel_info_flag_to_header( | 837 self._add_indel_info_flag_to_header( |
783 indel_invcf.header | 838 indel_invcf.header |
784 ) | 839 ) |
840 self._standardize_format_fields(indel_invcf.header) | |
785 for record in apply_filters( | 841 for record in apply_filters( |
786 self._merge_generator( | 842 self._merge_generator( |
787 snp_invcf, | 843 snp_invcf, |
788 self._indel_flagged_records(indel_invcf) | 844 self._indel_flagged_records(indel_invcf) |
789 ), | 845 ), |
790 **filter_args | 846 **filter_args |
791 ): | 847 ): |
848 self._fix_read_gt_fields(record) | |
792 o.write(str(record)) | 849 o.write(str(record)) |
793 try: | 850 try: |
794 os.remove(snp_f) | 851 os.remove(snp_f) |
795 except Exception: | 852 except Exception: |
796 pass | 853 pass |
808 # to avoid Warnings from htslib about missing | 865 # to avoid Warnings from htslib about missing |
809 # contig info and errors because of undeclared | 866 # contig info and errors because of undeclared |
810 # filters | 867 # filters |
811 self._add_ref_contigs_to_header(invcf.header) | 868 self._add_ref_contigs_to_header(invcf.header) |
812 self._add_filters_to_header(invcf.header) | 869 self._add_filters_to_header(invcf.header) |
870 self._standardize_format_fields(invcf.header) | |
813 for record in apply_filters( | 871 for record in apply_filters( |
814 invcf, **filter_args | 872 invcf, **filter_args |
815 ): | 873 ): |
874 self._fix_read_gt_fields(record) | |
816 o.write(str(record)) | 875 o.write(str(record)) |
817 try: | 876 try: |
818 os.remove(f) | 877 os.remove(f) |
819 except Exception: | 878 except Exception: |
820 pass | 879 pass |
827 # contig info and errors because of undeclared | 886 # contig info and errors because of undeclared |
828 # filters | 887 # filters |
829 self._add_ref_contigs_to_header(invcf.header) | 888 self._add_ref_contigs_to_header(invcf.header) |
830 self._add_filters_to_header(invcf.header) | 889 self._add_filters_to_header(invcf.header) |
831 self._add_indel_info_flag_to_header(invcf.header) | 890 self._add_indel_info_flag_to_header(invcf.header) |
891 self._standardize_format_fields(invcf.header) | |
832 for record in apply_filters( | 892 for record in apply_filters( |
833 self._indel_flagged_records(invcf), **filter_args | 893 self._indel_flagged_records(invcf), **filter_args |
834 ): | 894 ): |
895 self._fix_read_gt_fields(record) | |
835 o.write(str(record)) | 896 o.write(str(record)) |
836 try: | 897 try: |
837 os.remove(f) | 898 os.remove(f) |
838 except Exception: | 899 except Exception: |
839 pass | 900 pass |