Mercurial > repos > iuc > varscan_somatic
comparison varscan.py @ 6:2c66c4025db2 draft
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 4b7660df2384d8c8c6b5a6f02a00d13211e6ff6c
| author | iuc |
|---|---|
| date | Fri, 18 Jan 2019 19:41:10 -0500 |
| parents | 2fe9ebb98aad |
| children | 2657ab48e16a |
comparison
equal
deleted
inserted
replaced
| 5:d37adcc2ec03 | 6:2c66c4025db2 |
|---|---|
| 396 def _add_indel_info_flag_to_header(self, header): | 396 def _add_indel_info_flag_to_header(self, header): |
| 397 header.info.add( | 397 header.info.add( |
| 398 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' | 398 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' |
| 399 ) | 399 ) |
| 400 | 400 |
| 401 def _standardize_format_fields(self, header): | |
| 402 """Add standard FORMAT key declarations to a VCF header.""" | |
| 403 | |
| 404 format_field_specs = [ | |
| 405 ('GT', '1', 'String', 'Genotype'), | |
| 406 ('GQ', '1', 'Integer', 'Genotype quality'), | |
| 407 ('DP', '1', 'Integer', 'Read depth'), | |
| 408 ('AD', 'R', 'Integer', 'Read depth for each allele'), | |
| 409 ('ADF', 'R', 'Integer', | |
| 410 'Read depth for each allele on the forward strand'), | |
| 411 ('ADR', 'R', 'Integer', | |
| 412 'Read depth for each allele on the reverse strand') | |
| 413 ] | |
| 414 # Existing FORMAT declarations cannot be overwritten. | |
| 415 # The only viable strategy is to mark them as removed, | |
| 416 # then merge the header with another one containing the | |
| 417 # correct declarations. This is what is implemented below. | |
| 418 temp_header = pysam.VariantHeader() | |
| 419 for spec in format_field_specs: | |
| 420 temp_header.formats.add(*spec) | |
| 421 if spec[0] in header.formats: | |
| 422 header.formats.remove_header(spec[0]) | |
| 423 header.merge(temp_header) | |
| 424 | |
| 401 def _compile_common_header(self, varcall_template, no_filters=False): | 425 def _compile_common_header(self, varcall_template, no_filters=False): |
| 402 # fix the header generated by VarScan | 426 # fix the header generated by VarScan |
| 403 # by adding reference and contig information | 427 # by adding reference and contig information |
| 404 common_header = pysam.VariantHeader() | 428 common_header = pysam.VariantHeader() |
| 405 common_header.add_meta('reference', value=self.ref_genome) | 429 common_header.add_meta('reference', value=self.ref_genome) |
| 409 self._add_filters_to_header(common_header) | 433 self._add_filters_to_header(common_header) |
| 410 # change the source information | 434 # change the source information |
| 411 common_header.add_meta('source', value='varscan.py') | 435 common_header.add_meta('source', value='varscan.py') |
| 412 # declare an INDEL flag for record INFO fields | 436 # declare an INDEL flag for record INFO fields |
| 413 self._add_indel_info_flag_to_header(common_header) | 437 self._add_indel_info_flag_to_header(common_header) |
| 438 # generate standard FORMAT field declarations | |
| 439 # including a correct AD declaration to prevent VarScan's | |
| 440 # non-standard one from getting propagated | |
| 441 self._standardize_format_fields(common_header) | |
| 414 # take the remaining metadata from the template header produced by | 442 # take the remaining metadata from the template header produced by |
| 415 # VarScan | 443 # VarScan |
| 416 with pysam.VariantFile(varcall_template, 'r') as original_data: | 444 with pysam.VariantFile(varcall_template, 'r') as original_data: |
| 417 varscan_header = original_data.header | 445 varscan_header = original_data.header |
| 418 for sample in varscan_header.samples: | 446 for sample in varscan_header.samples: |
| 419 common_header.samples.add(sample) | 447 common_header.samples.add(sample) |
| 420 common_header.merge(varscan_header) | 448 common_header.merge(varscan_header) |
| 449 # don't propagate non-standard and redundant FORMAT keys written | |
| 450 # by VarScan | |
| 451 common_header.formats.remove_header('FREQ') | |
| 452 common_header.formats.remove_header('RD') | |
| 453 common_header.formats.remove_header('DP4') | |
| 421 return common_header | 454 return common_header |
| 455 | |
| 456 def _fix_read_gt_fields(self, read): | |
| 457 """Migrate non-standard genotype fields to standard ones.""" | |
| 458 | |
| 459 for gt_field in read.samples.values(): | |
| 460 # generate a standard AD field by combining VarScan's | |
| 461 # RD and non-standard AD fields | |
| 462 gt_field['AD'] = (gt_field['RD'], gt_field['AD'][0]) | |
| 463 # split VarScan's DP4 field into the standard fields | |
| 464 # ADF and ADR | |
| 465 gt_field['ADF'] = ( | |
| 466 int(gt_field['DP4'][0]), int(gt_field['DP4'][2]) | |
| 467 ) | |
| 468 gt_field['ADR'] = ( | |
| 469 int(gt_field['DP4'][1]), int(gt_field['DP4'][3]) | |
| 470 ) | |
| 471 # remove redundant fields | |
| 472 # FREQ is easy to calculate from AD | |
| 473 del read.format['FREQ'] | |
| 474 del read.format['RD'] | |
| 475 del read.format['DP4'] | |
| 422 | 476 |
| 423 def pileup_masker(self, mask): | 477 def pileup_masker(self, mask): |
| 424 def apply_mask_on_pileup(piled_items): | 478 def apply_mask_on_pileup(piled_items): |
| 425 for item, status in zip(piled_items, mask): | 479 for item, status in zip(piled_items, mask): |
| 426 if status: | 480 if status: |
| 771 # to avoid Warnings from htslib about missing contig | 825 # to avoid Warnings from htslib about missing contig |
| 772 # info | 826 # info |
| 773 self._add_ref_contigs_to_header(snp_invcf.header) | 827 self._add_ref_contigs_to_header(snp_invcf.header) |
| 774 self._add_filters_to_header(snp_invcf.header) | 828 self._add_filters_to_header(snp_invcf.header) |
| 775 self._add_indel_info_flag_to_header(snp_invcf.header) | 829 self._add_indel_info_flag_to_header(snp_invcf.header) |
| 830 self._standardize_format_fields(snp_invcf.header) | |
| 776 with pysam.VariantFile(indel_f, 'r') as indel_invcf: | 831 with pysam.VariantFile(indel_f, 'r') as indel_invcf: |
| 777 # fix the input header on the fly | 832 # fix the input header on the fly |
| 778 # to avoid Warnings from htslib about missing | 833 # to avoid Warnings from htslib about missing |
| 779 # contig info | 834 # contig info |
| 780 self._add_ref_contigs_to_header(indel_invcf.header) | 835 self._add_ref_contigs_to_header(indel_invcf.header) |
| 781 self._add_filters_to_header(indel_invcf.header) | 836 self._add_filters_to_header(indel_invcf.header) |
| 782 self._add_indel_info_flag_to_header( | 837 self._add_indel_info_flag_to_header( |
| 783 indel_invcf.header | 838 indel_invcf.header |
| 784 ) | 839 ) |
| 840 self._standardize_format_fields(indel_invcf.header) | |
| 785 for record in apply_filters( | 841 for record in apply_filters( |
| 786 self._merge_generator( | 842 self._merge_generator( |
| 787 snp_invcf, | 843 snp_invcf, |
| 788 self._indel_flagged_records(indel_invcf) | 844 self._indel_flagged_records(indel_invcf) |
| 789 ), | 845 ), |
| 790 **filter_args | 846 **filter_args |
| 791 ): | 847 ): |
| 848 self._fix_read_gt_fields(record) | |
| 792 o.write(str(record)) | 849 o.write(str(record)) |
| 793 try: | 850 try: |
| 794 os.remove(snp_f) | 851 os.remove(snp_f) |
| 795 except Exception: | 852 except Exception: |
| 796 pass | 853 pass |
| 808 # to avoid Warnings from htslib about missing | 865 # to avoid Warnings from htslib about missing |
| 809 # contig info and errors because of undeclared | 866 # contig info and errors because of undeclared |
| 810 # filters | 867 # filters |
| 811 self._add_ref_contigs_to_header(invcf.header) | 868 self._add_ref_contigs_to_header(invcf.header) |
| 812 self._add_filters_to_header(invcf.header) | 869 self._add_filters_to_header(invcf.header) |
| 870 self._standardize_format_fields(invcf.header) | |
| 813 for record in apply_filters( | 871 for record in apply_filters( |
| 814 invcf, **filter_args | 872 invcf, **filter_args |
| 815 ): | 873 ): |
| 874 self._fix_read_gt_fields(record) | |
| 816 o.write(str(record)) | 875 o.write(str(record)) |
| 817 try: | 876 try: |
| 818 os.remove(f) | 877 os.remove(f) |
| 819 except Exception: | 878 except Exception: |
| 820 pass | 879 pass |
| 827 # contig info and errors because of undeclared | 886 # contig info and errors because of undeclared |
| 828 # filters | 887 # filters |
| 829 self._add_ref_contigs_to_header(invcf.header) | 888 self._add_ref_contigs_to_header(invcf.header) |
| 830 self._add_filters_to_header(invcf.header) | 889 self._add_filters_to_header(invcf.header) |
| 831 self._add_indel_info_flag_to_header(invcf.header) | 890 self._add_indel_info_flag_to_header(invcf.header) |
| 891 self._standardize_format_fields(invcf.header) | |
| 832 for record in apply_filters( | 892 for record in apply_filters( |
| 833 self._indel_flagged_records(invcf), **filter_args | 893 self._indel_flagged_records(invcf), **filter_args |
| 834 ): | 894 ): |
| 895 self._fix_read_gt_fields(record) | |
| 835 o.write(str(record)) | 896 o.write(str(record)) |
| 836 try: | 897 try: |
| 837 os.remove(f) | 898 os.remove(f) |
| 838 except Exception: | 899 except Exception: |
| 839 pass | 900 pass |
