Mercurial > repos > iuc > gemini_stats
diff gemini_mafify.py @ 7:5c9efee9cb1e draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"
author | iuc |
---|---|
date | Tue, 10 Mar 2020 06:17:06 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_mafify.py Tue Mar 10 06:17:06 2020 -0400 @@ -0,0 +1,270 @@ +import string +import sys + + +so_to_maf = { + 'splice_acceptor_variant': 'Splice_Site', + 'splice_donor_variant': 'Splice_Site', + 'transcript_ablation': 'Splice_Site', + 'exon_loss_variant': 'Splice_Site', + 'stop_gained': 'Nonsense_Mutation', + 'stop_lost': 'Nonstop_Mutation', + 'frameshift_variant': 'Frame_Shift_', + 'initiator_codon_variant': 'Translation_Start_Site', + 'start_lost': 'Translation_Start_Site', + 'inframe_insertion': 'In_Frame_Ins', + 'inframe_deletion': 'In_Frame_Del', + 'conservative_inframe_insertion': 'In_Frame_Ins', + 'conservative_inframe_deletion': 'In_Frame_Del', + 'disruptive_inframe_insertion': 'In_Frame_Ins', + 'disruptive_inframe_deletion': 'In_Frame_Del', + 'missense_variant': 'Missense_Mutation', + 'coding_sequence_variant': 'Missense_Mutation', + 'conservative_missense_variant': 'Missense_Mutation', + 'rare_amino_acid_variant': 'Missense_Mutation', + 'transcript_amplification': 'Intron', + 'intron_variant': 'Intron', + 'INTRAGENIC': 'Intron', + 'intragenic_variant': 'Intron', + 'splice_region_variant': 'Splice_Region', + 'mature_miRNA_variant': 'RNA', + 'exon_variant': 'RNA', + 'non_coding_exon_variant': 'RNA', + 'non_coding_transcript_exon_variant': 'RNA', + 'non_coding_transcript_variant': 'RNA', + 'nc_transcript_variant': 'RNA', + 'stop_retained_variant': 'Silent', + 'synonymous_variant': 'Silent', + 'NMD_transcript_variant': 'Silent', + 'incomplete_terminal_codon_variant': 'Silent', + '5_prime_UTR_variant': "5'UTR", + '5_prime_UTR_premature_start_codon_gain_variant': "5'UTR", + '3_prime_UTR_variant': "3'UTR", + 'intergenic_variant': 'IGR', + 'intergenic_region': 'IGR', + 'regulatory_region_variant': 'IGR', + 'regulatory_region': 'IGR', + 'TF_binding_site_variant': 'IGR', + 'upstream_gene_variant': "5'Flank", + 'downstream_gene_variant': "3'Flank", +} + + +class VariantEffect(): + def __init__(self, variant_type): + self.variant_type = variant_type.capitalize() + assert self.variant_type in ['Snp', 'Ins', 'Del'] + + def __getitem__(self, so_effect): + if so_effect not in so_to_maf or ( + 'frame' in so_effect and self.variant_type == 'Snp' + ): + return 'Targeted_Region' + + ret = so_to_maf[so_effect] + if ret == 'Frame_Shift_': + ret += self.variant_type + return ret + + +infile = sys.argv[1] +if len(sys.argv) > 2: + tumor_sample_name = sys.argv[2] +if len(sys.argv) > 3: + normal_sample_name = sys.argv[3] + +start_pos_idx = None +ref_idx = None +alt_idx = None +variant_type_idx = None +variant_classification_idx = None +gt_alt_depths_idx = {} +gt_ref_depths_idx = {} +gts_idx = {} +samples = set() +required_fields = [ + 'Hugo_Symbol', + 'NCBI_Build', + 'Variant_Type', + 'Variant_Classification', + 'Tumor_Sample_Barcode', + 'HGVSp_Short' +] + + +with open(infile) as data_in: + cols = data_in.readline().rstrip().split('\t') + for field in required_fields: + if field not in cols: + raise IndexError( + 'Cannot generate valid MAF without the following input ' + 'columns: {0}.\n' + 'Missing column: "{1}"' + .format(required_fields, field) + ) + for i, col in enumerate(cols): + if col == 'Variant_Type': + variant_type_idx = i + elif col == 'Variant_Classification': + variant_classification_idx = i + elif col == 'Start_Position': + start_pos_idx = i + elif col == 'Reference_Allele': + ref_idx = i + elif col == 'alt': + alt_idx = i + else: + column, _, sample = col.partition('.') + if sample: + if column == 'gt_alt_depths': + gt_alt_depths_idx[sample] = i + elif column == 'gt_ref_depths': + gt_ref_depths_idx[sample] = i + elif column == 'gts': + gts_idx[sample] = i + else: + # not a recognized sample-specific column + continue + samples.add(sample) + + if ref_idx is None: + raise IndexError('Input file does not have a column "Reference_Allele".') + + if not tumor_sample_name: + if normal_sample_name: + raise ValueError( + 'Normal sample name requires the tumor sample name to be ' + 'specified, too.' + ) + if len(samples) > 1: + raise ValueError( + 'A tumor sample name is required with more than one sample ' + 'in the input.' + ) + if samples: + # There is a single sample with genotype data. + # Assume its the tumor sample. + tumor_sample_name = next(iter(samples)) + else: + if tumor_sample_name not in samples: + raise ValueError( + 'Could not find information about the specified tumor sample ' + 'in the input.' + ) + if tumor_sample_name == normal_sample_name: + raise ValueError( + 'Need different names for the normal and the tumor sample.' + ) + + if normal_sample_name and normal_sample_name not in samples: + raise ValueError( + 'Could not find information about the specified normal sample ' + 'in the input.' + ) + + # All input data checks passed! + # Now extract just the relevant index numbers for the tumor/normal pair + gts_idx = ( + gts_idx.get(tumor_sample_name, alt_idx), + gts_idx.get(normal_sample_name) + ) + gt_alt_depths_idx = ( + gt_alt_depths_idx.get(tumor_sample_name), + gt_alt_depths_idx.get(normal_sample_name) + ) + gt_ref_depths_idx = ( + gt_ref_depths_idx.get(tumor_sample_name), + gt_ref_depths_idx.get(normal_sample_name) + ) + + # Echo all MAF column names + cols_to_print = [] + for n in range(len(cols)): + if n in gts_idx: + continue + if n in gt_alt_depths_idx: + continue + if n in gt_ref_depths_idx: + continue + if n != alt_idx: + cols_to_print.append(n) + + print('\t'.join([cols[n] for n in cols_to_print])) + + for line in data_in: + cols = line.rstrip().split('\t') + + gt_alt_depths = [ + int(cols[ad_idx]) if ad_idx else '' + for ad_idx in gt_alt_depths_idx + ] + gt_ref_depths = [ + int(cols[rd_idx]) if rd_idx else '' + for rd_idx in gt_ref_depths_idx + ] + + gts = [ + ['', ''], + ['', ''] + ] + for n, gt_idx in enumerate(gts_idx): + if gt_idx: + gt_sep = '/' if '/' in cols[gt_idx] else '|' + allele1, _, allele2 = [ + '' if allele == '.' else allele + for allele in cols[gt_idx].partition(gt_sep) + ] + # follow cBioportal recommendation to leave allele1 empty + # when information is not avaliable + if not allele2: + gts[n] = [allele2, allele1] + else: + gts[n] = [allele1, allele2] + if not gts: + gts = [['', ''], ['', '']] + + if cols[variant_type_idx].lower() in ['ins', 'del']: + # transform VCF-style indel representations into MAF ones + ref_allele = cols[ref_idx] + for n, nucs in enumerate( + zip( + ref_allele, + *[allele for gt in gts for allele in gt if allele] + ) + ): + if any(nuc != nucs[0] for nuc in nucs[1:]): + break + else: + n += 1 + if n > 0: + cols[ref_idx] = cols[ref_idx][n:] or '-' + for gt in gts: + for idx, allele in enumerate(gt): + if allele: + gt[idx] = allele[n:] or '-' + if cols[ref_idx] == '-': + n -= 1 + cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n) + + # in-place substitution of so_effect with MAF effect + cols[variant_classification_idx] = VariantEffect( + cols[variant_type_idx] + )[cols[variant_classification_idx]] + ret_line = '\t'.join([cols[n] for n in cols_to_print]) + + field_formatters = { + 'tumor_seq_allele1': gts[0][0], + 'tumor_seq_allele2': gts[0][1], + 'match_norm_seq_allele1': gts[1][0], + 'match_norm_seq_allele2': gts[1][1], + 't_alt_count': gt_alt_depths[0], + 'n_alt_count': gt_alt_depths[1], + 't_ref_count': gt_ref_depths[0], + 'n_ref_count': gt_ref_depths[1], + } + + print( + # use safe_substitute here to avoid key errors with column content + # looking like unknown placeholders + string.Template(ret_line).safe_substitute(field_formatters) + )