Repository 'gemini_burden'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/gemini_burden

Changeset 7:df57188562f0 (2020-03-10)
Previous changeset 6:12112e6e5ea4 (2020-01-24) Next changeset 8:f0bf88e9e689 (2022-07-13)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"
added:
gemini_mafify.py
test-data/gemini_query_as_maf_result.tabular
b
diff -r 12112e6e5ea4 -r df57188562f0 gemini_mafify.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gemini_mafify.py Tue Mar 10 06:17:40 2020 -0400
[
b'@@ -0,0 +1,270 @@\n+import string\n+import sys\n+\n+\n+so_to_maf = {\n+    \'splice_acceptor_variant\': \'Splice_Site\',\n+    \'splice_donor_variant\': \'Splice_Site\',\n+    \'transcript_ablation\': \'Splice_Site\',\n+    \'exon_loss_variant\': \'Splice_Site\',\n+    \'stop_gained\': \'Nonsense_Mutation\',\n+    \'stop_lost\': \'Nonstop_Mutation\',\n+    \'frameshift_variant\': \'Frame_Shift_\',\n+    \'initiator_codon_variant\': \'Translation_Start_Site\',\n+    \'start_lost\': \'Translation_Start_Site\',\n+    \'inframe_insertion\': \'In_Frame_Ins\',\n+    \'inframe_deletion\': \'In_Frame_Del\',\n+    \'conservative_inframe_insertion\': \'In_Frame_Ins\',\n+    \'conservative_inframe_deletion\': \'In_Frame_Del\',\n+    \'disruptive_inframe_insertion\': \'In_Frame_Ins\',\n+    \'disruptive_inframe_deletion\': \'In_Frame_Del\',\n+    \'missense_variant\': \'Missense_Mutation\',\n+    \'coding_sequence_variant\': \'Missense_Mutation\',\n+    \'conservative_missense_variant\': \'Missense_Mutation\',\n+    \'rare_amino_acid_variant\': \'Missense_Mutation\',\n+    \'transcript_amplification\': \'Intron\',\n+    \'intron_variant\': \'Intron\',\n+    \'INTRAGENIC\': \'Intron\',\n+    \'intragenic_variant\': \'Intron\',\n+    \'splice_region_variant\': \'Splice_Region\',\n+    \'mature_miRNA_variant\': \'RNA\',\n+    \'exon_variant\': \'RNA\',\n+    \'non_coding_exon_variant\': \'RNA\',\n+    \'non_coding_transcript_exon_variant\': \'RNA\',\n+    \'non_coding_transcript_variant\': \'RNA\',\n+    \'nc_transcript_variant\': \'RNA\',\n+    \'stop_retained_variant\': \'Silent\',\n+    \'synonymous_variant\': \'Silent\',\n+    \'NMD_transcript_variant\': \'Silent\',\n+    \'incomplete_terminal_codon_variant\': \'Silent\',\n+    \'5_prime_UTR_variant\': "5\'UTR",\n+    \'5_prime_UTR_premature_start_codon_gain_variant\': "5\'UTR",\n+    \'3_prime_UTR_variant\': "3\'UTR",\n+    \'intergenic_variant\': \'IGR\',\n+    \'intergenic_region\': \'IGR\',\n+    \'regulatory_region_variant\': \'IGR\',\n+    \'regulatory_region\': \'IGR\',\n+    \'TF_binding_site_variant\': \'IGR\',\n+    \'upstream_gene_variant\': "5\'Flank",\n+    \'downstream_gene_variant\': "3\'Flank",\n+}\n+\n+\n+class VariantEffect():\n+    def __init__(self, variant_type):\n+        self.variant_type = variant_type.capitalize()\n+        assert self.variant_type in [\'Snp\', \'Ins\', \'Del\']\n+\n+    def __getitem__(self, so_effect):\n+        if so_effect not in so_to_maf or (\n+            \'frame\' in so_effect and self.variant_type == \'Snp\'\n+        ):\n+            return \'Targeted_Region\'\n+\n+        ret = so_to_maf[so_effect]\n+        if ret == \'Frame_Shift_\':\n+            ret += self.variant_type\n+        return ret\n+\n+\n+infile = sys.argv[1]\n+if len(sys.argv) > 2:\n+    tumor_sample_name = sys.argv[2]\n+if len(sys.argv) > 3:\n+    normal_sample_name = sys.argv[3]\n+\n+start_pos_idx = None\n+ref_idx = None\n+alt_idx = None\n+variant_type_idx = None\n+variant_classification_idx = None\n+gt_alt_depths_idx = {}\n+gt_ref_depths_idx = {}\n+gts_idx = {}\n+samples = set()\n+required_fields = [\n+    \'Hugo_Symbol\',\n+    \'NCBI_Build\',\n+    \'Variant_Type\',\n+    \'Variant_Classification\',\n+    \'Tumor_Sample_Barcode\',\n+    \'HGVSp_Short\'\n+]\n+\n+\n+with open(infile) as data_in:\n+    cols = data_in.readline().rstrip().split(\'\\t\')\n+    for field in required_fields:\n+        if field not in cols:\n+            raise IndexError(\n+                \'Cannot generate valid MAF without the following input \'\n+                \'columns: {0}.\\n\'\n+                \'Missing column: "{1}"\'\n+                .format(required_fields, field)\n+            )\n+    for i, col in enumerate(cols):\n+        if col == \'Variant_Type\':\n+            variant_type_idx = i\n+        elif col == \'Variant_Classification\':\n+            variant_classification_idx = i\n+        elif col == \'Start_Position\':\n+            start_pos_idx = i\n+        elif col == \'Reference_Allele\':\n+            ref_idx = i\n+        elif col == \'alt\':\n+            alt_idx = i\n+        else:\n+            column, _, sample = col.partition(\'.\')\n+            if sample:\n+                if column == \'gt_alt_depths\':\n+                    gt_alt_depths_idx[sample] = i\n+                elif column == \'gt_ref'..b"\n+            )\n+\n+    if normal_sample_name and normal_sample_name not in samples:\n+        raise ValueError(\n+            'Could not find information about the specified normal sample '\n+            'in the input.'\n+        )\n+\n+    # All input data checks passed!\n+    # Now extract just the relevant index numbers for the tumor/normal pair\n+    gts_idx = (\n+        gts_idx.get(tumor_sample_name, alt_idx),\n+        gts_idx.get(normal_sample_name)\n+    )\n+    gt_alt_depths_idx = (\n+        gt_alt_depths_idx.get(tumor_sample_name),\n+        gt_alt_depths_idx.get(normal_sample_name)\n+    )\n+    gt_ref_depths_idx = (\n+        gt_ref_depths_idx.get(tumor_sample_name),\n+        gt_ref_depths_idx.get(normal_sample_name)\n+    )\n+\n+    # Echo all MAF column names\n+    cols_to_print = []\n+    for n in range(len(cols)):\n+        if n in gts_idx:\n+            continue\n+        if n in gt_alt_depths_idx:\n+            continue\n+        if n in gt_ref_depths_idx:\n+            continue\n+        if n != alt_idx:\n+            cols_to_print.append(n)\n+\n+    print('\\t'.join([cols[n] for n in cols_to_print]))\n+\n+    for line in data_in:\n+        cols = line.rstrip().split('\\t')\n+\n+        gt_alt_depths = [\n+            int(cols[ad_idx]) if ad_idx else ''\n+            for ad_idx in gt_alt_depths_idx\n+        ]\n+        gt_ref_depths = [\n+            int(cols[rd_idx]) if rd_idx else ''\n+            for rd_idx in gt_ref_depths_idx\n+        ]\n+\n+        gts = [\n+            ['', ''],\n+            ['', '']\n+        ]\n+        for n, gt_idx in enumerate(gts_idx):\n+            if gt_idx:\n+                gt_sep = '/' if '/' in cols[gt_idx] else '|'\n+                allele1, _, allele2 = [\n+                    '' if allele == '.' else allele\n+                    for allele in cols[gt_idx].partition(gt_sep)\n+                ]\n+                # follow cBioportal recommendation to leave allele1 empty\n+                # when information is not avaliable\n+                if not allele2:\n+                    gts[n] = [allele2, allele1]\n+                else:\n+                    gts[n] = [allele1, allele2]\n+        if not gts:\n+            gts = [['', ''], ['', '']]\n+\n+        if cols[variant_type_idx].lower() in ['ins', 'del']:\n+            # transform VCF-style indel representations into MAF ones\n+            ref_allele = cols[ref_idx]\n+            for n, nucs in enumerate(\n+                zip(\n+                    ref_allele,\n+                    *[allele for gt in gts for allele in gt if allele]\n+                )\n+            ):\n+                if any(nuc != nucs[0] for nuc in nucs[1:]):\n+                    break\n+            else:\n+                n += 1\n+            if n > 0:\n+                cols[ref_idx] = cols[ref_idx][n:] or '-'\n+                for gt in gts:\n+                    for idx, allele in enumerate(gt):\n+                        if allele:\n+                            gt[idx] = allele[n:] or '-'\n+                if cols[ref_idx] == '-':\n+                    n -= 1\n+                cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n)\n+\n+        # in-place substitution of so_effect with MAF effect\n+        cols[variant_classification_idx] = VariantEffect(\n+            cols[variant_type_idx]\n+        )[cols[variant_classification_idx]]\n+        ret_line = '\\t'.join([cols[n] for n in cols_to_print])\n+\n+        field_formatters = {\n+            'tumor_seq_allele1': gts[0][0],\n+            'tumor_seq_allele2': gts[0][1],\n+            'match_norm_seq_allele1': gts[1][0],\n+            'match_norm_seq_allele2': gts[1][1],\n+            't_alt_count': gt_alt_depths[0],\n+            'n_alt_count': gt_alt_depths[1],\n+            't_ref_count': gt_ref_depths[0],\n+            'n_ref_count': gt_ref_depths[1],\n+        }\n+\n+        print(\n+            # use safe_substitute here to avoid key errors with column content\n+            # looking like unknown placeholders\n+            string.Template(ret_line).safe_substitute(field_formatters)\n+        )\n"
b
diff -r 12112e6e5ea4 -r df57188562f0 test-data/gemini_query_as_maf_result.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gemini_query_as_maf_result.tabular Tue Mar 10 06:17:40 2020 -0400
b
@@ -0,0 +1,39 @@
+Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer HGVSp_Short t_alt_count t_ref_count n_alt_count n_ref_count
+unknown 37 1 10583 10583 5'Flank snp G A novel test Somatic p.=
+unknown 37 1 10611 10611 5'Flank snp C G novel test Somatic p.=
+unknown 37 1 13302 13302 RNA snp C T novel test Somatic p.=
+unknown 37 1 13327 13327 RNA snp G C novel test Somatic p.=
+unknown 37 1 13958 13958 RNA del C - novel test Somatic p.=
+unknown 37 1 13980 13980 RNA snp T C novel test Somatic p.=
+unknown 37 1 30923 30923 3'Flank snp G T novel test Somatic p.=
+unknown 37 1 46402 46402 IGR ins - TGT novel test Somatic p.=
+unknown 37 1 47190 47190 IGR ins - A novel test Somatic p.=
+unknown 37 1 51476 51476 IGR snp T C novel test Somatic p.=
+unknown 37 1 51479 51479 IGR snp T A novel test Somatic p.=
+unknown 37 1 51914 51914 IGR snp T G novel test Somatic p.=
+unknown 37 1 51935 51935 IGR snp C T novel test Somatic p.=
+unknown 37 1 51954 51954 IGR snp G C novel test Somatic p.=
+unknown 37 1 52058 52058 IGR snp G C novel test Somatic p.=
+unknown 37 1 52144 52144 IGR snp T A novel test Somatic p.=
+unknown 37 1 52186 52188 IGR del TAA - novel test Somatic p.=
+unknown 37 1 52238 52238 IGR snp T G novel test Somatic p.=
+unknown 37 1 53235 53236 IGR del AT - novel test Somatic p.=
+unknown 37 1 54353 54353 IGR snp C A novel test Somatic p.=
+unknown 37 1 54421 54421 IGR snp A G novel test Somatic p.=
+unknown 37 1 54490 54490 IGR snp G A novel test Somatic p.=
+unknown 37 1 54676 54676 IGR snp C T novel test Somatic p.=
+unknown 37 1 54753 54753 IGR snp T G novel test Somatic p.=
+unknown 37 1 55164 55164 IGR snp C A novel test Somatic p.=
+unknown 37 1 55249 55249 IGR ins - TATGG novel test Somatic p.=
+unknown 37 1 55299 55299 IGR snp C T novel test Somatic p.=
+unknown 37 1 55313 55313 IGR snp A T novel test Somatic p.=
+unknown 37 1 55326 55326 IGR snp T C novel test Somatic p.=
+unknown 37 1 55330 55330 IGR snp G A novel test Somatic p.=
+unknown 37 1 55367 55367 IGR snp G A novel test Somatic p.=
+unknown 37 1 55388 55388 IGR snp C T novel test Somatic p.=
+unknown 37 1 55394 55394 IGR snp T A novel test Somatic p.=
+unknown 37 1 55416 55416 IGR snp G A novel test Somatic p.=
+unknown 37 1 55427 55427 IGR snp T C novel test Somatic p.=
+unknown 37 1 55816 55816 IGR snp G A novel test Somatic p.=
+unknown 37 1 55850 55850 IGR snp C G novel test Somatic p.=
+unknown 37 1 55852 55852 IGR snp G C novel test Somatic p.=