Repository 'gemini_query'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/gemini_query

Changeset 7:da74170c55c7 (2020-03-10)
Previous changeset 6:840fb4850be3 (2020-01-24) Next changeset 8:77a1e60fd1de (2022-07-13)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"
modified:
gemini_query.xml
added:
gemini_mafify.py
test-data/gemini_query_as_maf_result.tabular
b
diff -r 840fb4850be3 -r da74170c55c7 gemini_mafify.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gemini_mafify.py Tue Mar 10 06:14:55 2020 -0400
[
b'@@ -0,0 +1,270 @@\n+import string\n+import sys\n+\n+\n+so_to_maf = {\n+    \'splice_acceptor_variant\': \'Splice_Site\',\n+    \'splice_donor_variant\': \'Splice_Site\',\n+    \'transcript_ablation\': \'Splice_Site\',\n+    \'exon_loss_variant\': \'Splice_Site\',\n+    \'stop_gained\': \'Nonsense_Mutation\',\n+    \'stop_lost\': \'Nonstop_Mutation\',\n+    \'frameshift_variant\': \'Frame_Shift_\',\n+    \'initiator_codon_variant\': \'Translation_Start_Site\',\n+    \'start_lost\': \'Translation_Start_Site\',\n+    \'inframe_insertion\': \'In_Frame_Ins\',\n+    \'inframe_deletion\': \'In_Frame_Del\',\n+    \'conservative_inframe_insertion\': \'In_Frame_Ins\',\n+    \'conservative_inframe_deletion\': \'In_Frame_Del\',\n+    \'disruptive_inframe_insertion\': \'In_Frame_Ins\',\n+    \'disruptive_inframe_deletion\': \'In_Frame_Del\',\n+    \'missense_variant\': \'Missense_Mutation\',\n+    \'coding_sequence_variant\': \'Missense_Mutation\',\n+    \'conservative_missense_variant\': \'Missense_Mutation\',\n+    \'rare_amino_acid_variant\': \'Missense_Mutation\',\n+    \'transcript_amplification\': \'Intron\',\n+    \'intron_variant\': \'Intron\',\n+    \'INTRAGENIC\': \'Intron\',\n+    \'intragenic_variant\': \'Intron\',\n+    \'splice_region_variant\': \'Splice_Region\',\n+    \'mature_miRNA_variant\': \'RNA\',\n+    \'exon_variant\': \'RNA\',\n+    \'non_coding_exon_variant\': \'RNA\',\n+    \'non_coding_transcript_exon_variant\': \'RNA\',\n+    \'non_coding_transcript_variant\': \'RNA\',\n+    \'nc_transcript_variant\': \'RNA\',\n+    \'stop_retained_variant\': \'Silent\',\n+    \'synonymous_variant\': \'Silent\',\n+    \'NMD_transcript_variant\': \'Silent\',\n+    \'incomplete_terminal_codon_variant\': \'Silent\',\n+    \'5_prime_UTR_variant\': "5\'UTR",\n+    \'5_prime_UTR_premature_start_codon_gain_variant\': "5\'UTR",\n+    \'3_prime_UTR_variant\': "3\'UTR",\n+    \'intergenic_variant\': \'IGR\',\n+    \'intergenic_region\': \'IGR\',\n+    \'regulatory_region_variant\': \'IGR\',\n+    \'regulatory_region\': \'IGR\',\n+    \'TF_binding_site_variant\': \'IGR\',\n+    \'upstream_gene_variant\': "5\'Flank",\n+    \'downstream_gene_variant\': "3\'Flank",\n+}\n+\n+\n+class VariantEffect():\n+    def __init__(self, variant_type):\n+        self.variant_type = variant_type.capitalize()\n+        assert self.variant_type in [\'Snp\', \'Ins\', \'Del\']\n+\n+    def __getitem__(self, so_effect):\n+        if so_effect not in so_to_maf or (\n+            \'frame\' in so_effect and self.variant_type == \'Snp\'\n+        ):\n+            return \'Targeted_Region\'\n+\n+        ret = so_to_maf[so_effect]\n+        if ret == \'Frame_Shift_\':\n+            ret += self.variant_type\n+        return ret\n+\n+\n+infile = sys.argv[1]\n+if len(sys.argv) > 2:\n+    tumor_sample_name = sys.argv[2]\n+if len(sys.argv) > 3:\n+    normal_sample_name = sys.argv[3]\n+\n+start_pos_idx = None\n+ref_idx = None\n+alt_idx = None\n+variant_type_idx = None\n+variant_classification_idx = None\n+gt_alt_depths_idx = {}\n+gt_ref_depths_idx = {}\n+gts_idx = {}\n+samples = set()\n+required_fields = [\n+    \'Hugo_Symbol\',\n+    \'NCBI_Build\',\n+    \'Variant_Type\',\n+    \'Variant_Classification\',\n+    \'Tumor_Sample_Barcode\',\n+    \'HGVSp_Short\'\n+]\n+\n+\n+with open(infile) as data_in:\n+    cols = data_in.readline().rstrip().split(\'\\t\')\n+    for field in required_fields:\n+        if field not in cols:\n+            raise IndexError(\n+                \'Cannot generate valid MAF without the following input \'\n+                \'columns: {0}.\\n\'\n+                \'Missing column: "{1}"\'\n+                .format(required_fields, field)\n+            )\n+    for i, col in enumerate(cols):\n+        if col == \'Variant_Type\':\n+            variant_type_idx = i\n+        elif col == \'Variant_Classification\':\n+            variant_classification_idx = i\n+        elif col == \'Start_Position\':\n+            start_pos_idx = i\n+        elif col == \'Reference_Allele\':\n+            ref_idx = i\n+        elif col == \'alt\':\n+            alt_idx = i\n+        else:\n+            column, _, sample = col.partition(\'.\')\n+            if sample:\n+                if column == \'gt_alt_depths\':\n+                    gt_alt_depths_idx[sample] = i\n+                elif column == \'gt_ref'..b"\n+            )\n+\n+    if normal_sample_name and normal_sample_name not in samples:\n+        raise ValueError(\n+            'Could not find information about the specified normal sample '\n+            'in the input.'\n+        )\n+\n+    # All input data checks passed!\n+    # Now extract just the relevant index numbers for the tumor/normal pair\n+    gts_idx = (\n+        gts_idx.get(tumor_sample_name, alt_idx),\n+        gts_idx.get(normal_sample_name)\n+    )\n+    gt_alt_depths_idx = (\n+        gt_alt_depths_idx.get(tumor_sample_name),\n+        gt_alt_depths_idx.get(normal_sample_name)\n+    )\n+    gt_ref_depths_idx = (\n+        gt_ref_depths_idx.get(tumor_sample_name),\n+        gt_ref_depths_idx.get(normal_sample_name)\n+    )\n+\n+    # Echo all MAF column names\n+    cols_to_print = []\n+    for n in range(len(cols)):\n+        if n in gts_idx:\n+            continue\n+        if n in gt_alt_depths_idx:\n+            continue\n+        if n in gt_ref_depths_idx:\n+            continue\n+        if n != alt_idx:\n+            cols_to_print.append(n)\n+\n+    print('\\t'.join([cols[n] for n in cols_to_print]))\n+\n+    for line in data_in:\n+        cols = line.rstrip().split('\\t')\n+\n+        gt_alt_depths = [\n+            int(cols[ad_idx]) if ad_idx else ''\n+            for ad_idx in gt_alt_depths_idx\n+        ]\n+        gt_ref_depths = [\n+            int(cols[rd_idx]) if rd_idx else ''\n+            for rd_idx in gt_ref_depths_idx\n+        ]\n+\n+        gts = [\n+            ['', ''],\n+            ['', '']\n+        ]\n+        for n, gt_idx in enumerate(gts_idx):\n+            if gt_idx:\n+                gt_sep = '/' if '/' in cols[gt_idx] else '|'\n+                allele1, _, allele2 = [\n+                    '' if allele == '.' else allele\n+                    for allele in cols[gt_idx].partition(gt_sep)\n+                ]\n+                # follow cBioportal recommendation to leave allele1 empty\n+                # when information is not avaliable\n+                if not allele2:\n+                    gts[n] = [allele2, allele1]\n+                else:\n+                    gts[n] = [allele1, allele2]\n+        if not gts:\n+            gts = [['', ''], ['', '']]\n+\n+        if cols[variant_type_idx].lower() in ['ins', 'del']:\n+            # transform VCF-style indel representations into MAF ones\n+            ref_allele = cols[ref_idx]\n+            for n, nucs in enumerate(\n+                zip(\n+                    ref_allele,\n+                    *[allele for gt in gts for allele in gt if allele]\n+                )\n+            ):\n+                if any(nuc != nucs[0] for nuc in nucs[1:]):\n+                    break\n+            else:\n+                n += 1\n+            if n > 0:\n+                cols[ref_idx] = cols[ref_idx][n:] or '-'\n+                for gt in gts:\n+                    for idx, allele in enumerate(gt):\n+                        if allele:\n+                            gt[idx] = allele[n:] or '-'\n+                if cols[ref_idx] == '-':\n+                    n -= 1\n+                cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n)\n+\n+        # in-place substitution of so_effect with MAF effect\n+        cols[variant_classification_idx] = VariantEffect(\n+            cols[variant_type_idx]\n+        )[cols[variant_classification_idx]]\n+        ret_line = '\\t'.join([cols[n] for n in cols_to_print])\n+\n+        field_formatters = {\n+            'tumor_seq_allele1': gts[0][0],\n+            'tumor_seq_allele2': gts[0][1],\n+            'match_norm_seq_allele1': gts[1][0],\n+            'match_norm_seq_allele2': gts[1][1],\n+            't_alt_count': gt_alt_depths[0],\n+            'n_alt_count': gt_alt_depths[1],\n+            't_ref_count': gt_ref_depths[0],\n+            'n_ref_count': gt_ref_depths[1],\n+        }\n+\n+        print(\n+            # use safe_substitute here to avoid key errors with column content\n+            # looking like unknown placeholders\n+            string.Template(ret_line).safe_substitute(field_formatters)\n+        )\n"
b
diff -r 840fb4850be3 -r da74170c55c7 gemini_query.xml
--- a/gemini_query.xml Fri Jan 24 17:31:00 2020 -0500
+++ b/gemini_query.xml Tue Mar 10 06:14:55 2020 -0400
[
b'@@ -1,4 +1,4 @@\n-<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@">\n+<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@+galaxy1">\n     <description>Querying the GEMINI database</description>\n     <macros>\n         <import>gemini_macros.xml</import>\n@@ -27,6 +27,13 @@\n             <param argument="--dgidb" name="dgidb" type="boolean" truevalue="--dgidb" falsevalue="" checked="False"\n             label="Request drug-gene interaction info from DGIdb" help="" />\n         </xml>\n+        <xml name="maf_extra_info">\n+            <param name="tumor_sample_name" type="text"\n+            label="Name of the tumor sample in the (multi-sample) GEMINI database"\n+            help="Specify only if the tumor sample is not the only sample in the database." />\n+            <param name="normal_sample_name" type="text"\n+            label="Name of the normal sample in the GEMINI database (for matched tumor/normal sample pair analyses only)" />\n+        </xml>\n     </macros>\n     <expand macro="requirements" />\n     <expand macro="stdio" />\n@@ -69,7 +76,7 @@\n                 #else:\n                     affected\n                 #end if\n-            #else:\n+            #elif str($query.oformat.report.format) != \'maf\':\n                 --format ${query.oformat.report.format}\n             #end if\n \n@@ -77,11 +84,36 @@\n                 ## build the SQL query string from its components\n                 #if str($query.oformat.report.format) in (\'vcf\', \'tped\'):\n                     #set $cols = "*"\n+                #elif str($query.oformat.report.format) == \'maf\':\n+                    #if str($query.oformat.report.tumor_sample_name):\n+                        #set $gt_string = \'gt_alt_depths.{0}, gt_ref_depths.{0}, gts.{0}\'.format(str($query.oformat.report.tumor_sample_name))\n+                        #if str($query.oformat.report.normal_sample_name):\n+                            #set $gt_string = $gt_string + \', gt_alt_depths.{0}, gt_ref_depths.{0}, gts.{0}\'.format(str($query.oformat.report.normal_sample_name))\n+                        #end if\n+                    #else:\n+                        #set $gt_string = \'(gt_alt_depths).(*), (gt_ref_depths).(*), (gts).(*)\'\n+                    #end if\n+                    #if str($query.oformat.report.mutation_status.status_select) == \'custom\':\n+                        ## Need to quote the user-specified mutation status for the SQL query\n+                        #set $mutation_status = \'"%s"\' % str($query.oformat.report.mutation_status.status_custom)\n+                    #elif str($query.oformat.report.mutation_status.status_select) == \'expression\':\n+                        ## For custom expressions, it is up to the user to ensure valid syntax\n+                        #set $mutation_status = str($query.oformat.report.mutation_status.status_expression)\n+                    #else:\n+                        ## The user selected a fixed value from the list, but\n+                        ## it still needs quoting.\n+                        #set $mutation_status = \'"%s"\' % str($query.oformat.report.mutation_status.status_select)\n+                    #end if\n+                    #set $cols = \'ifnull(g1.gene, "unknown") AS Hugo_Symbol, ifnull(ifnull(g2.entrez_id, g1.entrez_id), "") AS Entrez_Gene_Id, "" AS Center, "37" AS NCBI_Build, replace(v.chrom, "chr", "") AS Chromosome, v.start + 1 AS Start_Position, v.end AS End_Position, "" as Strand, v.impact_so AS Variant_Classification, ifnull(nullif(v.type, "indel"), v.sub_type) AS Variant_Type, v.ref AS Reference_Allele, "${tumor_seq_allele1}" AS Tumor_Seq_Allele1, "${tumor_seq_allele2}" AS Tumor_Seq_Allele2, ifnull(v.rs_ids, ifnull(nullif(ifnull(nullif(v.in_omim = 0 AND v.cosmic_ids IS NULL AND v.max_aaf_all = -1, 1), "novel"), 0), "")) AS dbSNP_RS, "" AS dbSNP_Val_Status, printf("%s", "\' + str($query.oformat.report.tumor_sample_id) + \'") AS Tumor_Sample_Barcode, printf("%s", "\' + str($query.oformat.report.norm_sample_id) + \'") AS Matched'..b'when value="Somatic" />\n+                                <when value="Germline" />\n+                                <when value="LOH" />\n+                                <when value="Wildtype" />\n+                                <when value="None" />\n+                                <when value="custom">\n+                                    <param name="status_custom" type="text"\n+                                    label="Mutation status (custom value)">\n+                                        <validator type="expression" message="Need a value for Mutation status">value.strip()</validator>\n+                                    </param>\n+                                </when>\n+                                <when value="expression">\n+                                    <param name="status_expression" type="text"\n+                                    label="SQL expression used to compute per-variant status"\n+                                    help="Enter a valid SQL result column expression to compute the mutation status from columns of the variants table in the GEMINI database. As one example, the expression ifnull(nullif(ifnull(nullif(ifnull(nullif(somatic_status, 3), \'LOH\'), 2), \'Somatic\'), 1), \'Germline\') assumes that you have a column somatic_status added to the variants table of your database, and will record \'Germline\', \'Somatic\', or \'LOH\' for variants with a value of 1, 2, or 3 in that column, respectively.">\n+                                        <expand macro="sanitize_query" />\n+                                        <validator type="expression" message="Mutation status expression cannot be empty">value.strip()</validator>\n+                                    </param>\n+                                </when>\n+                            </conditional>\n+                            <expand macro="sorting" />\n+                        </when>\n                         <when value="tped">\n                             <param name="header" type="hidden" value="" />\n                             <param name="dgidb" type="hidden" value="" />\n@@ -222,6 +305,7 @@\n                             <option value="carrier_summary">tabular with carrier summary</option>\n                             <option value="vcf">VCF (simplified)</option>\n                             <option value="json">JSON</option>\n+                            <option value="maf">MAF (cBioportal-compatible)</option>\n                             <option value="tped">TPED</option>\n                         </param>\n                         <when value="default">\n@@ -255,6 +339,11 @@\n                             <param name="header" type="hidden" value="" />\n                             <param name="dgidb" type="hidden" value="" />\n                         </when>\n+                        <when value="maf">\n+                            <param name="header" type="hidden" value="--header" />\n+                            <param name="dgidb" type="hidden" value="" />\n+                            <expand macro="maf_extra_info" />\n+                        </when>\n                         <when value="tped">\n                             <param name="header" type="hidden" value="" />\n                             <param name="dgidb" type="hidden" value="" />\n@@ -285,6 +374,19 @@\n                 </assert_contents>\n             </output>\n         </test>\n+        <test>\n+            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />\n+            <conditional name="query">\n+                <param name="interface" value="basic" />\n+            </conditional>\n+            <section name="oformat">\n+                <conditional name="report">\n+                    <param name="format" value="maf" />\n+                    <param name="tumor_sample_id" value="test" />\n+                </conditional>\n+            </section>\n+            <output name="outfile" file="gemini_query_as_maf_result.tabular" />\n+        </test>\n     </tests>\n     <help>\n <![CDATA[\n'
b
diff -r 840fb4850be3 -r da74170c55c7 test-data/gemini_query_as_maf_result.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gemini_query_as_maf_result.tabular Tue Mar 10 06:14:55 2020 -0400
b
@@ -0,0 +1,39 @@
+Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer HGVSp_Short t_alt_count t_ref_count n_alt_count n_ref_count
+unknown 37 1 10583 10583 5'Flank snp G A novel test Somatic p.=
+unknown 37 1 10611 10611 5'Flank snp C G novel test Somatic p.=
+unknown 37 1 13302 13302 RNA snp C T novel test Somatic p.=
+unknown 37 1 13327 13327 RNA snp G C novel test Somatic p.=
+unknown 37 1 13958 13958 RNA del C - novel test Somatic p.=
+unknown 37 1 13980 13980 RNA snp T C novel test Somatic p.=
+unknown 37 1 30923 30923 3'Flank snp G T novel test Somatic p.=
+unknown 37 1 46402 46402 IGR ins - TGT novel test Somatic p.=
+unknown 37 1 47190 47190 IGR ins - A novel test Somatic p.=
+unknown 37 1 51476 51476 IGR snp T C novel test Somatic p.=
+unknown 37 1 51479 51479 IGR snp T A novel test Somatic p.=
+unknown 37 1 51914 51914 IGR snp T G novel test Somatic p.=
+unknown 37 1 51935 51935 IGR snp C T novel test Somatic p.=
+unknown 37 1 51954 51954 IGR snp G C novel test Somatic p.=
+unknown 37 1 52058 52058 IGR snp G C novel test Somatic p.=
+unknown 37 1 52144 52144 IGR snp T A novel test Somatic p.=
+unknown 37 1 52186 52188 IGR del TAA - novel test Somatic p.=
+unknown 37 1 52238 52238 IGR snp T G novel test Somatic p.=
+unknown 37 1 53235 53236 IGR del AT - novel test Somatic p.=
+unknown 37 1 54353 54353 IGR snp C A novel test Somatic p.=
+unknown 37 1 54421 54421 IGR snp A G novel test Somatic p.=
+unknown 37 1 54490 54490 IGR snp G A novel test Somatic p.=
+unknown 37 1 54676 54676 IGR snp C T novel test Somatic p.=
+unknown 37 1 54753 54753 IGR snp T G novel test Somatic p.=
+unknown 37 1 55164 55164 IGR snp C A novel test Somatic p.=
+unknown 37 1 55249 55249 IGR ins - TATGG novel test Somatic p.=
+unknown 37 1 55299 55299 IGR snp C T novel test Somatic p.=
+unknown 37 1 55313 55313 IGR snp A T novel test Somatic p.=
+unknown 37 1 55326 55326 IGR snp T C novel test Somatic p.=
+unknown 37 1 55330 55330 IGR snp G A novel test Somatic p.=
+unknown 37 1 55367 55367 IGR snp G A novel test Somatic p.=
+unknown 37 1 55388 55388 IGR snp C T novel test Somatic p.=
+unknown 37 1 55394 55394 IGR snp T A novel test Somatic p.=
+unknown 37 1 55416 55416 IGR snp G A novel test Somatic p.=
+unknown 37 1 55427 55427 IGR snp T C novel test Somatic p.=
+unknown 37 1 55816 55816 IGR snp G A novel test Somatic p.=
+unknown 37 1 55850 55850 IGR snp C G novel test Somatic p.=
+unknown 37 1 55852 55852 IGR snp G C novel test Somatic p.=