Previous changeset 6:840fb4850be3 (2020-01-24) Next changeset 8:77a1e60fd1de (2022-07-13) |
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246" |
modified:
gemini_query.xml |
added:
gemini_mafify.py test-data/gemini_query_as_maf_result.tabular |
b |
diff -r 840fb4850be3 -r da74170c55c7 gemini_mafify.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_mafify.py Tue Mar 10 06:14:55 2020 -0400 |
[ |
b'@@ -0,0 +1,270 @@\n+import string\n+import sys\n+\n+\n+so_to_maf = {\n+ \'splice_acceptor_variant\': \'Splice_Site\',\n+ \'splice_donor_variant\': \'Splice_Site\',\n+ \'transcript_ablation\': \'Splice_Site\',\n+ \'exon_loss_variant\': \'Splice_Site\',\n+ \'stop_gained\': \'Nonsense_Mutation\',\n+ \'stop_lost\': \'Nonstop_Mutation\',\n+ \'frameshift_variant\': \'Frame_Shift_\',\n+ \'initiator_codon_variant\': \'Translation_Start_Site\',\n+ \'start_lost\': \'Translation_Start_Site\',\n+ \'inframe_insertion\': \'In_Frame_Ins\',\n+ \'inframe_deletion\': \'In_Frame_Del\',\n+ \'conservative_inframe_insertion\': \'In_Frame_Ins\',\n+ \'conservative_inframe_deletion\': \'In_Frame_Del\',\n+ \'disruptive_inframe_insertion\': \'In_Frame_Ins\',\n+ \'disruptive_inframe_deletion\': \'In_Frame_Del\',\n+ \'missense_variant\': \'Missense_Mutation\',\n+ \'coding_sequence_variant\': \'Missense_Mutation\',\n+ \'conservative_missense_variant\': \'Missense_Mutation\',\n+ \'rare_amino_acid_variant\': \'Missense_Mutation\',\n+ \'transcript_amplification\': \'Intron\',\n+ \'intron_variant\': \'Intron\',\n+ \'INTRAGENIC\': \'Intron\',\n+ \'intragenic_variant\': \'Intron\',\n+ \'splice_region_variant\': \'Splice_Region\',\n+ \'mature_miRNA_variant\': \'RNA\',\n+ \'exon_variant\': \'RNA\',\n+ \'non_coding_exon_variant\': \'RNA\',\n+ \'non_coding_transcript_exon_variant\': \'RNA\',\n+ \'non_coding_transcript_variant\': \'RNA\',\n+ \'nc_transcript_variant\': \'RNA\',\n+ \'stop_retained_variant\': \'Silent\',\n+ \'synonymous_variant\': \'Silent\',\n+ \'NMD_transcript_variant\': \'Silent\',\n+ \'incomplete_terminal_codon_variant\': \'Silent\',\n+ \'5_prime_UTR_variant\': "5\'UTR",\n+ \'5_prime_UTR_premature_start_codon_gain_variant\': "5\'UTR",\n+ \'3_prime_UTR_variant\': "3\'UTR",\n+ \'intergenic_variant\': \'IGR\',\n+ \'intergenic_region\': \'IGR\',\n+ \'regulatory_region_variant\': \'IGR\',\n+ \'regulatory_region\': \'IGR\',\n+ \'TF_binding_site_variant\': \'IGR\',\n+ \'upstream_gene_variant\': "5\'Flank",\n+ \'downstream_gene_variant\': "3\'Flank",\n+}\n+\n+\n+class VariantEffect():\n+ def __init__(self, variant_type):\n+ self.variant_type = variant_type.capitalize()\n+ assert self.variant_type in [\'Snp\', \'Ins\', \'Del\']\n+\n+ def __getitem__(self, so_effect):\n+ if so_effect not in so_to_maf or (\n+ \'frame\' in so_effect and self.variant_type == \'Snp\'\n+ ):\n+ return \'Targeted_Region\'\n+\n+ ret = so_to_maf[so_effect]\n+ if ret == \'Frame_Shift_\':\n+ ret += self.variant_type\n+ return ret\n+\n+\n+infile = sys.argv[1]\n+if len(sys.argv) > 2:\n+ tumor_sample_name = sys.argv[2]\n+if len(sys.argv) > 3:\n+ normal_sample_name = sys.argv[3]\n+\n+start_pos_idx = None\n+ref_idx = None\n+alt_idx = None\n+variant_type_idx = None\n+variant_classification_idx = None\n+gt_alt_depths_idx = {}\n+gt_ref_depths_idx = {}\n+gts_idx = {}\n+samples = set()\n+required_fields = [\n+ \'Hugo_Symbol\',\n+ \'NCBI_Build\',\n+ \'Variant_Type\',\n+ \'Variant_Classification\',\n+ \'Tumor_Sample_Barcode\',\n+ \'HGVSp_Short\'\n+]\n+\n+\n+with open(infile) as data_in:\n+ cols = data_in.readline().rstrip().split(\'\\t\')\n+ for field in required_fields:\n+ if field not in cols:\n+ raise IndexError(\n+ \'Cannot generate valid MAF without the following input \'\n+ \'columns: {0}.\\n\'\n+ \'Missing column: "{1}"\'\n+ .format(required_fields, field)\n+ )\n+ for i, col in enumerate(cols):\n+ if col == \'Variant_Type\':\n+ variant_type_idx = i\n+ elif col == \'Variant_Classification\':\n+ variant_classification_idx = i\n+ elif col == \'Start_Position\':\n+ start_pos_idx = i\n+ elif col == \'Reference_Allele\':\n+ ref_idx = i\n+ elif col == \'alt\':\n+ alt_idx = i\n+ else:\n+ column, _, sample = col.partition(\'.\')\n+ if sample:\n+ if column == \'gt_alt_depths\':\n+ gt_alt_depths_idx[sample] = i\n+ elif column == \'gt_ref'..b"\n+ )\n+\n+ if normal_sample_name and normal_sample_name not in samples:\n+ raise ValueError(\n+ 'Could not find information about the specified normal sample '\n+ 'in the input.'\n+ )\n+\n+ # All input data checks passed!\n+ # Now extract just the relevant index numbers for the tumor/normal pair\n+ gts_idx = (\n+ gts_idx.get(tumor_sample_name, alt_idx),\n+ gts_idx.get(normal_sample_name)\n+ )\n+ gt_alt_depths_idx = (\n+ gt_alt_depths_idx.get(tumor_sample_name),\n+ gt_alt_depths_idx.get(normal_sample_name)\n+ )\n+ gt_ref_depths_idx = (\n+ gt_ref_depths_idx.get(tumor_sample_name),\n+ gt_ref_depths_idx.get(normal_sample_name)\n+ )\n+\n+ # Echo all MAF column names\n+ cols_to_print = []\n+ for n in range(len(cols)):\n+ if n in gts_idx:\n+ continue\n+ if n in gt_alt_depths_idx:\n+ continue\n+ if n in gt_ref_depths_idx:\n+ continue\n+ if n != alt_idx:\n+ cols_to_print.append(n)\n+\n+ print('\\t'.join([cols[n] for n in cols_to_print]))\n+\n+ for line in data_in:\n+ cols = line.rstrip().split('\\t')\n+\n+ gt_alt_depths = [\n+ int(cols[ad_idx]) if ad_idx else ''\n+ for ad_idx in gt_alt_depths_idx\n+ ]\n+ gt_ref_depths = [\n+ int(cols[rd_idx]) if rd_idx else ''\n+ for rd_idx in gt_ref_depths_idx\n+ ]\n+\n+ gts = [\n+ ['', ''],\n+ ['', '']\n+ ]\n+ for n, gt_idx in enumerate(gts_idx):\n+ if gt_idx:\n+ gt_sep = '/' if '/' in cols[gt_idx] else '|'\n+ allele1, _, allele2 = [\n+ '' if allele == '.' else allele\n+ for allele in cols[gt_idx].partition(gt_sep)\n+ ]\n+ # follow cBioportal recommendation to leave allele1 empty\n+ # when information is not avaliable\n+ if not allele2:\n+ gts[n] = [allele2, allele1]\n+ else:\n+ gts[n] = [allele1, allele2]\n+ if not gts:\n+ gts = [['', ''], ['', '']]\n+\n+ if cols[variant_type_idx].lower() in ['ins', 'del']:\n+ # transform VCF-style indel representations into MAF ones\n+ ref_allele = cols[ref_idx]\n+ for n, nucs in enumerate(\n+ zip(\n+ ref_allele,\n+ *[allele for gt in gts for allele in gt if allele]\n+ )\n+ ):\n+ if any(nuc != nucs[0] for nuc in nucs[1:]):\n+ break\n+ else:\n+ n += 1\n+ if n > 0:\n+ cols[ref_idx] = cols[ref_idx][n:] or '-'\n+ for gt in gts:\n+ for idx, allele in enumerate(gt):\n+ if allele:\n+ gt[idx] = allele[n:] or '-'\n+ if cols[ref_idx] == '-':\n+ n -= 1\n+ cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n)\n+\n+ # in-place substitution of so_effect with MAF effect\n+ cols[variant_classification_idx] = VariantEffect(\n+ cols[variant_type_idx]\n+ )[cols[variant_classification_idx]]\n+ ret_line = '\\t'.join([cols[n] for n in cols_to_print])\n+\n+ field_formatters = {\n+ 'tumor_seq_allele1': gts[0][0],\n+ 'tumor_seq_allele2': gts[0][1],\n+ 'match_norm_seq_allele1': gts[1][0],\n+ 'match_norm_seq_allele2': gts[1][1],\n+ 't_alt_count': gt_alt_depths[0],\n+ 'n_alt_count': gt_alt_depths[1],\n+ 't_ref_count': gt_ref_depths[0],\n+ 'n_ref_count': gt_ref_depths[1],\n+ }\n+\n+ print(\n+ # use safe_substitute here to avoid key errors with column content\n+ # looking like unknown placeholders\n+ string.Template(ret_line).safe_substitute(field_formatters)\n+ )\n" |
b |
diff -r 840fb4850be3 -r da74170c55c7 gemini_query.xml --- a/gemini_query.xml Fri Jan 24 17:31:00 2020 -0500 +++ b/gemini_query.xml Tue Mar 10 06:14:55 2020 -0400 |
[ |
b'@@ -1,4 +1,4 @@\n-<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@">\n+<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@+galaxy1">\n <description>Querying the GEMINI database</description>\n <macros>\n <import>gemini_macros.xml</import>\n@@ -27,6 +27,13 @@\n <param argument="--dgidb" name="dgidb" type="boolean" truevalue="--dgidb" falsevalue="" checked="False"\n label="Request drug-gene interaction info from DGIdb" help="" />\n </xml>\n+ <xml name="maf_extra_info">\n+ <param name="tumor_sample_name" type="text"\n+ label="Name of the tumor sample in the (multi-sample) GEMINI database"\n+ help="Specify only if the tumor sample is not the only sample in the database." />\n+ <param name="normal_sample_name" type="text"\n+ label="Name of the normal sample in the GEMINI database (for matched tumor/normal sample pair analyses only)" />\n+ </xml>\n </macros>\n <expand macro="requirements" />\n <expand macro="stdio" />\n@@ -69,7 +76,7 @@\n #else:\n affected\n #end if\n- #else:\n+ #elif str($query.oformat.report.format) != \'maf\':\n --format ${query.oformat.report.format}\n #end if\n \n@@ -77,11 +84,36 @@\n ## build the SQL query string from its components\n #if str($query.oformat.report.format) in (\'vcf\', \'tped\'):\n #set $cols = "*"\n+ #elif str($query.oformat.report.format) == \'maf\':\n+ #if str($query.oformat.report.tumor_sample_name):\n+ #set $gt_string = \'gt_alt_depths.{0}, gt_ref_depths.{0}, gts.{0}\'.format(str($query.oformat.report.tumor_sample_name))\n+ #if str($query.oformat.report.normal_sample_name):\n+ #set $gt_string = $gt_string + \', gt_alt_depths.{0}, gt_ref_depths.{0}, gts.{0}\'.format(str($query.oformat.report.normal_sample_name))\n+ #end if\n+ #else:\n+ #set $gt_string = \'(gt_alt_depths).(*), (gt_ref_depths).(*), (gts).(*)\'\n+ #end if\n+ #if str($query.oformat.report.mutation_status.status_select) == \'custom\':\n+ ## Need to quote the user-specified mutation status for the SQL query\n+ #set $mutation_status = \'"%s"\' % str($query.oformat.report.mutation_status.status_custom)\n+ #elif str($query.oformat.report.mutation_status.status_select) == \'expression\':\n+ ## For custom expressions, it is up to the user to ensure valid syntax\n+ #set $mutation_status = str($query.oformat.report.mutation_status.status_expression)\n+ #else:\n+ ## The user selected a fixed value from the list, but\n+ ## it still needs quoting.\n+ #set $mutation_status = \'"%s"\' % str($query.oformat.report.mutation_status.status_select)\n+ #end if\n+ #set $cols = \'ifnull(g1.gene, "unknown") AS Hugo_Symbol, ifnull(ifnull(g2.entrez_id, g1.entrez_id), "") AS Entrez_Gene_Id, "" AS Center, "37" AS NCBI_Build, replace(v.chrom, "chr", "") AS Chromosome, v.start + 1 AS Start_Position, v.end AS End_Position, "" as Strand, v.impact_so AS Variant_Classification, ifnull(nullif(v.type, "indel"), v.sub_type) AS Variant_Type, v.ref AS Reference_Allele, "${tumor_seq_allele1}" AS Tumor_Seq_Allele1, "${tumor_seq_allele2}" AS Tumor_Seq_Allele2, ifnull(v.rs_ids, ifnull(nullif(ifnull(nullif(v.in_omim = 0 AND v.cosmic_ids IS NULL AND v.max_aaf_all = -1, 1), "novel"), 0), "")) AS dbSNP_RS, "" AS dbSNP_Val_Status, printf("%s", "\' + str($query.oformat.report.tumor_sample_id) + \'") AS Tumor_Sample_Barcode, printf("%s", "\' + str($query.oformat.report.norm_sample_id) + \'") AS Matched'..b'when value="Somatic" />\n+ <when value="Germline" />\n+ <when value="LOH" />\n+ <when value="Wildtype" />\n+ <when value="None" />\n+ <when value="custom">\n+ <param name="status_custom" type="text"\n+ label="Mutation status (custom value)">\n+ <validator type="expression" message="Need a value for Mutation status">value.strip()</validator>\n+ </param>\n+ </when>\n+ <when value="expression">\n+ <param name="status_expression" type="text"\n+ label="SQL expression used to compute per-variant status"\n+ help="Enter a valid SQL result column expression to compute the mutation status from columns of the variants table in the GEMINI database. As one example, the expression ifnull(nullif(ifnull(nullif(ifnull(nullif(somatic_status, 3), \'LOH\'), 2), \'Somatic\'), 1), \'Germline\') assumes that you have a column somatic_status added to the variants table of your database, and will record \'Germline\', \'Somatic\', or \'LOH\' for variants with a value of 1, 2, or 3 in that column, respectively.">\n+ <expand macro="sanitize_query" />\n+ <validator type="expression" message="Mutation status expression cannot be empty">value.strip()</validator>\n+ </param>\n+ </when>\n+ </conditional>\n+ <expand macro="sorting" />\n+ </when>\n <when value="tped">\n <param name="header" type="hidden" value="" />\n <param name="dgidb" type="hidden" value="" />\n@@ -222,6 +305,7 @@\n <option value="carrier_summary">tabular with carrier summary</option>\n <option value="vcf">VCF (simplified)</option>\n <option value="json">JSON</option>\n+ <option value="maf">MAF (cBioportal-compatible)</option>\n <option value="tped">TPED</option>\n </param>\n <when value="default">\n@@ -255,6 +339,11 @@\n <param name="header" type="hidden" value="" />\n <param name="dgidb" type="hidden" value="" />\n </when>\n+ <when value="maf">\n+ <param name="header" type="hidden" value="--header" />\n+ <param name="dgidb" type="hidden" value="" />\n+ <expand macro="maf_extra_info" />\n+ </when>\n <when value="tped">\n <param name="header" type="hidden" value="" />\n <param name="dgidb" type="hidden" value="" />\n@@ -285,6 +374,19 @@\n </assert_contents>\n </output>\n </test>\n+ <test>\n+ <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />\n+ <conditional name="query">\n+ <param name="interface" value="basic" />\n+ </conditional>\n+ <section name="oformat">\n+ <conditional name="report">\n+ <param name="format" value="maf" />\n+ <param name="tumor_sample_id" value="test" />\n+ </conditional>\n+ </section>\n+ <output name="outfile" file="gemini_query_as_maf_result.tabular" />\n+ </test>\n </tests>\n <help>\n <![CDATA[\n' |
b |
diff -r 840fb4850be3 -r da74170c55c7 test-data/gemini_query_as_maf_result.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gemini_query_as_maf_result.tabular Tue Mar 10 06:14:55 2020 -0400 |
b |
@@ -0,0 +1,39 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer HGVSp_Short t_alt_count t_ref_count n_alt_count n_ref_count +unknown 37 1 10583 10583 5'Flank snp G A novel test Somatic p.= +unknown 37 1 10611 10611 5'Flank snp C G novel test Somatic p.= +unknown 37 1 13302 13302 RNA snp C T novel test Somatic p.= +unknown 37 1 13327 13327 RNA snp G C novel test Somatic p.= +unknown 37 1 13958 13958 RNA del C - novel test Somatic p.= +unknown 37 1 13980 13980 RNA snp T C novel test Somatic p.= +unknown 37 1 30923 30923 3'Flank snp G T novel test Somatic p.= +unknown 37 1 46402 46402 IGR ins - TGT novel test Somatic p.= +unknown 37 1 47190 47190 IGR ins - A novel test Somatic p.= +unknown 37 1 51476 51476 IGR snp T C novel test Somatic p.= +unknown 37 1 51479 51479 IGR snp T A novel test Somatic p.= +unknown 37 1 51914 51914 IGR snp T G novel test Somatic p.= +unknown 37 1 51935 51935 IGR snp C T novel test Somatic p.= +unknown 37 1 51954 51954 IGR snp G C novel test Somatic p.= +unknown 37 1 52058 52058 IGR snp G C novel test Somatic p.= +unknown 37 1 52144 52144 IGR snp T A novel test Somatic p.= +unknown 37 1 52186 52188 IGR del TAA - novel test Somatic p.= +unknown 37 1 52238 52238 IGR snp T G novel test Somatic p.= +unknown 37 1 53235 53236 IGR del AT - novel test Somatic p.= +unknown 37 1 54353 54353 IGR snp C A novel test Somatic p.= +unknown 37 1 54421 54421 IGR snp A G novel test Somatic p.= +unknown 37 1 54490 54490 IGR snp G A novel test Somatic p.= +unknown 37 1 54676 54676 IGR snp C T novel test Somatic p.= +unknown 37 1 54753 54753 IGR snp T G novel test Somatic p.= +unknown 37 1 55164 55164 IGR snp C A novel test Somatic p.= +unknown 37 1 55249 55249 IGR ins - TATGG novel test Somatic p.= +unknown 37 1 55299 55299 IGR snp C T novel test Somatic p.= +unknown 37 1 55313 55313 IGR snp A T novel test Somatic p.= +unknown 37 1 55326 55326 IGR snp T C novel test Somatic p.= +unknown 37 1 55330 55330 IGR snp G A novel test Somatic p.= +unknown 37 1 55367 55367 IGR snp G A novel test Somatic p.= +unknown 37 1 55388 55388 IGR snp C T novel test Somatic p.= +unknown 37 1 55394 55394 IGR snp T A novel test Somatic p.= +unknown 37 1 55416 55416 IGR snp G A novel test Somatic p.= +unknown 37 1 55427 55427 IGR snp T C novel test Somatic p.= +unknown 37 1 55816 55816 IGR snp G A novel test Somatic p.= +unknown 37 1 55850 55850 IGR snp C G novel test Somatic p.= +unknown 37 1 55852 55852 IGR snp G C novel test Somatic p.= |