Galaxy |

Changeset 7:86e46972e183 (2020-03-10)

Previous changeset 6:137a3e07062e (2020-01-24) Next changeset 8:20f2ecf46dcb (2022-07-13)

Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"

added:
gemini_mafify.py
test-data/gemini_query_as_maf_result.tabular

diff -r 137a3e07062e -r 86e46972e183 gemini_mafify.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gemini_mafify.py Tue Mar 10 06:14:22 2020 -0400

[

b'@@ -0,0 +1,270 @@\n+import string\n+import sys\n+\n+\n+so_to_maf = {\n+ \'splice_acceptor_variant\': \'Splice_Site\',\n+ \'splice_donor_variant\': \'Splice_Site\',\n+ \'transcript_ablation\': \'Splice_Site\',\n+ \'exon_loss_variant\': \'Splice_Site\',\n+ \'stop_gained\': \'Nonsense_Mutation\',\n+ \'stop_lost\': \'Nonstop_Mutation\',\n+ \'frameshift_variant\': \'Frame_Shift_\',\n+ \'initiator_codon_variant\': \'Translation_Start_Site\',\n+ \'start_lost\': \'Translation_Start_Site\',\n+ \'inframe_insertion\': \'In_Frame_Ins\',\n+ \'inframe_deletion\': \'In_Frame_Del\',\n+ \'conservative_inframe_insertion\': \'In_Frame_Ins\',\n+ \'conservative_inframe_deletion\': \'In_Frame_Del\',\n+ \'disruptive_inframe_insertion\': \'In_Frame_Ins\',\n+ \'disruptive_inframe_deletion\': \'In_Frame_Del\',\n+ \'missense_variant\': \'Missense_Mutation\',\n+ \'coding_sequence_variant\': \'Missense_Mutation\',\n+ \'conservative_missense_variant\': \'Missense_Mutation\',\n+ \'rare_amino_acid_variant\': \'Missense_Mutation\',\n+ \'transcript_amplification\': \'Intron\',\n+ \'intron_variant\': \'Intron\',\n+ \'INTRAGENIC\': \'Intron\',\n+ \'intragenic_variant\': \'Intron\',\n+ \'splice_region_variant\': \'Splice_Region\',\n+ \'mature_miRNA_variant\': \'RNA\',\n+ \'exon_variant\': \'RNA\',\n+ \'non_coding_exon_variant\': \'RNA\',\n+ \'non_coding_transcript_exon_variant\': \'RNA\',\n+ \'non_coding_transcript_variant\': \'RNA\',\n+ \'nc_transcript_variant\': \'RNA\',\n+ \'stop_retained_variant\': \'Silent\',\n+ \'synonymous_variant\': \'Silent\',\n+ \'NMD_transcript_variant\': \'Silent\',\n+ \'incomplete_terminal_codon_variant\': \'Silent\',\n+ \'5_prime_UTR_variant\': "5\'UTR",\n+ \'5_prime_UTR_premature_start_codon_gain_variant\': "5\'UTR",\n+ \'3_prime_UTR_variant\': "3\'UTR",\n+ \'intergenic_variant\': \'IGR\',\n+ \'intergenic_region\': \'IGR\',\n+ \'regulatory_region_variant\': \'IGR\',\n+ \'regulatory_region\': \'IGR\',\n+ \'TF_binding_site_variant\': \'IGR\',\n+ \'upstream_gene_variant\': "5\'Flank",\n+ \'downstream_gene_variant\': "3\'Flank",\n+}\n+\n+\n+class VariantEffect():\n+ def __init__(self, variant_type):\n+ self.variant_type = variant_type.capitalize()\n+ assert self.variant_type in [\'Snp\', \'Ins\', \'Del\']\n+\n+ def __getitem__(self, so_effect):\n+ if so_effect not in so_to_maf or (\n+ \'frame\' in so_effect and self.variant_type == \'Snp\'\n+ ):\n+ return \'Targeted_Region\'\n+\n+ ret = so_to_maf[so_effect]\n+ if ret == \'Frame_Shift_\':\n+ ret += self.variant_type\n+ return ret\n+\n+\n+infile = sys.argv[1]\n+if len(sys.argv) > 2:\n+ tumor_sample_name = sys.argv[2]\n+if len(sys.argv) > 3:\n+ normal_sample_name = sys.argv[3]\n+\n+start_pos_idx = None\n+ref_idx = None\n+alt_idx = None\n+variant_type_idx = None\n+variant_classification_idx = None\n+gt_alt_depths_idx = {}\n+gt_ref_depths_idx = {}\n+gts_idx = {}\n+samples = set()\n+required_fields = [\n+ \'Hugo_Symbol\',\n+ \'NCBI_Build\',\n+ \'Variant_Type\',\n+ \'Variant_Classification\',\n+ \'Tumor_Sample_Barcode\',\n+ \'HGVSp_Short\'\n+]\n+\n+\n+with open(infile) as data_in:\n+ cols = data_in.readline().rstrip().split(\'\\t\')\n+ for field in required_fields:\n+ if field not in cols:\n+ raise IndexError(\n+ \'Cannot generate valid MAF without the following input \'\n+ \'columns: {0}.\\n\'\n+ \'Missing column: "{1}"\'\n+ .format(required_fields, field)\n+ )\n+ for i, col in enumerate(cols):\n+ if col == \'Variant_Type\':\n+ variant_type_idx = i\n+ elif col == \'Variant_Classification\':\n+ variant_classification_idx = i\n+ elif col == \'Start_Position\':\n+ start_pos_idx = i\n+ elif col == \'Reference_Allele\':\n+ ref_idx = i\n+ elif col == \'alt\':\n+ alt_idx = i\n+ else:\n+ column, _, sample = col.partition(\'.\')\n+ if sample:\n+ if column == \'gt_alt_depths\':\n+ gt_alt_depths_idx[sample] = i\n+ elif column == \'gt_ref'..b"\n+ )\n+\n+ if normal_sample_name and normal_sample_name not in samples:\n+ raise ValueError(\n+ 'Could not find information about the specified normal sample '\n+ 'in the input.'\n+ )\n+\n+ # All input data checks passed!\n+ # Now extract just the relevant index numbers for the tumor/normal pair\n+ gts_idx = (\n+ gts_idx.get(tumor_sample_name, alt_idx),\n+ gts_idx.get(normal_sample_name)\n+ )\n+ gt_alt_depths_idx = (\n+ gt_alt_depths_idx.get(tumor_sample_name),\n+ gt_alt_depths_idx.get(normal_sample_name)\n+ )\n+ gt_ref_depths_idx = (\n+ gt_ref_depths_idx.get(tumor_sample_name),\n+ gt_ref_depths_idx.get(normal_sample_name)\n+ )\n+\n+ # Echo all MAF column names\n+ cols_to_print = []\n+ for n in range(len(cols)):\n+ if n in gts_idx:\n+ continue\n+ if n in gt_alt_depths_idx:\n+ continue\n+ if n in gt_ref_depths_idx:\n+ continue\n+ if n != alt_idx:\n+ cols_to_print.append(n)\n+\n+ print('\\t'.join([cols[n] for n in cols_to_print]))\n+\n+ for line in data_in:\n+ cols = line.rstrip().split('\\t')\n+\n+ gt_alt_depths = [\n+ int(cols[ad_idx]) if ad_idx else ''\n+ for ad_idx in gt_alt_depths_idx\n+ ]\n+ gt_ref_depths = [\n+ int(cols[rd_idx]) if rd_idx else ''\n+ for rd_idx in gt_ref_depths_idx\n+ ]\n+\n+ gts = [\n+ ['', ''],\n+ ['', '']\n+ ]\n+ for n, gt_idx in enumerate(gts_idx):\n+ if gt_idx:\n+ gt_sep = '/' if '/' in cols[gt_idx] else '|'\n+ allele1, _, allele2 = [\n+ '' if allele == '.' else allele\n+ for allele in cols[gt_idx].partition(gt_sep)\n+ ]\n+ # follow cBioportal recommendation to leave allele1 empty\n+ # when information is not avaliable\n+ if not allele2:\n+ gts[n] = [allele2, allele1]\n+ else:\n+ gts[n] = [allele1, allele2]\n+ if not gts:\n+ gts = [['', ''], ['', '']]\n+\n+ if cols[variant_type_idx].lower() in ['ins', 'del']:\n+ # transform VCF-style indel representations into MAF ones\n+ ref_allele = cols[ref_idx]\n+ for n, nucs in enumerate(\n+ zip(\n+ ref_allele,\n+ *[allele for gt in gts for allele in gt if allele]\n+ )\n+ ):\n+ if any(nuc != nucs[0] for nuc in nucs[1:]):\n+ break\n+ else:\n+ n += 1\n+ if n > 0:\n+ cols[ref_idx] = cols[ref_idx][n:] or '-'\n+ for gt in gts:\n+ for idx, allele in enumerate(gt):\n+ if allele:\n+ gt[idx] = allele[n:] or '-'\n+ if cols[ref_idx] == '-':\n+ n -= 1\n+ cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n)\n+\n+ # in-place substitution of so_effect with MAF effect\n+ cols[variant_classification_idx] = VariantEffect(\n+ cols[variant_type_idx]\n+ )[cols[variant_classification_idx]]\n+ ret_line = '\\t'.join([cols[n] for n in cols_to_print])\n+\n+ field_formatters = {\n+ 'tumor_seq_allele1': gts[0][0],\n+ 'tumor_seq_allele2': gts[0][1],\n+ 'match_norm_seq_allele1': gts[1][0],\n+ 'match_norm_seq_allele2': gts[1][1],\n+ 't_alt_count': gt_alt_depths[0],\n+ 'n_alt_count': gt_alt_depths[1],\n+ 't_ref_count': gt_ref_depths[0],\n+ 'n_ref_count': gt_ref_depths[1],\n+ }\n+\n+ print(\n+ # use safe_substitute here to avoid key errors with column content\n+ # looking like unknown placeholders\n+ string.Template(ret_line).safe_substitute(field_formatters)\n+ )\n"

diff -r 137a3e07062e -r 86e46972e183 test-data/gemini_query_as_maf_result.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gemini_query_as_maf_result.tabular Tue Mar 10 06:14:22 2020 -0400

@@ -0,0 +1,39 @@
+Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer HGVSp_Short t_alt_count t_ref_count n_alt_count n_ref_count
+unknown 37 1 10583 10583 5'Flank snp G A novel test Somatic p.=
+unknown 37 1 10611 10611 5'Flank snp C G novel test Somatic p.=
+unknown 37 1 13302 13302 RNA snp C T novel test Somatic p.=
+unknown 37 1 13327 13327 RNA snp G C novel test Somatic p.=
+unknown 37 1 13958 13958 RNA del C - novel test Somatic p.=
+unknown 37 1 13980 13980 RNA snp T C novel test Somatic p.=
+unknown 37 1 30923 30923 3'Flank snp G T novel test Somatic p.=
+unknown 37 1 46402 46402 IGR ins - TGT novel test Somatic p.=
+unknown 37 1 47190 47190 IGR ins - A novel test Somatic p.=
+unknown 37 1 51476 51476 IGR snp T C novel test Somatic p.=
+unknown 37 1 51479 51479 IGR snp T A novel test Somatic p.=
+unknown 37 1 51914 51914 IGR snp T G novel test Somatic p.=
+unknown 37 1 51935 51935 IGR snp C T novel test Somatic p.=
+unknown 37 1 51954 51954 IGR snp G C novel test Somatic p.=
+unknown 37 1 52058 52058 IGR snp G C novel test Somatic p.=
+unknown 37 1 52144 52144 IGR snp T A novel test Somatic p.=
+unknown 37 1 52186 52188 IGR del TAA - novel test Somatic p.=
+unknown 37 1 52238 52238 IGR snp T G novel test Somatic p.=
+unknown 37 1 53235 53236 IGR del AT - novel test Somatic p.=
+unknown 37 1 54353 54353 IGR snp C A novel test Somatic p.=
+unknown 37 1 54421 54421 IGR snp A G novel test Somatic p.=
+unknown 37 1 54490 54490 IGR snp G A novel test Somatic p.=
+unknown 37 1 54676 54676 IGR snp C T novel test Somatic p.=
+unknown 37 1 54753 54753 IGR snp T G novel test Somatic p.=
+unknown 37 1 55164 55164 IGR snp C A novel test Somatic p.=
+unknown 37 1 55249 55249 IGR ins - TATGG novel test Somatic p.=
+unknown 37 1 55299 55299 IGR snp C T novel test Somatic p.=
+unknown 37 1 55313 55313 IGR snp A T novel test Somatic p.=
+unknown 37 1 55326 55326 IGR snp T C novel test Somatic p.=
+unknown 37 1 55330 55330 IGR snp G A novel test Somatic p.=
+unknown 37 1 55367 55367 IGR snp G A novel test Somatic p.=
+unknown 37 1 55388 55388 IGR snp C T novel test Somatic p.=
+unknown 37 1 55394 55394 IGR snp T A novel test Somatic p.=
+unknown 37 1 55416 55416 IGR snp G A novel test Somatic p.=
+unknown 37 1 55427 55427 IGR snp T C novel test Somatic p.=
+unknown 37 1 55816 55816 IGR snp G A novel test Somatic p.=
+unknown 37 1 55850 55850 IGR snp C G novel test Somatic p.=
+unknown 37 1 55852 55852 IGR snp G C novel test Somatic p.=