Mercurial > repos > iuc > varscan_copynumber
changeset 2:4ce3441c407c draft
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 30867f1f022bed18ba1c3b8dc9c54226890b3a9c
author | iuc |
---|---|
date | Tue, 04 Dec 2018 05:15:15 -0500 |
parents | 8d8de66233ea |
children | 489642c0b543 |
files | macros.xml test-data/control_chrM.bam test-data/hg19_chrM.fa test-data/tumor_chrM.bam test-data/varscan_somatic_indel_result1.vcf test-data/varscan_somatic_indel_result2.vcf test-data/varscan_somatic_snp_result1.vcf test-data/varscan_somatic_snp_result2.vcf varscan.py varscan_copynumber.xml |
diffstat | 10 files changed, 1538 insertions(+), 125 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Sun Jul 15 09:19:48 2018 -0400 +++ b/macros.xml Tue Dec 04 05:15:15 2018 -0500 @@ -2,7 +2,6 @@ <xml name="requirements"> <requirements> <requirement type="package" version="@VERSION@">varscan</requirement> - <requirement type="package" version="4.2.1">gawk</requirement> <yield/> </requirements> </xml> @@ -20,6 +19,7 @@ <xml name="citations"> <citations> <citation type="doi">10.1101/gr.129684.111</citation> + <citation type="doi">10.1002/0471250953.bi1504s44</citation> </citations> </xml> @@ -52,29 +52,34 @@ </token> - <xml name="min_coverage"> + <xml name="min_coverage" token_help="Minimum depth at a position to make a call"> <param argument="--min-coverage" name="min_coverage" type="integer" value="8" min="1" max="200" - label="Minimum read depth" help="Minimum depth at a position to make a call"/> + label="Minimum coverage" help="@HELP@"/> </xml> <xml name="min_reads2"> <param argument="--min-reads2" name="min_reads2" type="integer" value="2" min="1" max="200" - label="Minimum supporting reads" help="Minimum supporting reads at a position to make a call"/> + label="Minimum supporting reads" help="Minimum number of variant-supporting reads at a position required to make a call"/> </xml> <xml name="min_avg_qual"> <param argument="--min-avg-qual" name="min_avg_qual" type="integer" value="15" min="1" max="50" - label="Minimum base quality at a position to count a read"/> + label="Minimum base quality" + help="The minimum base quality at the variant position required to use a read for calling" /> </xml> <xml name="min_var_freq" token_value="0.01"> <param argument="--min-var-freq" name="min_var_freq" type="float" value="@VALUE@" min="0" max="1" - label="Minimum variant allele frequency threshold"/> + label="Minimum variant allele frequency" + help="Minimum variant allele frequency required for calling a variant"/> </xml> <xml name="min_freq_for_hom"> <param argument="--min-freq-for-hom" name="min_freq_for_hom" type="float" value="0.75" min="0" max="1" - label="Minimum frequency to call homozygote"/> + label="Minimum homozygous variant allele frequency" + help="Minimum variant allele frequency required for calling a homozygous genotype" /> </xml> - <xml name="p_value" token_label="p-value threshold for calling variants" token_value="0.01"> - <param argument="--p-value" name="p_value" type="float" value="@VALUE@" min="0.0" max="1.0" - label="@LABEL@"/> + <xml name="p_value" token_value="0.01" + token_label="p-value threshold for calling variants" + token_help=""> + <param argument="--p-value" name="p_value" type="float" value="@VALUE@" min="0" max="1" + label="@LABEL@" help="@HELP@"/> </xml> <xml name="strand_filter"> <param name="strand_filter" type="select" label="Ignore variants with >90% support on one strand"> @@ -83,4 +88,12 @@ </param> </xml> + <token name="@HELP_HEADER@"><![CDATA[ +**VarScan Overview** + +VarScan_ performs variant detection for massively parallel sequencing data, such as exome, WGS, and transcriptome data. Full documentation of the command line package is available here_. + +.. _VarScan: http://dkoboldt.github.io/varscan/ +.. _here: http://dkoboldt.github.io/varscan/using-varscan.html + ]]></token> </macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/hg19_chrM.fa Tue Dec 04 05:15:15 2018 -0500 @@ -0,0 +1,333 @@ +>chrM +GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT +TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG +GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT +CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA +AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAAT +GTCTGCACAGCCGCTTTCCACACAGACATCATAACAAAAAATTTCCACCA +AACCCCCCCCTCCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGC +CAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAAT +TTTATCTTTAGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACA +TTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCC +GCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAAC +CAACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCA +AAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAA +ATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGC +AAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAG +GGACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCC +ACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGT +TTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACC +GCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTT +TAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAA +CTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAAC +ACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGC +CCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGA +GCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGA +GGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGC +TCAGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGT +AAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGG +TGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTT +ATGAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTGAGAGTAGAGT +GCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCC +TCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATA +GAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACG +AACCAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTT +CAACTTAACTTGACCGCTCTGAGCTAAACCTAGCCCCAAACCCACTCCAC +CTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGG +CGATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGA +TGAAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTT +CTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAA +GACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCC +GTCTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCT +ACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTT +TAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAG +TCCAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGA +GTAAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGA +AAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACT +GAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAA +TGTTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCA +GATCAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCA +ACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAG +GAAAGGTTAAAAAAAGTAAAAGGAACTCGGCAAACCTTACCCCGCCTGTT +TACCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCC +CAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAaaggtagca +taatcacttgttccttaaatagggacctgtatgaatggctccacgagggt +tcagctgtctcttacttttaaccagtgaaattgacctgcccgtgaagagg +cgggcatgacacagcaagacgagaagaccctatggagctttaatttaTTA +ATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCA +TTAAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCA +GTACATGCTAAGACTTCACCAGTCAAAGCGAACTACTATACTCAATTGAT +CCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATC +CTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGAT +CAGGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGA +TTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTT +CTATCTACTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCC +TACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTAT +TATACCCACACCCACCCAAGAACAGGGTTTgttaagatggcagagcccgg +taatcgcataaaacttaaaactttacagtcagaggttcaattcctcttct +taacaacaTACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAA +TCGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATA +CAACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACC +CTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCA +CATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATC +GCTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTCAACCT +CAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACT +CAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGC +GCACTGCGAGCAGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCAT +CATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCC +TTATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTG +GCCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTT +CGACCTTGCCGAAGGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAAT +ACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATT +ATTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGA +CGCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTAC +TTCTAACCTCCCTGTTCTTATGAATTCGAACAGCATACCCCCGATTCCGC +TACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCT +AGCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTC +CCCCTCAAACCTAAGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGT +AAATAATAGGAGCTTAAACCCCCTTATTTctaggactatgagaatcgaac +ccatccctgagaatccaaaattctccgtgccacctatcacaccccatcct +aAAGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTG +GTTATACCCTTCCCGTACTAATTAATCCCCTGGCCCAACCCGTCATCTAC +TCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATT +TTTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTC +TAACCAAAAAAATAAACCCTCGTTCCACAGAAGCTGCCATCAAGTATTTC +CTCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTATCCTCTTCAA +CAATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACT +CATCATTAATAATCATAATGGCTATAGCAATAAAACTAGGAATAGCCCCC +TTTCACTTCTGAGTCCCAGAGGTTACCCAAGGCACCCCTCTGACATCCGG +CCTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACC +AAATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTA +TCCATCATAGCAGGCAGTTGAGGTGGATTAAACCAAACCCAGCTACGCAA +AATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAATAGCAGTTC +TACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATC +CTAACTACTACCGCATTCCTACTACTCAACTTAAACTCCAGCACCACGAC +CCTACTACTATCTCGCACCTGAAACAAGCTAACATGACTAACACCCTTAA +TTCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTT +TTGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCAT +CATCCCCACCATCATAGCCACCATCACCCTCCTTAACCTCTACTTCTACC +TACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATATCTAACAAC +GTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCC +CACACTCATCGCCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATAC +TAATAATCTTATAGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAG +CCCTCAGTAAGTTGCAATACTTAATTTCTGCAACAGCTAAGGACTGCAAA +ACCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAA +GCCCTTACTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGC +TAAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCCGCCGGGAAA +AAAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCA +ATTCAATATGAAAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTG +TCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACCTCACCCCC +ACTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTG +GAACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCT +CTAAGCCTCCTTATTCGAGCCGAGCTGGGCCAGCCAGGCAACCTTCTAGG +TAACGACCACATCTACAACGTTATCGTCACAGCCCATGCATTTGTAATAA +TCTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTA +GTTCCCCTAATAATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAA +CATAAGCTTCTGACTCTTACCTCCCTCTCTCCTACTCCTGCTCGCATCTG +CTATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTA +GCAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTC +CTTACACCTAGCAGGTGTCTCCTCTATCTTAGGGGCCATCAATTTCATCA +CAACAATTATCAATATAAAACCCCCTGCCATAACCCAATACCAAACGCCC +CTCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCT +CCCAGTCCTAGCTGCTGGCATCACTATACTACTAACAGACCGCAACCTCA +ACACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCTATACCAA +CACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACC +AGGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAG +AACCATTTGGATACATAGGTATGGTCTGAGCTATGATATCAATTGGCTTC +CTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGGAATAGACGT +AGACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCA +CCGGCGTCAAAGTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATG +AAATGATCTGCTGCAGTGCTCTGAGCCCTAGGATTCATCTTTCTTTTCAC +CGTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCG +TACTACACGACACGTACTACGTTGTAGCTCACTTCCACTATGTCCTATCA +ATAGGAGCTGTATTTGCCATCATAGGAGGCTTCATTCACTGATTTCCCCT +ATTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCCATTTCACTA +TCATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGC +CTATCCGGAATGCCCCGACGTTACTCGGACTACCCCGATGCATACACCAC +ATGAAACATCCTATCATCTGTAGGCTCATTCATTTCTCTAACAGCAGTAA +TATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTC +CTAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCC +CCCACCCTACCACACATTCGAAGAACCCGTATACATAAAATCTAGACAaa +aaaggaaggaatcgaaccccccaaagctggtttcaagccaaccccatggc +ctccatgactttttcAAAAAGGTATTAGAAAAACCATTTCATAACTTTGT +CAAAGTTAAATTATAGGCTAAATCCTATATATCTTAATGGCACATGCAGC +GCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCA +CCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTC +CTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACAT +CTCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCA +TCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGAC +GAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTA +CTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACA +TACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTT +GACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTAC +ATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAA +CAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGA +CCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAG +TTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAG +GGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTG +TAAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAGAACCAAC +ACCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCA +TAATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATA +TTAAACACAAACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAA +AAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTC +ATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTC +TATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATCAACAACCGA +CTAATCACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGAT +AGCCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCT +TAATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCA +TTTACACCAACCACCCAACTATCTATAAACCTAGCCATGGCCATCCCCTT +ATGAGCGGGCGCAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATGCCC +TAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTA +GTTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGT +ACGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTG +GAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATC +ATCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTT +AATCCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACA +ACACATAATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCC +CATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTA +GCCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACT +AACCAACACACTAACCATATACCAATGGTGGCGCGATGTAACACGAGAAA +GCACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATAC +GGGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTT +CTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAACTAGGAGGGC +ACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTC +CTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCA +CCATAGTCTAATAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTA +TTACAATTTTACTGGGTCTCTATTTTACCCTCCTACAAGCCTCAGAGTAC +TTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTT +TGTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCC +TCACTATCTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACAT +CACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGTAGATGTGGT +TTGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTA +GTATAAATAGTACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAA +AAAAGAGTAATAAACTTCGCCTTAATTTTAATAATCAACACCCTCCTAGC +CTTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACA +TAGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCC +CGCGTCCCTTTCTCCATAAAATTCTTCTTAGTAGCTATTACCTTCTTATT +ATTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAGCCCTACAAA +CAACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATC +CTAGCCCTAAGTCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAGC +CGAATTGGTATATAGTTTAAACAAAACGAATGATTTCGACTCATTAAATT +ATGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTA +GCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCAT +ATCCTCCCTACTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAG +CTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAATATTGTGCCT +ATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCT +ACTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAA +ACCTACTCCAATGCTAAAACTAATCGTCCCAACAATTATATTACTACCAC +TGACATGACTTTCCAAAAAGCACATAATTTGAATCAACACAACCACCCAC +AGCCTAATTATTAGCATCATCCCCCTACTATTTTTTAACCAAATCAACAA +CAACCTATTTAGCTGTTCCCCAACCTTTTCCTCCGACCCCCTAACAACCC +CCCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATCATGGCAAGC +CAACGCCACTTATCCAGCGAACCACTATCACGAAAAAAACTCTACCTCTC +TATACTAATCTCCCTACAAATCTCCTTAATTATAACATTCACAGCCACAG +AACTAATCATATTTTATATCTTCTTCGAAACCACACTTATCCCCACCTTG +GCTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCAC +ATACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCAC +TAATTTACACTCACAACACCCTAGGCTCACTAAACATTCTACTACTCACT +CTCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTTAATATGACT +AGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACT +TATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTA +CTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATACGCCTCAC +ACTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTAC +TATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACA +GACCTAAAATCGCTCATTGCATACTCTTCAATCAGCCACATAGCCCTCGT +AGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCA +TTCTCATAATCGCCCACGGACTCACATCCTCATTACTATTCTGCCTAGCA +AACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGG +ACTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCC +TCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCT +GTGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGG +ACTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAA +CACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTC +ACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCT +ATCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTT +TAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTTACGACCCCTT +ATTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAA +CAACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTA +GGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACA +CTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACC +ACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATC +CATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCA +TGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACC +CAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAAT +ATTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCAC +TGTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTA +CTCATTTTCCTAATTACCATACTAATCTTAGTTACCGCTAACAACCTATT +CCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCA +TCAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCA +GTCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATG +ATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAA +ACGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCA +GGCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGA +AGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAG +CAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCA +CTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGC +AGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCT +CCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAA +CCACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCAT +ACTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAG +ATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACC +TCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGG +TTTCTACTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAA +ACGCCTGAGCCCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCC +TATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCC +CACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTA +AACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAAC +ATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACT +CACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCA +ACTACCTAACCAACAAACTTAAAATAAAATCCCCACTATGCACATTTTAT +TTCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCC +CTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACC +TAACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATC +TCCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTT +CCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAAC +CTATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGT +TCAACCAGTAACCACTACTAATCAACGCCCATAATCATACAAAGCCCCCG +CACCAATAGGATCCTCCCGAATCAACCCTGACCCCTCTCCTTCATAAATT +ATTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATA +CTCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAA +CACTCACCAAGACCTCAACCCCTGACCCCCATGCCTCAGGATACTCCTCA +ATAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCCCCCTAAATA +AATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAA +TAACACACCCGACCACACCGCTAACAATCAGTACTAAACCCCCATAAATA +GGAGAAGGCTTAGAAGAAAACCCCACAAACCCCATTACTAAACCCACACT +CAACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGA +CCAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATG +ACCCCAATACGCAAAATTAACCCCCTAATAAAATTAATTAACCACTCATT +CATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCT +CACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTA +GCCATACACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCA +CATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCA +ATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGC +CTATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTAT +CCTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGT +GAGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAACTTACTATCC +GCCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTA +CTCAGTAGACAGTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCT +TACCCTTCATTATTGCAGCCCTAGCAGCACTCCACCTCCTATTCTTGCAC +GAAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAAT +CACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCT +TCCTTCTCTCCTTAATGACATTAACACTATTCTCACCAGACCTCCTAGGC +GACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCCTCCCCACAT +CAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCC +CTAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTA +GCAATAATCCCCATCCTCCATATATCCAAACAACAAAGCATAATATTTCG +CCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTC +TAACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGA +CAAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAAC +TATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTCCTTGTAGTA +TAAACTAATACACCAGTCTTGTAAACCGGAGACGAAAACCTTTTTCCAAG +GACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAA +GATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAGATTTGGG +TACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTAC +ATTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCA +CCTGTAGTACATAAAAACCCAACCCACATCAAACCCCCCCCCCCCATGCT +TACAAGCAAGTACAGCAATCAACCTTCAACTATCACACATCAACTGCAAC +TCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTT +AACAGTACATAGTACATAAAGTCATTTACCGTACATAGCACATTACAGTC +AAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTG +ACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCT +CGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGAC +ATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGTTCCC +CTTAAATAAGACATCACGATG
--- a/test-data/varscan_somatic_indel_result1.vcf Sun Jul 15 09:19:48 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -##fileformat=VCFv4.1 -##source=VarScan2 -##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases"> -##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation"> -##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)"> -##INFO=<ID=SSC,Number=1,Type=String,Description="Somatic score in Phred scale (0-255) derived from somatic p-value"> -##INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls"> -##INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls"> -##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand"> -##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)"> -##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)"> -##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency"> -##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
--- a/test-data/varscan_somatic_indel_result2.vcf Sun Jul 15 09:19:48 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -##fileformat=VCFv4.1 -##source=VarScan2 -##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases"> -##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation"> -##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)"> -##INFO=<ID=SSC,Number=1,Type=String,Description="Somatic score in Phred scale (0-255) derived from somatic p-value"> -##INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls"> -##INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls"> -##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand"> -##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)"> -##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)"> -##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency"> -##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
--- a/test-data/varscan_somatic_snp_result1.vcf Sun Jul 15 09:19:48 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -##fileformat=VCFv4.1 -##source=VarScan2 -##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases"> -##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation"> -##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)"> -##INFO=<ID=SSC,Number=1,Type=String,Description="Somatic score in Phred scale (0-255) derived from somatic p-value"> -##INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls"> -##INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls"> -##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand"> -##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)"> -##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)"> -##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency"> -##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR -chr1 51436072 . C A . PASS DP=47;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.4681E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:26:26:0:0%:23,3,0,0 0/1:.:21:20:1:4.76%:19,1,1,0 -chr1 51436311 . T C . PASS DP=16;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.375E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:7:6:1:14.29%:0,6,0,1 -chr1 51436320 . G A . PASS DP=19;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.2632E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:10:9:1:10%:0,9,0,1 -chr1 51439628 . T C . str10 DP=237;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.8101E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:123:123:0:0%:77,46,0,0 0/1:.:114:113:1:0.88%:74,39,0,1 -chr1 51439638 . G A . str10 DP=234;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.9145E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:119:119:0:0%:72,47,0,0 0/1:.:115:114:1:0.87%:75,39,0,1 -chr1 51439665 . C T . PASS DP=226;SOMATIC;SS=2;SSC=9;GPV=1E0;SPV=1.2006E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:114:114:0:0%:56,58,0,0 0/1:.:112:109:3:2.68%:63,46,2,1 -chr1 51439671 . G A . str10 DP=222;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.045E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:110:110:0:0%:53,57,0,0 0/1:.:112:111:1:0.89%:59,52,1,0 -chr1 51439684 . G T . str10 DP=210;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.9524E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:106:106:0:0%:51,55,0,0 0/1:.:104:103:1:0.96%:53,50,1,0 -chr1 51439703 . C T . str10 DP=202;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.099E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:99:99:0:0%:46,53,0,0 0/1:.:103:102:1:0.97%:48,54,0,1 -chr1 51439705 . G T . str10 DP=204;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1961E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:98:98:0:0%:42,56,0,0 0/1:.:106:105:1:0.94%:48,57,0,1 -chr1 51439706 . G T . str10 DP=201;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1741E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:97:97:0:0%:41,56,0,0 0/1:.:104:103:1:0.96%:46,57,1,0 -chr1 51439726 . C G . str10 DP=187;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1872E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:90:90:0:0%:37,53,0,0 0/1:.:97:96:1:1.03%:44,52,1,0 -chr1 51439751 . C G . str10 DP=168;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.3293E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:79:78:0:0%:28,50,0,0 0/1:.:89:88:1:1.12%:35,53,0,1 -chr1 51439763 . G A . PASS DP=159;SOMATIC;SS=2;SSC=5;GPV=1E0;SPV=2.7092E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:76:76:0:0%:34,42,0,0 0/1:.:83:81:2:2.41%:32,49,1,1 -chr1 51439766 . G T . str10 DP=154;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1299E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:75:75:0:0%:34,41,0,0 0/1:.:79:78:1:1.27%:30,48,0,1 -chr1 51439788 . T C . str10 DP=136;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1471E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:66:66:0:0%:21,45,0,0 0/1:.:70:69:1:1.43%:24,45,1,0 -chr1 51439828 . G A . str10 DP=122;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.7377E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:52:52:0:0%:14,38,0,0 0/1:.:70:69:1:1.43%:27,42,0,1 -chr1 51439832 . C G . str10 DP=125;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.52E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:56:56:0:0%:14,42,0,0 0/1:.:69:68:1:1.45%:25,43,0,1 -chr1 51439876 . T G . str10 DP=105;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.619E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:46:46:0:0%:10,36,0,0 0/1:.:59:58:1:1.69%:23,35,0,1 -chr1 51439882 . G T . str10 DP=105;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.2381E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:50:50:0:0%:13,37,0,0 0/1:.:55:54:1:1.82%:22,32,0,1 -chr1 51439889 . G T . str10 DP=97;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.1546E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:47:47:0:0%:14,33,0,0 0/1:.:50:49:1:2%:21,28,0,1 -chr1 51439953 . G T . str10 DP=59;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.9322E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:24:24:0:0%:5,19,0,0 0/1:.:35:34:1:2.86%:7,27,0,1 -chr1 51440035 . G T . PASS DP=21;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=6.1905E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:8:8:0:0%:1,7,0,0 0/1:.:13:12:1:7.69%:1,11,0,1
--- a/test-data/varscan_somatic_snp_result2.vcf Sun Jul 15 09:19:48 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -##fileformat=VCFv4.1 -##source=VarScan2 -##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases"> -##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation"> -##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)"> -##INFO=<ID=SSC,Number=1,Type=String,Description="Somatic score in Phred scale (0-255) derived from somatic p-value"> -##INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls"> -##INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls"> -##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand"> -##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)"> -##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)"> -##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency"> -##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR -chr1 51436072 . C A . PASS DP=47;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.4681E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:26:26:0:0%:23,3,0,0 0/1:.:21:20:1:4.76%:19,1,1,0 -chr1 51436311 . T C . PASS DP=16;SOMATIC;SS=2;SSC=3;GPV=1E0;SPV=4.375E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:7:6:1:14.29%:0,6,0,1 -chr1 51436320 . G A . PASS DP=19;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=5.2632E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:10:9:1:10%:0,9,0,1 -chr1 51439665 . C T . PASS DP=226;SOMATIC;SS=2;SSC=9;GPV=1E0;SPV=1.2006E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:114:114:0:0%:56,58,0,0 0/1:.:112:109:3:2.68%:63,46,2,1 -chr1 51439763 . G A . PASS DP=159;SOMATIC;SS=2;SSC=5;GPV=1E0;SPV=2.7092E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:76:76:0:0%:34,42,0,0 0/1:.:83:81:2:2.41%:32,49,1,1 -chr1 51440025 . A C . PASS DP=27;SOMATIC;SS=2;SSC=1;GPV=1E0;SPV=6.6667E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:9:9:0:0%:1,8,0,0 0/1:.:18:17:1:5.56%:1,16,0,1 -chr1 51440035 . G T . PASS DP=21;SOMATIC;SS=2;SSC=2;GPV=1E0;SPV=6.1905E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:8:8:0:0%:1,7,0,0 0/1:.:13:12:1:7.69%:1,11,0,1 -chr1 51440056 . T G . PASS DP=22;SOMATIC;SS=2;SSC=1;GPV=1E0;SPV=6.3636E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/0:.:8:8:0:0%:1,7,0,0 0/1:.:14:13:1:7.14%:1,12,0,1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/varscan.py Tue Dec 04 05:15:15 2018 -0500 @@ -0,0 +1,1178 @@ +#!/usr/bin/env python3 +from __future__ import print_function + +import argparse +import io +import os +import subprocess +import sys +import tempfile +import time +from contextlib import ExitStack +from functools import partial +from threading import Thread + +import pysam + + +class VariantCallingError (RuntimeError): + """Exception class for issues with samtools and varscan subprocesses.""" + + def __init__(self, message=None, call='', error=''): + self.message = message + self.call = call.strip() + self.error = error.strip() + + def __str__(self): + if self.message is None: + return '' + if self.error: + msg_header = '"{0}" failed with:\n{1}\n\n'.format( + self.call, self.error + ) + else: + msg_header = '{0} failed.\n' + 'No further information about this error is available.\n\n'.format( + self.call + ) + return msg_header + self.message + + +class VarScanCaller (object): + def __init__(self, ref_genome, bam_input_files, + max_depth=None, + min_mapqual=None, min_basequal=None, + threads=1, verbose=False, quiet=True + ): + self.ref_genome = ref_genome + self.bam_input_files = bam_input_files + self.max_depth = max_depth + self.min_mapqual = min_mapqual + self.min_basequal = min_basequal + self.threads = threads + self.verbose = verbose + self.quiet = quiet + + with pysam.FastaFile(ref_genome) as ref_fa: + self.ref_contigs = ref_fa.references + self.ref_lengths = ref_fa.lengths + + self.pileup_engine = ['samtools', 'mpileup'] + self.varcall_engine = ['varscan', 'somatic'] + self.requires_stdout_redirect = False + self.TemporaryContigVCF = partial( + tempfile.NamedTemporaryFile, + mode='wb', suffix='', delete=False, dir=os.getcwd() + ) + self.tmpfiles = [] + + def _get_pysam_pileup_args(self): + param_dict = {} + if self.max_depth is not None: + param_dict['max_depth'] = self.max_depth + if self.min_mapqual is not None: + param_dict['min_mapping_quality'] = self.min_mapqual + if self.min_basequal is not None: + param_dict['min_base_quality'] = self.min_basequal + param_dict['compute_baq'] = False + param_dict['stepper'] = 'samtools' + return param_dict + + def varcall_parallel(self, normal_purity=None, tumor_purity=None, + min_coverage=None, + min_var_count=None, + min_var_freq=None, min_hom_freq=None, + p_value=None, somatic_p_value=None, + threads=None, verbose=None, quiet=None + ): + if not threads: + threads = self.threads + if verbose is None: + verbose = self.verbose + if quiet is None: + quiet = self.quiet + # mapping of method parameters to varcall engine command line options + varcall_engine_option_mapping = [ + ('--normal-purity', normal_purity), + ('--tumor-purity', tumor_purity), + ('--min-coverage', min_coverage), + ('--min-reads2', min_var_count), + ('--min-var-freq', min_var_freq), + ('--min-freq-for-hom', min_hom_freq), + ('--p-value', p_value), + ('--somatic-p-value', somatic_p_value), + ('--min-avg-qual', self.min_basequal) + ] + varcall_engine_options = [] + for option, value in varcall_engine_option_mapping: + if value is not None: + varcall_engine_options += [option, str(value)] + pileup_engine_options = ['-B'] + if self.max_depth is not None: + pileup_engine_options += ['-d', str(self.max_depth)] + if self.min_mapqual is not None: + pileup_engine_options += ['-q', str(self.min_mapqual)] + if self.min_basequal is not None: + pileup_engine_options += ['-Q', str(self.min_basequal)] + + # Create a tuple of calls to samtools mpileup and varscan for + # each contig. The contig name is stored as the third element of + # that tuple. + # The calls are stored in the reverse order of the contig list so + # that they can be popped off later in the original order + calls = [( + self.pileup_engine + pileup_engine_options + [ + '-r', contig + ':', + '-f', self.ref_genome + ] + self.bam_input_files, + self.varcall_engine + [ + '-', '{out}', '--output-vcf', '1', '--mpileup', '1' + ] + varcall_engine_options, + contig + ) for contig in self.ref_contigs[::-1]] + + if verbose: + print('Starting variant calling ..') + + # launch subprocesses and monitor their status + subprocesses = [] + error_table = {} + tmp_io_started = [] + tmp_io_finished = [] + self.tmpfiles = [] + + def enqueue_stderr_output(out, stderr_buffer): + for line in iter(out.readline, b''): + # Eventually we are going to print the contents of + # the stderr_buffer to sys.stderr so we can + # decode things here using its encoding. + # We do a 'backslashreplace' just to be on the safe side. + stderr_buffer.write(line.decode(sys.stderr.encoding, + 'backslashreplace')) + out.close() + + try: + while subprocesses or calls: + while calls and len(subprocesses) < threads: + # There are still calls waiting for execution and we + # have unoccupied threads so we launch a new combined + # call to samtools mpileup and the variant caller. + + # pop the call arguments from our call stack + call = calls.pop() + # get the name of the contig that this call is going + # to work on + contig = call[2] + # Based on the contig name, generate a readable and + # file system-compatible prefix and use it to create + # a named temporary file, to which the call output + # will be redirected. + # At the moment we create the output file we add it to + # the list of all temporary output files so that we can + # remove it eventually during cleanup. + call_out = self.TemporaryContigVCF( + prefix=''.join( + c if c.isalnum() else '_' for c in contig + ) + '_', + ) + # maintain a list of variant call outputs + # in the order the subprocesses got launched + tmp_io_started.append(call_out.name) + if self.requires_stdout_redirect: + # redirect stdout to the temporary file just created + stdout_p2 = call_out + stderr_p2 = subprocess.PIPE + else: + # variant caller wants to write output to file directly + stdout_p2 = subprocess.PIPE + stderr_p2 = subprocess.STDOUT + call[1][call[1].index('{out}')] = call_out.name + call_out.close() + # for reporting purposes, join the arguments for the + # samtools and the variant caller calls into readable + # strings + c_str = (' '.join(call[0]), ' '.join(call[1])) + error_table[c_str] = [io.StringIO(), io.StringIO()] + # start the subprocesses + p1 = subprocess.Popen( + call[0], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + p2 = subprocess.Popen( + call[1], + stdin=p1.stdout, + stdout=stdout_p2, + stderr=stderr_p2 + ) + # subprocess bookkeeping + subprocesses.append((c_str, p1, p2, call_out, contig)) + # make sure our newly launched call does not block + # because its buffers fill up + p1.stdout.close() + t1 = Thread( + target=enqueue_stderr_output, + args=(p1.stderr, error_table[c_str][0]) + ) + t2 = Thread( + target=enqueue_stderr_output, + args=( + p2.stderr + if self.requires_stdout_redirect else + p2.stdout, + error_table[c_str][1] + ) + ) + t1.daemon = t2.daemon = True + t1.start() + t2.start() + + if verbose: + print( + 'Calling variants for contig: {0}' + .format(call[2]) + ) + + # monitor all running calls to see if any of them are done + for call, p1, p2, ofo, contig in subprocesses: + p1_stat = p1.poll() + p2_stat = p2.poll() + if p1_stat is not None or p2_stat is not None: + # There is an outcome for this process! + # Lets see what it is + if p1_stat or p2_stat: + print() + print( + error_table[call][0].getvalue(), + error_table[call][1].getvalue(), + file=sys.stderr + ) + raise VariantCallingError( + 'Variant Calling for contig {0} failed.' + .format(contig), + call='{0} | {1}'.format(call[0], call[1]) + ) + if p1_stat == 0 and p2_stat is None: + # VarScan is not handling the no output from + # samtools mpileup situation correctly so maybe + # that's the issue here + last_words = error_table[call][1].getvalue( + ).splitlines()[-4:] + if len(last_words) < 4 or any( + not msg.startswith('Input stream not ready') + for msg in last_words + ): + break + # lets give this process a bit more time + # VarScan is waiting for input it will never + # get, stop it. + p2.terminate() + subprocesses.remove((call, p1, p2, ofo, contig)) + ofo.close() + break + if p2_stat == 0: + # Things went fine. + # maintain a list of variant call outputs + # that finished successfully (in the order + # they finished) + tmp_io_finished.append(ofo.name) + if verbose: + print() + print('Contig {0} finished.'.format(contig)) + if not quiet: + print() + print( + 'stderr output from samtools mpileup/' + 'bcftools:'.upper(), + file=sys.stderr + ) + print( + error_table[call][0].getvalue(), + error_table[call][1].getvalue(), + file=sys.stderr + ) + # Discard the collected stderr output from + # the call, remove the call from the list of + # running calls and close its output file. + del error_table[call] + subprocesses.remove((call, p1, p2, ofo, contig)) + # Closing the output file is important or we + # may hit the file system limit for open files + # if there are lots of contigs. + ofo.close() + break + # wait a bit in between monitoring cycles + time.sleep(2) + finally: + for call, p1, p2, ofo, contig in subprocesses: + # make sure we do not leave running subprocesses behind + for proc in (p1, p2): + try: + proc.terminate() + except Exception: + pass + # close currently open files + ofo.close() + # store the files with finished content in the order that + # the corresponding jobs were launched + self.tmpfiles = [f for f in tmp_io_started if f in tmp_io_finished] + # Make sure remaining buffered stderr output of + # subprocesses does not get lost. + # Currently, we don't screen this output for real errors, + # but simply rewrite everything. + if not quiet and error_table: + print() + print( + 'stderr output from samtools mpileup/bcftools:'.upper(), + file=sys.stderr + ) + for call, errors in error_table.items(): + print(' | '.join(call), ':', file=sys.stderr) + print('-' * 20, file=sys.stderr) + print('samtools mpileup output:', file=sys.stderr) + print(errors[0].getvalue(), file=sys.stderr) + print('varscan somatic output:', file=sys.stderr) + print(errors[1].getvalue(), file=sys.stderr) + + def _add_ref_contigs_to_header(self, header): + for chrom, length in zip(self.ref_contigs, self.ref_lengths): + header.add_meta( + 'contig', + items=[('ID', chrom), ('length', length)] + ) + + def _add_filters_to_header(self, header): + varscan_fpfilters = { + 'VarCount': 'Fewer than {min_var_count2} variant-supporting reads', + 'VarFreq': 'Variant allele frequency below {min_var_freq2}', + 'VarAvgRL': + 'Average clipped length of variant-supporting reads < ' + '{min_var_len}', + 'VarReadPos': 'Relative average read position < {min_var_readpos}', + 'VarDist3': + 'Average distance to effective 3\' end < {min_var_dist3}', + 'VarMMQS': + 'Average mismatch quality sum for variant reads > ' + '{max_var_mmqs}', + 'VarMapQual': + 'Average mapping quality of variant reads < {min_var_mapqual}', + 'VarBaseQual': + 'Average base quality of variant reads < {min_var_basequal}', + 'Strand': + 'Strand representation of variant reads < {min_strandedness}', + 'RefAvgRL': + 'Average clipped length of ref-supporting reads < ' + '{min_ref_len}', + 'RefReadPos': + 'Relative average read position < {min_ref_readpos}', + 'RefDist3': + 'Average distance to effective 3\' end < {min_ref_dist3}', + 'RefMapQual': + 'Average mapping quality of reference reads < ' + '{min_ref_mapqual}', + 'RefBaseQual': + 'Average base quality of ref-supporting reads < ' + '{min_ref_basequal}', + 'RefMMQS': + 'Average mismatch quality sum for ref-supporting reads > ' + '{max_ref_mmqs}', + 'MMQSdiff': + 'Mismatch quality sum difference (var - ref) > ' + '{max_mmqs_diff}', + 'MinMMQSdiff': + 'Mismatch quality sum difference (var - ref) < ' + '{max_mmqs_diff}', + 'MapQualDiff': + 'Mapping quality difference (ref - var) > {max_mapqual_diff}', + 'MaxBAQdiff': + 'Average base quality difference (ref - var) > ' + '{max_basequal_diff}', + 'ReadLenDiff': + 'Average supporting read length difference (ref - var) > ' + '{max_relative_len_diff}', + } + for filter_id, description in varscan_fpfilters.items(): + header.filters.add(filter_id, None, None, description) + + def _add_indel_info_flag_to_header(self, header): + header.info.add( + 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' + ) + + def _compile_common_header(self, varcall_template, no_filters=False): + # fix the header generated by VarScan + # by adding reference and contig information + common_header = pysam.VariantHeader() + common_header.add_meta('reference', value=self.ref_genome) + self._add_ref_contigs_to_header(common_header) + if not no_filters: + # add filter info + self._add_filters_to_header(common_header) + # change the source information + common_header.add_meta('source', value='varscan.py') + # declare an INDEL flag for record INFO fields + self._add_indel_info_flag_to_header(common_header) + # take the remaining metadata from the template header produced by + # VarScan + with pysam.VariantFile(varcall_template, 'r') as original_data: + varscan_header = original_data.header + for sample in varscan_header.samples: + common_header.samples.add(sample) + common_header.merge(varscan_header) + return common_header + + def pileup_masker(self, mask): + def apply_mask_on_pileup(piled_items): + for item, status in zip(piled_items, mask): + if status: + yield item + return apply_mask_on_pileup + + def get_allele_specific_pileup_column_stats( + self, allele, pile_column, ref_fetch + ): + # number of reads supporting the given allele on + # forward and reverse strand, and in total + var_reads_plus = var_reads_minus = 0 + var_supp_read_mask = [] + for base in pile_column.get_query_sequences(): + if base == allele: + # allele supporting read on + strand + var_reads_plus += 1 + var_supp_read_mask.append(True) + elif base.upper() == allele: + # allele supporting read on - strand + var_reads_minus += 1 + var_supp_read_mask.append(True) + else: + var_supp_read_mask.append(False) + var_reads_total = var_reads_plus + var_reads_minus + + if var_reads_total == 0: + # No stats without reads! + return None + + var_supp_only = self.pileup_masker(var_supp_read_mask) + + # average mapping quality of the reads supporting the + # given allele + avg_mapping_quality = sum( + mq for mq in var_supp_only( + pile_column.get_mapping_qualities() + ) + ) / var_reads_total + + # for the remaining stats we need access to complete + # read information + piled_reads = [ + p for p in var_supp_only(pile_column.pileups) + ] + assert len(piled_reads) == var_reads_total + sum_avg_base_qualities = 0 + sum_dist_from_center = 0 + sum_dist_from_3prime = 0 + sum_clipped_length = 0 + sum_unclipped_length = 0 + sum_num_mismatches_as_fraction = 0 + sum_mismatch_qualities = 0 + + for p in piled_reads: + sum_avg_base_qualities += sum( + p.alignment.query_qualities + ) / p.alignment.infer_query_length() + sum_clipped_length += p.alignment.query_alignment_length + unclipped_length = p.alignment.infer_read_length() + sum_unclipped_length += unclipped_length + read_center = p.alignment.query_alignment_length / 2 + sum_dist_from_center += 1 - abs( + p.query_position - read_center + ) / read_center + if p.alignment.is_reverse: + sum_dist_from_3prime += p.query_position / unclipped_length + else: + sum_dist_from_3prime += 1 - p.query_position / unclipped_length + + sum_num_mismatches = 0 + for qpos, rpos in p.alignment.get_aligned_pairs(): + if qpos is not None and rpos is not None: + if p.alignment.query_sequence[qpos] != ref_fetch( + rpos, rpos + 1 + ).upper(): # ref bases can be lowercase! + sum_num_mismatches += 1 + sum_mismatch_qualities += p.alignment.query_qualities[ + qpos + ] + sum_num_mismatches_as_fraction += ( + sum_num_mismatches / p.alignment.query_alignment_length + ) + avg_basequality = sum_avg_base_qualities / var_reads_total + avg_pos_as_fraction = sum_dist_from_center / var_reads_total + avg_num_mismatches_as_fraction = ( + sum_num_mismatches_as_fraction / var_reads_total + ) + avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total + avg_clipped_length = sum_clipped_length / var_reads_total + avg_distance_to_effective_3p_end = ( + sum_dist_from_3prime / var_reads_total + ) + + return ( + avg_mapping_quality, + avg_basequality, + var_reads_plus, + var_reads_minus, + avg_pos_as_fraction, + avg_num_mismatches_as_fraction, + avg_sum_mismatch_qualities, + avg_clipped_length, + avg_distance_to_effective_3p_end + ) + + def _postprocess_variant_records(self, invcf, + min_var_count2, min_var_count2_lc, + min_var_freq2, max_somatic_p, + max_somatic_p_depth, + min_ref_readpos, min_var_readpos, + min_ref_dist3, min_var_dist3, + min_ref_len, min_var_len, + max_relative_len_diff, + min_strandedness, min_strand_reads, + min_ref_basequal, min_var_basequal, + max_basequal_diff, + min_ref_mapqual, min_var_mapqual, + max_mapqual_diff, + max_ref_mmqs, max_var_mmqs, + min_mmqs_diff, max_mmqs_diff, + **args): + # set FILTER field according to Varscan criteria + # multiple FILTER entries must be separated by semicolons + # No filters applied should be indicated with MISSING + + # since posterior filters are always applied to just one sample, + # a better place to store the info is in the FT genotype field: + # can be PASS, '.' to indicate that filters have not been applied, + # or a semicolon-separated list of filters that failed + # unfortunately, gemini does not support this field + + with ExitStack() as io_stack: + normal_reads, tumor_reads = ( + io_stack.enter_context( + pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files + ) + refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) + pileup_args = self._get_pysam_pileup_args() + for record in invcf: + if any(len(allele) > 1 for allele in record.alleles): + # skip indel postprocessing for the moment + yield record + continue + # get pileup for genomic region affected by this variant + if record.info['SS'] == '2': + # a somatic variant => generate pileup from tumor data + pile = tumor_reads.pileup( + record.chrom, record.start, record.stop, + **pileup_args + ) + sample_of_interest = 'TUMOR' + elif record.info['SS'] in ['1', '3']: + # a germline or LOH variant => pileup from normal data + pile = normal_reads.pileup( + record.chrom, record.start, record.stop, + **pileup_args + ) + sample_of_interest = 'NORMAL' + else: + # TO DO: figure out if there is anything interesting to do + # for SS status codes 0 (reference) and 5 (unknown) + yield record + continue + + # apply false-positive filtering a la varscan fpfilter + # find the variant site in the pileup columns + for pile_column in pile: + if pile_column.reference_pos == record.start: + break + # extract required information + # overall read depth at the site + read_depth = pile_column.get_num_aligned() + assert read_depth > 0 + # no multiallelic sites in varscan + assert len(record.alleles) == 2 + if record.samples[sample_of_interest]['RD'] > 0: + ref_stats, alt_stats = [ + self.get_allele_specific_pileup_column_stats( + allele, + pile_column, + partial( + pysam.FastaFile.fetch, refseq, record.chrom + ) + ) + for allele in record.alleles + ] + else: + ref_stats = None + alt_stats = self.get_allele_specific_pileup_column_stats( + record.alleles[1], + pile_column, + partial(pysam.FastaFile.fetch, refseq, record.chrom) + ) + ref_count = 0 + if ref_stats: + ref_count = ref_stats[2] + ref_stats[3] + if ref_stats[1] < min_ref_basequal: + record.filter.add('RefBaseQual') + if ref_count >= 2: + if ref_stats[0] < min_ref_mapqual: + record.filter.add('RefMapQual') + if ref_stats[4] < min_ref_readpos: + record.filter.add('RefReadPos') + # ref_stats[5] (avg_num_mismatches_as_fraction + # is not a filter criterion in VarScan fpfilter + if ref_stats[6] > max_ref_mmqs: + record.filter.add('RefMMQS') + if ref_stats[7] < min_ref_len: + # VarScan fpfilter does not apply this filter + # for indels, but there is no reason + # not to do it. + record.filter.add('RefAvgRL') + if ref_stats[8] < min_ref_dist3: + record.filter.add('RefDist3') + if alt_stats: + alt_count = alt_stats[2] + alt_stats[3] + if ( + alt_count < min_var_count2_lc + ) or ( + read_depth >= max_somatic_p_depth and + alt_count < min_var_count2 + ): + record.filter.add('VarCount') + if alt_count / read_depth < min_var_freq2: + record.filter.add('VarFreq') + if alt_stats[1] < min_var_basequal: + record.filter.add('VarBaseQual') + if alt_count > min_strand_reads: + if ( + alt_stats[2] / alt_count < min_strandedness + ) or ( + alt_stats[3] / alt_count < min_strandedness + ): + record.filter.add('Strand') + if alt_stats[2] + alt_stats[3] >= 2: + if alt_stats[0] < min_var_mapqual: + record.filter.add('VarMapQual') + if alt_stats[4] < min_var_readpos: + record.filter.add('VarReadPos') + # alt_stats[5] (avg_num_mismatches_as_fraction + # is not a filter criterion in VarScan fpfilter + if alt_stats[6] > max_var_mmqs: + record.filter.add('VarMMQS') + if alt_stats[7] < min_var_len: + # VarScan fpfilter does not apply this filter + # for indels, but there is no reason + # not to do it. + record.filter.add('VarAvgRL') + if alt_stats[8] < min_var_dist3: + record.filter.add('VarDist3') + if ref_count >= 2 and alt_count >= 2: + if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff: + record.filter.add('MapQualDiff') + if (ref_stats[1] - alt_stats[1]) > max_basequal_diff: + record.filter.add('MaxBAQdiff') + mmqs_diff = alt_stats[6] - ref_stats[6] + if mmqs_diff < min_mmqs_diff: + record.filter.add('MinMMQSdiff') + if mmqs_diff > max_mmqs_diff: + record.filter.add('MMQSdiff') + if ( + 1 - alt_stats[7] / ref_stats[7] + ) > max_relative_len_diff: + record.filter.add('ReadLenDiff') + else: + # No variant-supporting reads for this record! + # This can happen in rare cases because of + # samtools mpileup issues, but indicates a + # rather unreliable variant call. + record.filter.add('VarCount') + record.filter.add('VarFreq') + yield record + + def _indel_flagged_records(self, vcf): + for record in vcf: + record.info['INDEL'] = True + yield record + + def _merge_generator(self, vcf1, vcf2): + try: + record1 = next(vcf1) + except StopIteration: + for record2 in vcf2: + yield record2 + return + try: + record2 = next(vcf2) + except StopIteration: + yield record1 + for record1 in vcf1: + yield record1 + return + while True: + if (record1.start, record1.stop) < (record2.start, record2.stop): + yield record1 + try: + record1 = next(vcf1) + except StopIteration: + yield record2 + for record2 in vcf2: + yield record2 + return + else: + yield record2 + try: + record2 = next(vcf2) + except StopIteration: + yield record1 + for record1 in vcf1: + yield record1 + return + + def merge_and_postprocess(self, snps_out, indels_out=None, + no_filters=False, **filter_args): + temporary_data = self.tmpfiles + self.tmpfiles = [] + temporary_snp_files = [f + '.snp.vcf' for f in temporary_data] + temporary_indel_files = [f + '.indel.vcf' for f in temporary_data] + + for f in temporary_data: + try: + os.remove(f) + except Exception: + pass + + def noop_gen(data, **kwargs): + for d in data: + yield d + + if no_filters: + apply_filters = noop_gen + else: + apply_filters = self._postprocess_variant_records + + output_header = self._compile_common_header( + temporary_snp_files[0], + no_filters + ) + if indels_out is None: + with open(snps_out, 'w') as o: + o.write(str(output_header).format(**filter_args)) + for snp_f, indel_f in zip( + temporary_snp_files, temporary_indel_files + ): + with pysam.VariantFile(snp_f, 'r') as snp_invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing contig + # info + self._add_ref_contigs_to_header(snp_invcf.header) + self._add_filters_to_header(snp_invcf.header) + self._add_indel_info_flag_to_header(snp_invcf.header) + with pysam.VariantFile(indel_f, 'r') as indel_invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info + self._add_ref_contigs_to_header(indel_invcf.header) + self._add_filters_to_header(indel_invcf.header) + self._add_indel_info_flag_to_header( + indel_invcf.header + ) + for record in apply_filters( + self._merge_generator( + snp_invcf, + self._indel_flagged_records(indel_invcf) + ), + **filter_args + ): + o.write(str(record)) + try: + os.remove(snp_f) + except Exception: + pass + try: + os.remove(indel_f) + except Exception: + pass + + else: + with open(snps_out, 'w') as o: + o.write(str(output_header).format(**filter_args)) + for f in temporary_snp_files: + with pysam.VariantFile(f, 'r') as invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info and errors because of undeclared + # filters + self._add_ref_contigs_to_header(invcf.header) + self._add_filters_to_header(invcf.header) + for record in apply_filters( + invcf, **filter_args + ): + o.write(str(record)) + try: + os.remove(f) + except Exception: + pass + with open(indels_out, 'w') as o: + o.write(str(output_header)) + for f in temporary_indel_files: + with pysam.VariantFile(f, 'r') as invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info and errors because of undeclared + # filters + self._add_ref_contigs_to_header(invcf.header) + self._add_filters_to_header(invcf.header) + self._add_indel_info_flag_to_header(invcf.header) + for record in apply_filters( + self._indel_flagged_records(invcf), **filter_args + ): + o.write(str(record)) + try: + os.remove(f) + except Exception: + pass + + +def varscan_call(ref_genome, normal, tumor, output_path, **args): + """Preparse arguments and orchestrate calling and postprocessing.""" + + if args.pop('split_output'): + if '%T' in output_path: + out = ( + output_path.replace('%T', 'snp'), + output_path.replace('%T', 'indel') + ) + else: + out = ( + output_path + '.snp', + output_path + '.indel' + ) + else: + out = (output_path, None) + + instance_args = { + k: args.pop(k) for k in [ + 'max_depth', + 'min_mapqual', + 'min_basequal', + 'threads', + 'verbose', + 'quiet' + ] + } + varscan_somatic_args = { + k: args.pop(k) for k in [ + 'normal_purity', + 'tumor_purity', + 'min_coverage', + 'min_var_count', + 'min_var_freq', + 'min_hom_freq', + 'somatic_p_value', + 'p_value' + ] + } + + v = VarScanCaller(ref_genome, [normal, tumor], **instance_args) + v.varcall_parallel(**varscan_somatic_args) + v.merge_and_postprocess(*out, **args) + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + 'ref_genome', + metavar='reference_genome', + help='the reference genome (in fasta format)' + ) + p.add_argument( + '--normal', + metavar='BAM_file', required=True, + help='the BAM input file of aligned reads from the normal sample' + ) + p.add_argument( + '--tumor', + metavar='BAM_file', required=True, + help='the BAM input file of aligned reads from the tumor sample' + ) + p.add_argument( + '-o', '--ofile', required=True, + metavar='OFILE', dest='output_path', + help='Name of the variant output file. With --split-output, the name ' + 'may use the %%T replacement token or will be used as the ' + 'basename for the two output files to be generated (see ' + '-s|--split-output below).' + ) + p.add_argument( + '-s', '--split-output', + dest='split_output', action='store_true', default=False, + help='indicate that separate output files for SNPs and indels ' + 'should be generated (original VarScan behavior). If specified, ' + '%%T in the --ofile file name will be replaced with "snp" and ' + '"indel" to generate the names of the SNP and indel output ' + 'files, respectively. If %%T is not found in the file name, it ' + 'will get interpreted as a basename to which ".snp"/".indel" ' + 'will be appended.' + ) + p.add_argument( + '-t', '--threads', + type=int, default=1, + help='level of parallelism' + ) + p.add_argument( + '-v', '--verbose', + action='store_true', + help='be verbose about progress' + ) + p.add_argument( + '-q', '--quiet', + action='store_true', + help='suppress output from wrapped tools' + ) + call_group = p.add_argument_group('Variant calling parameters') + call_group.add_argument( + '--normal-purity', + dest='normal_purity', type=float, + default=1.0, + help='Estimated purity of the normal sample (default: 1.0)' + ) + call_group.add_argument( + '--tumor-purity', + dest='tumor_purity', type=float, + default=1.0, + help='Estimated purity of the tumor sample (default: 1.0)' + ) + call_group.add_argument( + '--max-pileup-depth', + dest='max_depth', type=int, default=8000, + help='Maximum depth of generated pileups (samtools mpileup -d option; ' + 'default: 8000)' + ) + call_group.add_argument( + '--min-basequal', + dest='min_basequal', type=int, + default=13, + help='Minimum base quality at the variant position to use a read ' + '(default: 13)' + ) + call_group.add_argument( + '--min-mapqual', + dest='min_mapqual', type=int, + default=0, + help='Minimum mapping quality required to use a read ' + '(default: 0)' + ) + call_group.add_argument( + '--min-coverage', + dest='min_coverage', type=int, + default=8, + help='Minimum site coverage required in the normal and in the tumor ' + 'sample to call a variant (default: 8)' + ) + call_group.add_argument( + '--min-var-count', + dest='min_var_count', type=int, + default=2, + help='Minimum number of variant-supporting reads required to call a ' + 'variant (default: 2)' + ) + call_group.add_argument( + '--min-var-freq', + dest='min_var_freq', type=float, + default=0.1, + help='Minimum variant allele frequency for calling (default: 0.1)' + ) + call_group.add_argument( + '--min-hom-freq', + dest='min_hom_freq', type=float, + default=0.75, + help='Minimum variant allele frequency for homozygous call ' + '(default: 0.75)' + ) + call_group.add_argument( + '--p-value', + dest='p_value', type=float, + default=0.99, + help='P-value threshold for heterozygous call (default: 0.99)' + ) + call_group.add_argument( + '--somatic-p-value', + dest='somatic_p_value', type=float, + default=0.05, + help='P-value threshold for somatic call (default: 0.05)' + ) + filter_group = p.add_argument_group('Posterior variant filter parameters') + filter_group.add_argument( + '--no-filters', + dest='no_filters', action='store_true', + help='Disable all posterior variant filters. ' + 'If specified, all following options will be ignored' + ) + filter_group.add_argument( + '--min-var-count2', + dest='min_var_count2', type=int, + default=4, + help='Minimum number of variant-supporting reads (default: 4)' + ) + filter_group.add_argument( + '--min-var-count2-lc', + dest='min_var_count2_lc', type=int, + default=2, + help='Minimum number of variant-supporting reads when depth below ' + '--somatic-p-depth (default: 2)' + ) + filter_group.add_argument( + '--min-var-freq2', + dest='min_var_freq2', type=float, + default=0.05, + help='Minimum variant allele frequency (default: 0.05)' + ) + filter_group.add_argument( + '--max-somatic-p', + dest='max_somatic_p', type=float, + default=0.05, + help='Maximum somatic p-value (default: 0.05)' + ) + filter_group.add_argument( + '--max-somatic-p-depth', + dest='max_somatic_p_depth', type=int, + default=10, + help='Depth required to run --max-somatic-p filter (default: 10)' + ) + filter_group.add_argument( + '--min-ref-readpos', + dest='min_ref_readpos', type=float, + default=0.1, + help='Minimum average relative distance of site from the ends of ' + 'ref-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-var-readpos', + dest='min_var_readpos', type=float, + default=0.1, + help='Minimum average relative distance of site from the ends of ' + 'variant-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-ref-dist3', + dest='min_ref_dist3', type=float, + default=0.1, + help='Minimum average relative distance of site from the effective ' + '3\'end of ref-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-var-dist3', + dest='min_var_dist3', type=float, + default=0.1, + help='Minimum average relative distance of site from the effective ' + '3\'end of variant-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-ref-len', + dest='min_ref_len', type=int, + default=90, + help='Minimum average trimmed length of reads supporting the ref ' + 'allele (default: 90)' + ) + filter_group.add_argument( + '--min-var-len', + dest='min_var_len', type=int, + default=90, + help='Minimum average trimmed length of reads supporting the variant ' + 'allele (default: 90)' + ) + filter_group.add_argument( + '--max-len-diff', + dest='max_relative_len_diff', type=float, + default=0.25, + help='Maximum average relative read length difference (ref - var; ' + 'default: 0.25)' + ) + filter_group.add_argument( + '--min-strandedness', + dest='min_strandedness', type=float, + default=0.01, + help='Minimum fraction of variant reads from each strand ' + '(default: 0.01)' + ) + filter_group.add_argument( + '--min-strand-reads', + dest='min_strand_reads', type=int, + default=5, + help='Minimum allele depth required to run --min-strandedness filter ' + '(default: 5)' + ) + filter_group.add_argument( + '--min-ref-basequal', + dest='min_ref_basequal', type=int, + default=15, + help='Minimum average base quality for the ref allele (default: 15)' + ) + filter_group.add_argument( + '--min-var-basequal', + dest='min_var_basequal', type=int, + default=15, + help='Minimum average base quality for the variant allele ' + '(default: 15)' + ) + filter_group.add_argument( + '--max-basequal-diff', + dest='max_basequal_diff', type=int, + default=50, + help='Maximum average base quality diff (ref - var; default: 50)' + ) + filter_group.add_argument( + '--min-ref-mapqual', + dest='min_ref_mapqual', type=int, + default=15, + help='Minimum average mapping quality of reads supporting the ref ' + 'allele (default: 15)' + ) + filter_group.add_argument( + '--min-var-mapqual', + dest='min_var_mapqual', type=int, + default=15, + help='Minimum average mapping quality of reads supporting the variant ' + 'allele (default: 15)' + ) + filter_group.add_argument( + '--max-mapqual-diff', + dest='max_mapqual_diff', type=int, + default=50, + help='Maximum average mapping quality difference (ref - var; ' + 'default: 50)' + ) + filter_group.add_argument( + '--max-ref-mmqs', + dest='max_ref_mmqs', type=int, + default=100, + help='Maximum mismatch quality sum of reads supporting the ref ' + 'allele (default: 100)' + ) + filter_group.add_argument( + '--max-var-mmqs', + dest='max_var_mmqs', type=int, + default=100, + help='Maximum mismatch quality sum of reads supporting the variant ' + 'allele (default: 100)' + ) + filter_group.add_argument( + '--min-mmqs-diff', + dest='min_mmqs_diff', type=int, + default=0, + help='Minimum mismatch quality sum difference (var - ref; default: 0)' + ) + filter_group.add_argument( + '--max-mmqs-diff', + dest='max_mmqs_diff', type=int, + default=50, + help='Maximum mismatch quality sum difference (var - ref; default: 50)' + ) + args = vars(p.parse_args()) + varscan_call(**args)
--- a/varscan_copynumber.xml Sun Jul 15 09:19:48 2018 -0400 +++ b/varscan_copynumber.xml Tue Dec 04 05:15:15 2018 -0500 @@ -1,4 +1,4 @@ -<tool id="varscan_copynumber" name="VarScan copynumber" version="@VERSION@.1"> +<tool id="varscan_copynumber" name="VarScan copynumber" version="@VERSION@.2"> <description>Determine relative tumor copy number from tumor-normal pileups</description> <macros> <import>macros.xml</import> @@ -65,16 +65,8 @@ </test> </tests> - <help> -**VarScan Overview** - -VarScan_ performs variant detection for massively parallel sequencing data, such as exome, WGS, and transcriptome data. -It calls variants from a mpileup dataset and produces a VCF 4.1. Full documentation is available online_. - -This tool estimates the relative tumor copy number from tumor-normal pileups. - -.. _VarScan: http://dkoboldt.github.io/varscan/ -.. _online: http://dkoboldt.github.io/varscan/using-varscan.html + <help><![CDATA[ +@HELP_HEADER@ **Input** @@ -88,6 +80,6 @@ VarScan produces a VCF 4.1 dataset as output. - </help> + ]]></help> <expand macro="citations" /> </tool>