changeset 0:93ea196681c8 draft

"planemo upload for repository https://github.com/phac-nml/snvphyl-galaxy commit 90a172f1fc12b9c4d73f4c924a8c0c5a559589d0"
author nml
date Tue, 27 Aug 2019 12:31:30 -0400
parents
children 48922d9ca355
files filtervcf.xml test-data/filterVcf.input.1.vcf test-data/filterVcf.output.1.vcf
diffstat 3 files changed, 137 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filtervcf.xml	Tue Aug 27 12:31:30 2019 -0400
@@ -0,0 +1,27 @@
+<tool id="filtervcf" name="Filter vcf" version="1.8.2">
+  <!-- Runs filterVcf.pl (presumably shipped by the snvphyl-tools requirement) on one VCF dataset. -->
+  <description>filter out indels and complex SNVS</description>
+  <requirements>
+    <requirement type="package" version="1.8.2">snvphyl-tools</requirement>
+  </requirements>
+  <!-- Dataset paths are single-quoted so the shell passes them through as single, unmodified arguments. -->
+  <command detect_errors="exit_code"><![CDATA[
+    filterVcf.pl --noindels '$vcf' -o '$vcfout'
+  ]]></command>
+  <inputs>
+    <param name="vcf" type="data" label="VCF file" format="vcf"/>
+  </inputs>
+  <outputs>
+    <data format="vcf" name="vcfout"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="vcf" value="filterVcf.input.1.vcf"/>
+      <output name="vcfout" file="filterVcf.output.1.vcf"/>
+    </test>
+  </tests>
+  <help>
+  Filter out indels and complex variants from VCF file
+  </help>
+  <citations/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filterVcf.input.1.vcf	Tue Aug 27 12:31:30 2019 -0400
@@ -0,0 +1,55 @@
+##fileformat=VCFv4.1
+##fileDate=20140423
+##source=freeBayes version 0.9.8
+##reference=/home/aaron/microbialinformatics2014/core-snv-tutorial/output-10-subsample/reference/2010EL-1749.2010EL-1786-c1_2000_2400kb.fasta
+##phasing=none
+##commandline="/opt/freebayes/freebayes --bam /home/aaron/microbialinformatics2014/core-snv-tutorial/output-10-subsample/bam/2010EL-1749.bam --vcf /home/aaron/microbialinformatics2014/core-snv-tutorial/output-10-subsample/vcf/2010EL-1749.vcf --fasta-reference /home/aaron/microbialinformatics2014/core-snv-tutorial/output-10-subsample/reference/2010EL-1749.2010EL-1786-c1_2000_2400kb.fasta --min-coverage 2 --pvar 0 --ploidy 1 --left-align-indels --min-mapping-quality 30 --min-base-quality 30 --min-alternate-fraction 0.75"
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1]">
+##INFO=<ID=RO,Number=1,Type=Integer,Description="Reference allele observations">
+##INFO=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observations">
+##INFO=<ID=SRP,Number=1,Type=Float,Description="Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=SAP,Number=A,Type=Float,Description="Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
+##INFO=<ID=ABP,Number=A,Type=Float,Description="Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=RUN,Number=A,Type=Integer,Description="Run length: the number of consecutive repeats of the alternate allele in the reference genome">
+##INFO=<ID=RPP,Number=A,Type=Float,Description="Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=RPPR,Number=1,Type=Float,Description="Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=EPP,Number=A,Type=Float,Description="End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=EPPR,Number=1,Type=Float,Description="End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=DPRA,Number=A,Type=Float,Description="Alternate allele depth ratio.  Ratio between depth in samples with each called alternate allele and those without.">
+##INFO=<ID=XRM,Number=1,Type=Float,Description="Reference allele read mismatch rate: The rate of SNVs + MNPs + INDELs in reads supporting the reference allele.">
+##INFO=<ID=XRS,Number=1,Type=Float,Description="Reference allele read SNV rate: The rate of per-base mismatches (SNVs + MNPs) in reads supporting the reference allele.">
+##INFO=<ID=XRI,Number=1,Type=Float,Description="Reference allele read INDEL rate: The rate of INDELs (gaps) in reads supporting the reference allele.">
+##INFO=<ID=XAM,Number=A,Type=Float,Description="Alternate allele read mismatch rate: The rate of SNVs + MNPs + INDELs in reads supporting the alternate allele, excluding the called variant.">
+##INFO=<ID=XAS,Number=A,Type=Float,Description="Alternate allele read SNV rate: The rate of per-base mismatches (SNVs + MNPs) in reads supporting the alternate allele, excluding the called variant.">
+##INFO=<ID=XAI,Number=A,Type=Float,Description="Alternate allele read INDEL rate: The rate of INDELs (gaps) in reads supporting the alternate allele, excluding the called variant.">
+##INFO=<ID=ODDS,Number=1,Type=Float,Description="The log odds ratio of the best genotype combination to the second-best.">
+##INFO=<ID=BVAR,Number=0,Type=Flag,Description="The best genotype combination in the posterior is variant (non homozygous).">
+##INFO=<ID=CpG,Number=0,Type=Flag,Description="CpG site (either CpG, TpG or CpA)">
+##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snv, mnp, ins, del, or complex.">
+##INFO=<ID=CIGAR,Number=A,Type=String,Description="The extended CIGAR representation of each alternate allele, with the exception that '=' is replaced by 'M' to ease VCF parsing.  Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.">
+##INFO=<ID=NUMALT,Number=1,Type=Integer,Description="Number of unique non-reference alleles in called genotypes at this position.">
+##INFO=<ID=MEANALT,Number=A,Type=Float,Description="Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.">
+##INFO=<ID=HWE,Number=1,Type=Float,Description="Phred-scaled discrete HWE prior probability of the genotyping across all samples.">
+##INFO=<ID=LEN,Number=A,Type=Integer,Description="allele length">
+##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
+##INFO=<ID=MQMR,Number=1,Type=Float,Description="Mean mapping quality of observed reference alleles">
+##INFO=<ID=PAIRED,Number=A,Type=Float,Description="Proportion of observed alternate alleles which are supported by properly paired read fragments">
+##INFO=<ID=PAIREDR,Number=1,Type=Float,Description="Proportion of observed reference alleles which are supported by properly paired read fragments">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
+##FORMAT=<ID=GLE,Number=1,Type=String,Description="Genotype Likelihood Explicit, same as GL, but with tags to indicate the specific genotype.  For instance, 0^-75.22|1^-223.42|0/0^-323.03|1/0^-99.29|1/1^-802.53 represents both haploid and diploid genotype likilehoods in a biallelic context">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
+##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
+##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
+##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	unknown
+gi|360034408|ref|NC_016445.1|_2000000_2400000	128	.	GA	AT	197.867	.	AB=0;ABP=0;AC=1;AF=1;AN=1;AO=3;CIGAR=1X;DP=3;DPRA=0;EPP=3.73412;EPPR=0;HWE=-0;LEN=1;MEANALT=1;MQM=54;MQMR=0;NS=1;NUMALT=1;ODDS=45.5605;PAIRED=0;PAIREDR=0;RO=0;RPP=3.73412;RPPR=0;RUN=1;SAP=3.73412;SRP=0;TYPE=snv;XAI=0;XAM=0;XAS=0;XRI=0;XRM=0;XRS=0;BVAR	GT:GQ:DP:RO:QR:AO:QA:GL	1:50000:3:0:0:3:212:-19.7867,0
+gi|360034408|ref|NC_016445.1|_2000000_2400000	256	.	A	C	250.1	.	AB=0;ABP=0;AC=1;AF=1;AN=1;AO=6;CIGAR=1X;DP=8;DPRA=0;EPP=8.80089;EPPR=3.0103;HWE=-0;LEN=1;MEANALT=1;MQM=54;MQMR=54;NS=1;NUMALT=1;ODDS=57.5877;PAIRED=0;PAIREDR=0;RO=2;RPP=8.80089;RPPR=7.35324;RUN=1;SAP=4.45795;SRP=3.0103;TYPE=snv;XAI=0;XAM=0;XAS=0;XRI=0;XRM=0;XRS=0;BVAR	GT:GQ:DP:RO:QR:AO:QA:GL	1:50000:8:2:142:6:420:-38.5,-13.49
+gi|360034408|ref|NC_016445.1|_2000000_2400000	512	.	AT	C	250.1	.	AB=0;ABP=0;AC=1;AF=1;AN=1;AO=6;CIGAR=1X;DP=8;DPRA=0;EPP=8.80089;EPPR=3.0103;HWE=-0;LEN=1;MEANALT=1;MQM=54;MQMR=54;NS=1;NUMALT=1;ODDS=57.5877;PAIRED=0;PAIREDR=0;RO=2;RPP=8.80089;RPPR=7.35324;RUN=1;SAP=4.45795;SRP=3.0103;TYPE=snv;XAI=0;XAM=0;XAS=0;XRI=0;XRM=0;XRS=0;BVAR	GT:GQ:DP:RO:QR:AO:QA:GL	1:50000:8:2:142:6:420:-38.5,-13.49
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filterVcf.output.1.vcf	Tue Aug 27 12:31:30 2019 -0400
@@ -0,0 +1,55 @@
+##fileformat=VCFv4.1
+##fileDate=20140423
+##source=freeBayes version 0.9.8
+##reference=/home/aaron/microbialinformatics2014/core-snv-tutorial/output-10-subsample/reference/2010EL-1749.2010EL-1786-c1_2000_2400kb.fasta
+##phasing=none
+##commandline="/opt/freebayes/freebayes --bam /home/aaron/microbialinformatics2014/core-snv-tutorial/output-10-subsample/bam/2010EL-1749.bam --vcf /home/aaron/microbialinformatics2014/core-snv-tutorial/output-10-subsample/vcf/2010EL-1749.vcf --fasta-reference /home/aaron/microbialinformatics2014/core-snv-tutorial/output-10-subsample/reference/2010EL-1749.2010EL-1786-c1_2000_2400kb.fasta --min-coverage 2 --pvar 0 --ploidy 1 --left-align-indels --min-mapping-quality 30 --min-base-quality 30 --min-alternate-fraction 0.75"
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1]">
+##INFO=<ID=RO,Number=1,Type=Integer,Description="Reference allele observations">
+##INFO=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observations">
+##INFO=<ID=SRP,Number=1,Type=Float,Description="Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=SAP,Number=A,Type=Float,Description="Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
+##INFO=<ID=ABP,Number=A,Type=Float,Description="Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=RUN,Number=A,Type=Integer,Description="Run length: the number of consecutive repeats of the alternate allele in the reference genome">
+##INFO=<ID=RPP,Number=A,Type=Float,Description="Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=RPPR,Number=1,Type=Float,Description="Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=EPP,Number=A,Type=Float,Description="End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=EPPR,Number=1,Type=Float,Description="End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
+##INFO=<ID=DPRA,Number=A,Type=Float,Description="Alternate allele depth ratio.  Ratio between depth in samples with each called alternate allele and those without.">
+##INFO=<ID=XRM,Number=1,Type=Float,Description="Reference allele read mismatch rate: The rate of SNVs + MNPs + INDELs in reads supporting the reference allele.">
+##INFO=<ID=XRS,Number=1,Type=Float,Description="Reference allele read SNV rate: The rate of per-base mismatches (SNVs + MNPs) in reads supporting the reference allele.">
+##INFO=<ID=XRI,Number=1,Type=Float,Description="Reference allele read INDEL rate: The rate of INDELs (gaps) in reads supporting the reference allele.">
+##INFO=<ID=XAM,Number=A,Type=Float,Description="Alternate allele read mismatch rate: The rate of SNVs + MNPs + INDELs in reads supporting the alternate allele, excluding the called variant.">
+##INFO=<ID=XAS,Number=A,Type=Float,Description="Alternate allele read SNV rate: The rate of per-base mismatches (SNVs + MNPs) in reads supporting the alternate allele, excluding the called variant.">
+##INFO=<ID=XAI,Number=A,Type=Float,Description="Alternate allele read INDEL rate: The rate of INDELs (gaps) in reads supporting the alternate allele, excluding the called variant.">
+##INFO=<ID=ODDS,Number=1,Type=Float,Description="The log odds ratio of the best genotype combination to the second-best.">
+##INFO=<ID=BVAR,Number=0,Type=Flag,Description="The best genotype combination in the posterior is variant (non homozygous).">
+##INFO=<ID=CpG,Number=0,Type=Flag,Description="CpG site (either CpG, TpG or CpA)">
+##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snv, mnp, ins, del, or complex.">
+##INFO=<ID=CIGAR,Number=A,Type=String,Description="The extended CIGAR representation of each alternate allele, with the exception that '=' is replaced by 'M' to ease VCF parsing.  Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.">
+##INFO=<ID=NUMALT,Number=1,Type=Integer,Description="Number of unique non-reference alleles in called genotypes at this position.">
+##INFO=<ID=MEANALT,Number=A,Type=Float,Description="Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.">
+##INFO=<ID=HWE,Number=1,Type=Float,Description="Phred-scaled discrete HWE prior probability of the genotyping across all samples.">
+##INFO=<ID=LEN,Number=A,Type=Integer,Description="allele length">
+##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
+##INFO=<ID=MQMR,Number=1,Type=Float,Description="Mean mapping quality of observed reference alleles">
+##INFO=<ID=PAIRED,Number=A,Type=Float,Description="Proportion of observed alternate alleles which are supported by properly paired read fragments">
+##INFO=<ID=PAIREDR,Number=1,Type=Float,Description="Proportion of observed reference alleles which are supported by properly paired read fragments">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
+##FORMAT=<ID=GLE,Number=1,Type=String,Description="Genotype Likelihood Explicit, same as GL, but with tags to indicate the specific genotype.  For instance, 0^-75.22|1^-223.42|0/0^-323.03|1/0^-99.29|1/1^-802.53 represents both haploid and diploid genotype likilehoods in a biallelic context">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
+##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
+##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
+##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	unknown
+gi|360034408|ref|NC_016445.1|_2000000_2400000	128	.	G	A	197.867	.	AB=0;ABP=0;AC=1;AF=1;AN=1;AO=3;CIGAR=1X;DP=3;DPRA=0;EPP=3.73412;EPPR=0;HWE=-0;LEN=1;MEANALT=1;MQM=54;MQMR=0;NS=1;NUMALT=1;ODDS=45.5605;PAIRED=0;PAIREDR=0;RO=0;RPP=3.73412;RPPR=0;RUN=1;SAP=3.73412;SRP=0;TYPE=snv;XAI=0;XAM=0;XAS=0;XRI=0;XRM=0;XRS=0;BVAR	GT:GQ:DP:RO:QR:AO:QA:GL	1:50000:3:0:0:3:212:-19.7867,0
+gi|360034408|ref|NC_016445.1|_2000000_2400000	129	.	A	T	197.867	.	AB=0;ABP=0;AC=1;AF=1;AN=1;AO=3;CIGAR=1X;DP=3;DPRA=0;EPP=3.73412;EPPR=0;HWE=-0;LEN=1;MEANALT=1;MQM=54;MQMR=0;NS=1;NUMALT=1;ODDS=45.5605;PAIRED=0;PAIREDR=0;RO=0;RPP=3.73412;RPPR=0;RUN=1;SAP=3.73412;SRP=0;TYPE=snv;XAI=0;XAM=0;XAS=0;XRI=0;XRM=0;XRS=0;BVAR	GT:GQ:DP:RO:QR:AO:QA:GL	1:50000:3:0:0:3:212:-19.7867,0
+gi|360034408|ref|NC_016445.1|_2000000_2400000	256	.	A	C	250.1	.	AB=0;ABP=0;AC=1;AF=1;AN=1;AO=6;CIGAR=1X;DP=8;DPRA=0;EPP=8.80089;EPPR=3.0103;HWE=-0;LEN=1;MEANALT=1;MQM=54;MQMR=54;NS=1;NUMALT=1;ODDS=57.5877;PAIRED=0;PAIREDR=0;RO=2;RPP=8.80089;RPPR=7.35324;RUN=1;SAP=4.45795;SRP=3.0103;TYPE=snv;XAI=0;XAM=0;XAS=0;XRI=0;XRM=0;XRS=0;BVAR	GT:GQ:DP:RO:QR:AO:QA:GL	1:50000:8:2:142:6:420:-38.5,-13.49