diff varextract.xml @ 0:f0f2795de2c7 draft

planemo upload for repository https://github.com/wm75/mimodd_galaxy_wrappers commit 528bcf3b769c7c73f119b2a176d19071f9ef5312
author wolma
date Tue, 19 Dec 2017 04:54:04 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/varextract.xml	Tue Dec 19 04:54:04 2017 -0500
@@ -0,0 +1,184 @@
+<tool id="mimodd_varextract" name="MiModD Extract Variant Sites" 
+version="@MIMODD_WRAPPER_VERSION@">
+    <description>from a BCF file</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version_command" />
+    <command><![CDATA[
+	mimodd varextract '$ifile'
+	  #if $len($sitesinfo)
+	    -p
+	    #for $source in $sitesinfo
+          '${source.pre_vcf}'
+	    #end for
+	  #end if
+	  --ofile '$output_vcf'
+	  $keep_alts
+	  --verbose
+    ]]></command>
+
+    <inputs>
+        <param name="ifile" type="data" format="bcf" label="BCF input file"
+        help="Use the MiModD Variant Calling tool to generate the input for this tool."/>
+        <repeat name="sitesinfo" title="include information from pre-calculated vcf dataset" default="0">
+	        <param name="pre_vcf" type="data" format="vcf"
+	        label="independently generated vcf datset" />
+        </repeat>
+        <param name="keep_alts" type="boolean" truevalue="-a" falsevalue="" checked="false"
+        label="keep all sites with alternate bases"
+        help="If selected, the VCF output will include ALL sites for which non-reference bases have been observed, i.e., even those not considered allelic sites by the variant caller." />
+    </inputs>
+    <outputs>
+        <data name="output_vcf" format="vcf"
+        label="Variants extracted with MiModd from ${on_string}"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="ifile" value="a.bcf" />
+            <output name="output_vcf" ftype="vcf">
+                <assert_contents>
+                    <has_line_matching expression="#CHROM.POS.ID.REF.ALT.QUAL.FILTER.INFO.FORMAT.N2.ot266" />
+                </assert_contents>
+            </output>
+            <assert_command>
+                <not_has_text text="-a" />
+            </assert_command>
+        </test>
+        <test>
+            <param name="ifile" value="a_part2.bcf" />
+            <param name="keep_alts" value="true" />
+            <param name="pre_vcf" value="a.vcf" />
+            <output name="output_vcf" ftype="vcf">
+                <assert_contents>
+                    <has_line_matching expression="#CHROM.POS.ID.REF.ALT.QUAL.FILTER.INFO.FORMAT.ot266.external_source_1_N2.external_source_1_ot266" />
+                </assert_contents>
+            </output>
+            <assert_command>
+                <has_text text="-a" />
+            </assert_command>
+        </test>
+    </tests>
+    <help><![CDATA[
+.. class:: infomark
+
+   **What it does**
+
+The tool takes as input a BCF dataset like the ones produced by the
+*MiModD Variant Calling* tool, extracts just the variant sites from it and
+reports them in VCF format.
+
+If the BCF input file specifies multiple samples, sites are included if they qualify as variant sites in at least one sample.
+
+----------
+
+**Options:**
+
+**keep all sites with alternate bases**
+
+By default, a variant site is considered to be a position in the genome for
+which a non-reference allele appears in the inferred genotype of any sample. 
+
+You can select the *keep all sites with alternate bases* option, if instead
+you want to extract all sites, for which at least one non-reference base has
+been observed (whether resulting in a non-reference allele call or not).
+Using this option should rarely be necessary, but could be occassionally
+helpful for closer inspection of candidate genomic regions.
+
+ 
+**include information from pre-calculated vcf dataset**
+
+During the process of variant extraction the tool can take into account
+genome positions specified in one or more independently generated VCF datasets.
+If such additional VCF input is provided, the tool output will contain the
+samples found in these files as additional samples and sites from the main BCF
+dataset will be included not only if they qualify as variant sites in at least
+one sample specified in the BCF, but also if they are listed in any of the
+additional VCF datasets.
+
+Optional VCF input can be particularly useful in one of the following
+situations:
+   
+1) you have prior information that leads you to think that certain genome
+   positions are of special relevance for your project and, thus, you are
+   interested in the statistics produced by the variant caller for these
+   positions even if they are not considered variant sites. In this case you
+   can use a minimal VCF dataset to guide the variant extraction process to
+   include these positions. This dataset needs a minimal header of the form:
+
+   ``##fileformat=VCFv4.2``
+
+   followed by positional information like in this example::
+ 						     
+     #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+     chrI	1222	.	.	.	.	.	.
+     chrI	2651	.	.	.	.	.	.
+     chrI	3659	.	.	.	.	.	.
+     chrI	3731	.	.	.	.	.	.
+  
+   , where columns are tab-separated and . serves as a placeholder for missing
+   information.
+  
+2) you have actual variant calls from an additional sample, but you do not
+   have access to the original sequenced reads data (if you had, the
+   recommended approach would be to include that data in the
+   *MiModD Variant Calling* step. 
+
+   This situation is often encountered with published datasets. Assume you
+   have obtained a list of known single nucleotide variants (SNVs) found in
+   one particular strain of your favorite model organism and you would like
+   to know which of these SNVs are present in the related strains you have
+   sequenced. You have aligned the sequenced reads from your samples and have
+   used the *MiModD Variant Calling* tool, which has generated a BCF dataset
+   ready for variant extraction. If the SNV list for the previously sequenced
+   strain is in VCF format already, you can now just plug it into the
+   analysis process by specifying it in the tool interface as an
+   *independently generated vcf dataset*.
+   The resulting vcf output will contain all SNV sites along with the variant
+   sites found in the BCF alone. You can then proceed to the
+   *MiModD VCF Filter* tool to look at the original SNV sites only or to
+   investigate any other interesting subset of sites. If the SNV list is in
+   some other format, you will have o convert it to VCF first. At a minimum,
+   the dataset must have a ``##fileformat`` header line like the previous
+   example and have the ``REF`` and ``ALT`` column filled in like so::
+
+     #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+     chrI	1897409	.	A	G	.	.	.
+     chrI	1897492	.	C	T	.	.	.
+     chrI	1897616	.	C	A	.	.	.
+     chrI	1897987	.	A	T	.	.	.
+     chrI	1898185	.	C	T	.	.	.
+     chrI	1898715	.	G	A	.	.	.
+     chrI	1898729	.	T	C	.	.	.
+     chrI	1900288	.	T	A	.	.	.
+  
+   , in which case the tool will assume that the corresponding sample is
+   homozygous for each of the SNVs.
+   If you need to distinguish between homozygous and heterozygous SNVs you
+   will have to extend the format to include a format and a sample column
+   with genotype (GT) information like in this example::
+
+     #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sampleX
+     chrI	1897409	.	A	G	.	.	.	GT	1/1
+     chrI	1897492	.	C	T	.	.	.	GT	0/1
+     chrI	1897616	.	C	A	.	.	.	GT	0/1
+     chrI	1897987	.	A	T	.	.	.	GT	0/1
+     chrI	1898185	.	C	T	.	.	.	GT	0/1
+     chrI	1898715	.	G	A	.	.	.	GT	0/1
+     chrI	1898729	.	T	C	.	.	.	GT	0/1
+     chrI	1900288	.	T	A	.	.	.	GT	0/1
+  
+   , in which sampleX would be heterozygous for all SNVs except the first.
+
+.. class:: warningmark
+   
+   If the optional VCF input contains INDEL calls, these will be ignored by the
+   tool.
+
+@HELP_FOOTER@
+    ]]></help>
+    <expand macro="citations" />
+</tool>