diff beagle.xml @ 0:553b27c30eb8 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/beagle commit ccb3f8eaa99490f8513200e45fc59e5011fb41e8"
author iuc
date Sat, 03 Jul 2021 23:33:37 +0000
parents
children f75bf16ac901
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/beagle.xml	Sat Jul 03 23:33:37 2021 +0000
@@ -0,0 +1,314 @@
+<tool id='beagle' name='Beagle' version='@TOOL_VERSION@+galaxy@SUFFIX_VERSION@' profile='20.01'>
+    <description>phasing genotypes and imputing ungenotyped markers</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro='edam_ontology' />
+    <expand macro='requirements' />
+    <command detect_errors='exit_code'><![CDATA[
+        #set out_prefix='out'
+        #if $optional_inputs.ref.ext == 'bref3'
+            ln -s '${optional_inputs.ref}' ref.bref3 &&
+        #end if
+        beagle
+        gt='${gt}'
+        #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
+            ref=ref.bref3
+        #else if $optional_inputs.ref
+            ref='${optional_inputs.ref}'
+        #end if
+        #if $optional_inputs.map
+            map='${optional_inputs.map}'
+        #end if
+        #if $chrom
+            chrom='${chrom}'
+        #end if
+        #if $optional_inputs.excludesamples
+            excludesamples='${optional_inputs.excludesamples}'
+        #end if
+        #if $optional_inputs.excludemarkers
+            excludemarkers='${optional_inputs.excludemarkers}'
+        #end if
+        ne=$ne
+        window=$window
+        overlap=$overlap
+        #if $seed
+            seed=$seed
+        #end if
+        #if $err
+            err=$err
+        #end if
+        burnin=$phasing_parameters.burnin
+        iterations=$phasing_parameters.iterations
+        phase-states=$phasing_parameters.phase_states
+        impute=$imputation_parameters.impute
+        imp-states=$imputation_parameters.imp_states
+        imp-segment=$imputation_parameters.imp_segment
+        imp-step=$imputation_parameters.imp_step
+        cluster=$imputation_parameters.cluster
+        ap=$imputation_parameters.ap
+        gp=$imputation_parameters.gp
+        out=$out_prefix
+        nthreads=\${GALAXY_SLOTS:-1}
+        && gunzip 'out.vcf.gz'
+    ]]>    </command>
+    <inputs>
+        <param argument="gt" type="data" format="vcf" label="VCF file" 
+            help="It specifies a VCF file containing genotypes for the study samples. 
+                Each VCF record must contain a GT (genotype) format field"/>
+        <section name="optional_inputs" title="Optional input files" expanded="true">
+            <param argument="ref" type="data" format="vcf,bref3" optional="true" label="Bref3 or VCF file with phased genotypes" 
+                help="Each genotype must have two phased, non-missing alleles. If a VCF file is specified, the 
+                    phased allele separator must be used '|'"/>
+            <param argument="map" type="data" format="txt" optional="true" label="PLINK map file with cM units" 
+                help="Beagle uses linear interpolation to estimate genetic positions between map positions. If 
+                no genetic map is specified, Beagle assumes a constant recombination rate of 1 cM per Mb"/>
+            <param argument="excludesamples" type="data" format="txt" optional="true" label="Samples to exclude" 
+                help="It specifies a file containing samples (one sample identifier per line) to be excluded 
+                    from the analysis" />
+            <param argument="excludemarkers" type="data" format="txt" optional="true" label="Markers to exclude" 
+                help="It specifies a file containing markers (one marker per line) to be excluded from the 
+                    analysis. Each line of the file can be either an identifier from a VCF record’s ID field 
+                    or a genomic coordinate in the format: CHROM:POS" />
+        </section>
+        <param argument="chrom" type="text" optional="true" label="Specify a chromosome interval" 
+            help="Input format: [chrom]:[start]-[end]. The entire chromosome, the beginning, or the end may be 
+                specified by chrom=[chrom], chrom=[chrom]:-[end], and chrom=[chrom]:[start]-, respectively">
+            <sanitizer invalid_char="">
+                <valid initial="string.letters,string.digits">
+                    <add value=":" />
+                    <add value="-" />
+                </valid>
+            </sanitizer>
+            <validator type="regex">[0-9a-zA-Z:-]+</validator>
+        </param>
+        <param argument="ne" type="integer" min="0" value="1000000" label="Effective population size" 
+            help="The default value is suitable for a large, outbred population. It is needed to specify an 
+                appropriate effective populations size if you are imputing ungenotyped markers in a small 
+                or inbred population"/>
+        <param argument="window" type="float" min="0" value="40.0" label="Window length in cM" 
+            help="The window parameter must be at least 1.1 times as large as the overlap parameter. 
+                The window parameter controls the amount of memory required for the analysis"/>
+        <param argument="overlap" type="float" min="0" value="2.0" label="Window overlap in cM" 
+            help="It specifies the cM length of overlap between adjacent sliding windows"/>
+        <param argument="err" type="float" min="0" max="1" optional="true" 
+            label="Allele mismatch probability for the hidden Markov model" 
+            help="If no err parameter is specified, the err parameter will be set equal πœƒ/(2(πœƒ + 𝐻)) 
+                where πœƒ = 1/(0.5 + ln 𝐻) and 𝐻 is the number of haplotypes"/>
+        <param argument="seed" type="integer" value="" optional="true" label="Random seed" 
+            help="A random seed is a number used to initialize a pseudorandom number generator" />
+        <param name="output_log" type="boolean" checked="false" label="Output a log file"/>
+        <section name="phasing_parameters" title="Phasing parameters">
+            <param argument="burnin" type="integer" min="0" value="3" label="Max burnin iterations" 
+                help="It is the maximum number of burnin iterations used to estimate an initial haplotype 
+                    frequency model for inferring genotype phase" />
+            <param argument="iterations" type="integer" min="0" value="12" label="Phasing iterations" 
+                help="It is the number of iterations used to estimate genotype phase. Increasing this 
+                    parameter will trade increased computation time for increased phasing accuracy" />
+            <param argument="phase-states" type="integer" min="0" value="280" label="Model states for phasing" 
+                help="It is the number of model states used to estimate genotype phase" />
+        </section>
+        <section name="imputation_parameters" title="Imputation parameters">
+            <param argument="impute" type="boolean" truevalue="true" falsevalue="false" 
+                checked="true" label="Impute ungenotyped markers" 
+                help="It specifies whether markers that are present in the reference panel but absent in 
+                    that target will be imputed. This option has no effect if no reference panel is specified"/>
+            <param argument="imp-states" type="integer" min="0" value="1600" label="Model states for imputation" 
+                help="It is the number of model states used to impute ungenotyped markers" />
+            <param argument="imp-segment" type="float" min="0" value="6.0" label="Minimum cM length of haplotype segments" 
+                help="It is the minimum cM length of haplotype segments that will be incorporated in the HMM state 
+                    space for a target haplotype." />
+            <param argument="imp-step" type="float" min="0" value="0.1" label="Length in cM for detecting short IBS segments"
+                 help="It is the length in cM of the step used for detecting short IBS segments" />
+            <param argument="cluster" type="float" min="0" value="0.005" label="Max cM in a marker cluster" 
+                help="It specifies the maximum cM distance between individual markers that are combined 
+                    into an aggregate marker when imputing ungenotyped markers" />
+            <param argument="ap" type="boolean" truevalue="true" falsevalue="false" 
+                checked="false" label="Include posterior allele probabilities" 
+                help="It specifies whether AP1 and AP2 (allele probability) fields will be included in the output 
+                    VCF file when imputing ungenotyped markers" />
+            <param argument="gp" type="boolean" truevalue="true" falsevalue="false" 
+                checked="false" label="Include posterior genotype probabilities" 
+                help="It specifies whether a GP (genotype probability) format field will be included in the output 
+                VCF file when imputing ungenotyped markers. Genotype probabilities are calculated from allele 
+                probabilities assuming Hardy-Weinberg Equilibrium. Consequently, the alleles in the genotype 
+                with highest genotype probability may occasionally be different than the genotype obtained by 
+                taking the allele with highest probability on each haplotype, which is the genotype reported 
+                in the GT format field" />
+        </section>
+    </inputs>
+    <outputs>
+        <data name="vcf_file" format="vcf" from_work_dir="out.vcf" label="${tool.name} on ${on_string}: VCF file"/>
+        <data name="log_file" format="txt" from_work_dir="out.log" label="${tool.name} on ${on_string}: log file">
+            <filter>output_log</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test default values -->
+        <test expect_num_outputs="2">
+            <param name="gt" value="test.vcf.gz"/>
+            <param name="chrom" value="22:100-"/>
+            <param name="ne" value="1000000"/>
+            <param name="window" value="40.0"/>
+            <param name="overlap" value="2.0"/>
+            <param name="err" value="0.02"/>
+            <param name="seed" value="1"/>
+            <param name="output_log" value="true"/>
+            <section name="phasing_parameters">
+                <param name="burnin" value="3"/>
+                <param name="iterations" value="12"/>
+                <param name="phase_states" value="280"/>
+            </section>
+            <output name="vcf_file" file="test_output.vcf" ftype="vcf" lines_diff="3"/>
+            <output name="log_file" file="test_output.log" ftype="txt" lines_diff="16"/>
+        </test>
+        <!-- Test plink file-->
+        <test expect_num_outputs="2">
+            <param name="gt" value="test.vcf.gz"/>
+            <param name="ne" value="1000000"/>
+            <param name="window" value="30.0"/>
+            <param name="overlap" value="3.0"/>
+            <param name="output_log" value="true"/>
+            <section name="optional_inputs">
+                <param name="map" value="plink.map"/>
+            </section>
+            <section name="phasing_parameters">
+                <param name="burnin" value="4"/>
+                <param name="iterations" value="10"/>
+                <param name="phase_states" value="250"/>
+            </section>
+            <output name="vcf_file" ftype="vcf">
+                <assert_contents>
+                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
+                    <has_size value="181272"/>
+                </assert_contents>
+            </output>
+            <output name="log_file" ftype="txt">
+                <assert_contents>
+                    <has_text text="Reference markers:                  223"/>
+                    <has_size value="1586" delta="10"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test ref VCF input -->
+        <test expect_num_outputs="2">
+            <param name="gt" value="target.vcf.gz"/>
+            <param name="ne" value="1000000"/>
+            <param name="window" value="40.0"/>
+            <param name="overlap" value="2.0"/>
+            <param name="output_log" value="true"/>
+            <section name="optional_inputs">
+                <param name="ref" value="ref.vcf.gz"/>
+            </section>
+            <section name="imputation_parameters">
+                <param name="impute" value="true"/>
+                <param name="imp_states" value="1600"/>
+                <param name="imp_segment" value="6.0"/>
+                <param name="imp_step" value="0.1"/>
+                <param name="cluster" value="0.005"/>
+                <param name="ap" value="true"/>
+                <param name="gp" value="true"/>
+            </section>
+            <output name="vcf_file" ftype="vcf">
+                <assert_contents>
+                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
+                    <has_size value="18635"/>
+                </assert_contents>
+            </output>
+            <output name="log_file" ftype="txt">
+                <assert_contents>
+                    <has_text text="Reference markers:                  223"/>
+                    <has_size value="1801" delta="10"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test ref bref3 input -->
+        <test expect_num_outputs="1">
+            <param name="gt" value="target.vcf.gz"/>
+            <param name="ne" value="1000000"/>
+            <param name="window" value="40.0"/>
+            <param name="overlap" value="2.0"/>
+            <section name="optional_inputs">
+                <param name="ref" value="ref.bref3"/>
+            </section>
+            <section name="imputation_parameters">
+                <param name="impute" value="true"/>
+                <param name="imp_states" value="1600"/>
+                <param name="imp_segment" value="6.0"/>
+                <param name="imp_step" value="0.1"/>
+                <param name="cluster" value="0.005"/>
+                <param name="ap" value="true"/>
+                <param name="gp" value="true"/>
+            </section>
+            <output name="vcf_file" ftype="vcf">
+                <assert_contents>
+                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
+                    <has_size value="18635"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+.. class:: infomark
+
+**Purpose**
+
+Beagle is a program for phasing and imputing missing genotypes. Sporadic missing
+genotypes are imputed during phasing. If a reference panel of phased genotypes is specified
+with the ref argument, ungenotyped markers that are present in the reference panel can also
+be imputed.
+
+Beagle version 5.2 provides significantly faster genotype phasing than version 5.1.
+Recent versions of Beagle do not infer genotypes from genotype likelihood input data, but
+Beagle versions 4.0 and 4.1 have this capability.
+
+----
+
+.. class:: infomark
+
+**HapMap genetic maps**
+
+HapMap genetic maps in PLINK format for GRCh36, GRCh37, and GRCh38 are available 
+in `this link <http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/>`_
+
+----
+
+.. class:: infomark
+
+**Input files**
+
+Beagle uses `Variant Call Format <http://faculty.washington.edu/browning/beagle/intro-to-vcf.html>`_ 
+(VCF) 4.3 for input and output genotype data. Pseuodoautosomal and non-pseudoautosomal 
+X-chromosome genotypes must be in separate input files and analysed separately unless male 
+haploid genotypes are coded as homozygous diploid genotypes.
+
+In the VCF file, if any heterozygote genotype is unphased (with "/" allele separator) in a marker window, 
+it will consider all heterozygote genotypes to be unphased, regardless of the allele separator used ("|" or "/").
+Beagle assumes that an the VCF file has a name ending in ".gz" is compressed with gzip or bgzip, 
+and that a reference VCF file that has a name ending in β€œ.bref3” is compressed with bref version 3.
+
+----
+
+.. class:: infomark
+
+**Output files**
+
+There are two output files. The log file gives a summary of the analysis that includes the
+Beagle version, the command line arguments, and compute time.
+
+The vcf.gz file is a bgzip-compressed VCF file that contains phased, non-missing
+genotypes for all non-reference samples. The output vcf.gz file can be uncompressed with the
+unix gunzip utility.
+
+If a reference panel is specified and ungenotyped markers are imputed, the VCF INFO
+field will contain:
+
+    ::
+
+        - A "DR2" subfield with the estimated squared correlation between the estimated allele dose and the true allele dose.
+        - An "AF" subfield with the estimated alternate allele frequencies in the target samples.
+        - The "IMP" flag if the marker is imputed.
+                  
+    ]]>    </help>
+    <expand macro="citations" />
+</tool>