Mercurial > repos > iuc > beagle
diff beagle.xml @ 0:553b27c30eb8 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/beagle commit ccb3f8eaa99490f8513200e45fc59e5011fb41e8"
author | iuc |
---|---|
date | Sat, 03 Jul 2021 23:33:37 +0000 |
parents | |
children | f75bf16ac901 |
line wrap: on
line diff
<!--
    Galaxy wrapper for Beagle (genotype phasing and imputation of ungenotyped markers).
    The edam_ontology, requirements and citations macros are imported from macros.xml;
    the @TOOL_VERSION@ and @SUFFIX_VERSION@ tokens are presumably defined there as well.
-->
<tool id="beagle" name="Beagle" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01">
    <description>phasing genotypes and imputing ungenotyped markers</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology" />
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Prefix of the files written by Beagle; the outputs section collects out.vcf and out.log.
        #set out_prefix='out'
        ## Beagle tells bref3 from VCF input by file name, so expose a bref3 reference as "ref.bref3".
        ## The reference is optional: check that it is set before reading its extension.
        #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
            ln -s '${optional_inputs.ref}' ref.bref3 &&
        #end if
        beagle
            gt='${gt}'
            #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3'
                ref=ref.bref3
            #else if $optional_inputs.ref
                ref='${optional_inputs.ref}'
            #end if
            #if $optional_inputs.map
                map='${optional_inputs.map}'
            #end if
            #if $chrom
                chrom='${chrom}'
            #end if
            #if $optional_inputs.excludesamples
                excludesamples='${optional_inputs.excludesamples}'
            #end if
            #if $optional_inputs.excludemarkers
                excludemarkers='${optional_inputs.excludemarkers}'
            #end if
            ne=$ne
            window=$window
            overlap=$overlap
            ## Compare with the empty string instead of testing truthiness,
            ## otherwise an explicit value of 0 would be silently dropped.
            #if str($seed) != ''
                seed=$seed
            #end if
            #if str($err) != ''
                err=$err
            #end if
            burnin=$phasing_parameters.burnin
            iterations=$phasing_parameters.iterations
            phase-states=$phasing_parameters.phase_states
            impute=$imputation_parameters.impute
            imp-states=$imputation_parameters.imp_states
            imp-segment=$imputation_parameters.imp_segment
            imp-step=$imputation_parameters.imp_step
            cluster=$imputation_parameters.cluster
            ap=$imputation_parameters.ap
            gp=$imputation_parameters.gp
            out=$out_prefix
            nthreads=\${GALAXY_SLOTS:-1}
        ## The vcf_file output is declared as uncompressed VCF, so decompress Beagle's result.
        && gunzip '${out_prefix}.vcf.gz'
    ]]></command>
    <inputs>
        <param argument="gt" type="data" format="vcf" label="VCF file"
            help="It specifies a VCF file containing genotypes for the study samples.
            Each VCF record must contain a GT (genotype) format field"/>
        <!-- Optional auxiliary files: reference panel, genetic map and exclusion lists -->
        <section name="optional_inputs" title="Optional input files" expanded="true">
            <param argument="ref" type="data" format="vcf,bref3" optional="true" label="Bref3 or VCF file with phased genotypes"
                help="Each genotype must have two phased, non-missing alleles. If a VCF file is specified, the
                phased allele separator '|' must be used"/>
            <param argument="map" type="data" format="txt" optional="true" label="PLINK map file with cM units"
                help="Beagle uses linear interpolation to estimate genetic positions between map positions. If
                no genetic map is specified, Beagle assumes a constant recombination rate of 1 cM per Mb"/>
            <param argument="excludesamples" type="data" format="txt" optional="true" label="Samples to exclude"
                help="It specifies a file containing samples (one sample identifier per line) to be excluded
                from the analysis" />
            <param argument="excludemarkers" type="data" format="txt" optional="true" label="Markers to exclude"
                help="It specifies a file containing markers (one marker per line) to be excluded from the
                analysis. Each line of the file can be either an identifier from a VCF record's ID field
                or a genomic coordinate in the format: CHROM:POS" />
        </section>
        <!-- The sanitizer keeps only letters, digits, ':' and '-' in the interval placed on the command line -->
        <param argument="chrom" type="text" optional="true" label="Specify a chromosome interval"
            help="Input format: [chrom]:[start]-[end]. The entire chromosome, the beginning, or the end may be
            specified by chrom=[chrom], chrom=[chrom]:-[end], and chrom=[chrom]:[start]-, respectively">
            <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                    <add value=":" />
                    <add value="-" />
                </valid>
            </sanitizer>
            <validator type="regex">[0-9a-zA-Z:-]+</validator>
        </param>
        <param argument="ne" type="integer" min="0" value="1000000" label="Effective population size"
            help="The default value is suitable for a large, outbred population. An appropriate effective
            population size must be specified if you are imputing ungenotyped markers in a small
            or inbred population"/>
        <param argument="window" type="float" min="0" value="40.0" label="Window length in cM"
            help="The window parameter must be at least 1.1 times as large as the overlap parameter.
            The window parameter controls the amount of memory required for the analysis"/>
        <param argument="overlap" type="float" min="0" value="2.0" label="Window overlap in cM"
            help="It specifies the cM length of overlap between adjacent sliding windows"/>
        <param argument="err" type="float" min="0" max="1" optional="true"
            label="Allele mismatch probability for the hidden Markov model"
            help="If no err parameter is specified, the err parameter will be set equal to θ/(2(θ + H))
            where θ = 1/(0.5 + ln H) and H is the number of haplotypes"/>
        <param argument="seed" type="integer" value="" optional="true" label="Random seed"
            help="A random seed is a number used to initialize a pseudorandom number generator" />
        <!-- Wrapper-only switch: controls whether the log_file output is created (see its filter) -->
        <param name="output_log" type="boolean" checked="false" label="Output a log file"/>
        <section name="phasing_parameters" title="Phasing parameters">
            <param argument="burnin" type="integer" min="0" value="3" label="Max burnin iterations"
                help="It is the maximum number of burnin iterations used to estimate an initial haplotype
                frequency model for inferring genotype phase" />
            <param argument="iterations" type="integer" min="0" value="12" label="Phasing iterations"
                help="It is the number of iterations used to estimate genotype phase. Increasing this
                parameter will trade increased computation time for increased phasing accuracy" />
            <param argument="phase-states" type="integer" min="0" value="280" label="Model states for phasing"
                help="It is the number of model states used to estimate genotype phase" />
        </section>
        <section name="imputation_parameters" title="Imputation parameters">
            <param argument="impute" type="boolean" truevalue="true" falsevalue="false"
                checked="true" label="Impute ungenotyped markers"
                help="It specifies whether markers that are present in the reference panel but absent in
                the target will be imputed. This option has no effect if no reference panel is specified"/>
            <param argument="imp-states" type="integer" min="0" value="1600" label="Model states for imputation"
                help="It is the number of model states used to impute ungenotyped markers" />
            <param argument="imp-segment" type="float" min="0" value="6.0" label="Minimum cM length of haplotype segments"
                help="It is the minimum cM length of haplotype segments that will be incorporated in the HMM state
                space for a target haplotype." />
            <param argument="imp-step" type="float" min="0" value="0.1" label="Length in cM for detecting short IBS segments"
                help="It is the length in cM of the step used for detecting short IBS segments" />
            <param argument="cluster" type="float" min="0" value="0.005" label="Max cM in a marker cluster"
                help="It specifies the maximum cM distance between individual markers that are combined
                into an aggregate marker when imputing ungenotyped markers" />
            <param argument="ap" type="boolean" truevalue="true" falsevalue="false"
                checked="false" label="Include posterior allele probabilities"
                help="It specifies whether AP1 and AP2 (allele probability) fields will be included in the output
                VCF file when imputing ungenotyped markers" />
            <param argument="gp" type="boolean" truevalue="true" falsevalue="false"
                checked="false" label="Include posterior genotype probabilities"
                help="It specifies whether a GP (genotype probability) format field will be included in the output
                VCF file when imputing ungenotyped markers. Genotype probabilities are calculated from allele
                probabilities assuming Hardy-Weinberg Equilibrium. Consequently, the alleles in the genotype
                with highest genotype probability may occasionally be different than the genotype obtained by
                taking the allele with highest probability on each haplotype, which is the genotype reported
                in the GT format field" />
        </section>
    </inputs>
    <outputs>
        <!-- out.vcf is what remains after the command gunzips Beagle's out.vcf.gz -->
        <data name="vcf_file" format="vcf" from_work_dir="out.vcf" label="${tool.name} on ${on_string}: VCF file"/>
        <!-- Only collected when the output_log switch is enabled -->
        <data name="log_file" format="txt" from_work_dir="out.log" label="${tool.name} on ${on_string}: log file">
            <filter>output_log</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test default values -->
        <test expect_num_outputs="2">
            <param name="gt" value="test.vcf.gz"/>
            <param name="chrom" value="22:100-"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <param name="err" value="0.02"/>
            <param name="seed" value="1"/>
            <param name="output_log" value="true"/>
            <section name="phasing_parameters">
                <param name="burnin" value="3"/>
                <param name="iterations" value="12"/>
                <param name="phase_states" value="280"/>
            </section>
            <output name="vcf_file" file="test_output.vcf" ftype="vcf" lines_diff="3"/>
            <output name="log_file" file="test_output.log" ftype="txt" lines_diff="16"/>
        </test>
        <!-- Test plink file -->
        <test expect_num_outputs="2">
            <param name="gt" value="test.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="30.0"/>
            <param name="overlap" value="3.0"/>
            <param name="output_log" value="true"/>
            <section name="optional_inputs">
                <param name="map" value="plink.map"/>
            </section>
            <section name="phasing_parameters">
                <param name="burnin" value="4"/>
                <param name="iterations" value="10"/>
                <param name="phase_states" value="250"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="181272"/>
                </assert_contents>
            </output>
            <output name="log_file" ftype="txt">
                <assert_contents>
                    <has_text text="Reference markers: 223"/>
                    <has_size value="1586" delta="10"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test ref VCF input -->
        <test expect_num_outputs="2">
            <param name="gt" value="target.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <param name="output_log" value="true"/>
            <section name="optional_inputs">
                <param name="ref" value="ref.vcf.gz"/>
            </section>
            <section name="imputation_parameters">
                <param name="impute" value="true"/>
                <param name="imp_states" value="1600"/>
                <param name="imp_segment" value="6.0"/>
                <param name="imp_step" value="0.1"/>
                <param name="cluster" value="0.005"/>
                <param name="ap" value="true"/>
                <param name="gp" value="true"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="18635"/>
                </assert_contents>
            </output>
            <output name="log_file" ftype="txt">
                <assert_contents>
                    <has_text text="Reference markers: 223"/>
                    <has_size value="1801" delta="10"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test ref bref3 input -->
        <test expect_num_outputs="1">
            <param name="gt" value="target.vcf.gz"/>
            <param name="ne" value="1000000"/>
            <param name="window" value="40.0"/>
            <param name="overlap" value="2.0"/>
            <section name="optional_inputs">
                <param name="ref" value="ref.bref3"/>
            </section>
            <section name="imputation_parameters">
                <param name="impute" value="true"/>
                <param name="imp_states" value="1600"/>
                <param name="imp_segment" value="6.0"/>
                <param name="imp_step" value="0.1"/>
                <param name="cluster" value="0.005"/>
                <param name="ap" value="true"/>
                <param name="gp" value="true"/>
            </section>
            <output name="vcf_file" ftype="vcf">
                <assert_contents>
                    <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/>
                    <has_size value="18635"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**

Beagle is a program for phasing and imputing missing genotypes. Sporadic missing
genotypes are imputed during phasing. If a reference panel of phased genotypes is specified
with the ref argument, ungenotyped markers that are present in the reference panel can also
be imputed.

Beagle version 5.2 provides significantly faster genotype phasing than version 5.1.
Recent versions of Beagle do not infer genotypes from genotype likelihood input data, but
Beagle versions 4.0 and 4.1 have this capability.

----

.. class:: infomark

**HapMap genetic maps**

HapMap genetic maps in PLINK format for GRCh36, GRCh37, and GRCh38 are available
at `this link <http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/>`_

----

.. class:: infomark

**Input files**

Beagle uses `Variant Call Format <http://faculty.washington.edu/browning/beagle/intro-to-vcf.html>`_
(VCF) 4.3 for input and output genotype data. Pseudoautosomal and non-pseudoautosomal
X-chromosome genotypes must be in separate input files and analysed separately unless male
haploid genotypes are coded as homozygous diploid genotypes.

In the VCF file, if any heterozygote genotype is unphased (with "/" allele separator) in a marker window,
Beagle will consider all heterozygote genotypes to be unphased, regardless of the allele separator used ("|" or "/").
Beagle assumes that a VCF file that has a name ending in ".gz" is compressed with gzip or bgzip,
and that a reference VCF file that has a name ending in ".bref3" is compressed with bref version 3.

----

.. class:: infomark

**Output files**

There are up to two output files: the VCF file and, when "Output a log file" is enabled, the log
file. The log file gives a summary of the analysis that includes the
Beagle version, the command line arguments, and compute time.

Beagle writes a bgzip-compressed vcf.gz file that contains phased, non-missing
genotypes for all non-reference samples. This tool uncompresses that file with the
unix gunzip utility, so the VCF dataset in your history is a plain VCF file.

If a reference panel is specified and ungenotyped markers are imputed, the VCF INFO
field will contain:

    ::

        - A "DR2" subfield with the estimated squared correlation between the estimated allele dose and the true allele dose.
        - An "AF" subfield with the estimated alternate allele frequencies in the target samples.
        - The "IMP" flag if the marker is imputed.

    ]]></help>
    <expand macro="citations" />
</tool>