# HG changeset patch # User iuc # Date 1496788878 14400 # Node ID 977a5301b66d6c04ebce64bb89dff9b82d20bd77 # Parent 9f164587a92f9853b8ad1d81d24a77edf48a01e0 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/freebayes commit 9bbda385129b4bc34f66889d28c2570bf5bb2214 diff -r 9f164587a92f -r 977a5301b66d freebayes.xml --- a/freebayes.xml Tue Jun 06 11:44:38 2017 -0400 +++ b/freebayes.xml Tue Jun 06 18:41:18 2017 -0400 @@ -1,18 +1,13 @@ - + bayesian genetic variant detector macros.xml - - freebayes - samtools + gawk parallel - - - - - + > regions_all.bed && + awk '{ gsub("^SN:","",$1); gsub("^LN:","",$2); print $1"\t0\t"$2; }' >> regions_all.bed && #end for #end if @@ -63,15 +55,12 @@ ## split into even small chunks, this has some disatvantages and will not be used for the moment ## bedtools makewindows -b regions_uniq.bed -w 10000000 -s 9990000 > regions.bed && - mkdir vcf_output && - mkdir failed_alleles && - mkdir trace && + mkdir vcf_output failed_alleles trace && ## Finished setting up inputs for i in `cat regions_uniq.bed | awk '{print $1":"$2".."$3}'`; do - echo " ## COMMAND LINE STARTS HERE @@ -90,11 +79,10 @@ ##advanced options #if str( $options_type.options_type_selector ) == "simple": - ##do nothing as command like build up to this point is sufficinet for simple diploid calling - + #pass #elif str( $options_type.options_type_selector ) == "simple_w_filters": --standard-filters - --min-coverage '${options_type.min_coverage}' + --min-coverage ${options_type.min_coverage} #elif str( $options_type.options_type_selector ) == "naive": --haplotype-length 0 --min-alternate-count 1 @@ -108,9 +96,7 @@ --pooled-continuous --report-monomorphic --standard-filters - --min-coverage '${options_type.min_coverage}' - - ## Command line direct text entry is not allowed at this time for security reasons + --min-coverage ${options_type.min_coverage} #elif str( $options_type.options_type_selector ) == "full": #if str( $options_type.optional_inputs.optional_inputs_selector ) == 'set': ${options_type.optional_inputs.report_monomorphic} @@ -151,8 +137,8 @@ #end if ## POPULATION MODEL #if str( $options_type.population_model.population_model_selector ) == "set": - --theta '${options_type.population_model.T}' - --ploidy '${options_type.population_model.P}' + --theta ${options_type.population_model.T} + --ploidy ${options_type.population_model.P} ${options_type.population_model.J} ${options_type.population_model.K} #end if @@ -171,11 +157,11 @@ ${options_type.allele_scope.u} ${options_type.allele_scope.no_partial_observations} - -n '${options_type.allele_scope.n}' + -n ${options_type.allele_scope.n} - --haplotype-length '${options_type.allele_scope.haplotype_length}' - --min-repeat-size '${options_type.allele_scope.min_repeat_length}' - --min-repeat-entropy '${options_type.allele_scope.min_repeat_entropy}' + --haplotype-length ${options_type.allele_scope.haplotype_length} + --min-repeat-size ${options_type.allele_scope.min_repeat_length} + --min-repeat-entropy ${options_type.allele_scope.min_repeat_entropy} #end if ## REALIGNMENT @@ -184,25 +170,27 @@ ##INPUT FILTERS #if str( $options_type.input_filters.input_filters_selector ) == "set": ${options_type.input_filters.use_duplicate_reads} - -m '${options_type.input_filters.m}' - -q '${options_type.input_filters.q}' - -R '${options_type.input_filters.R}' - -Y '${options_type.input_filters.Y}' - -e '${options_type.input_filters.e}' - -F '${options_type.input_filters.F}' - -C '${options_type.input_filters.C}' - -G '${options_type.input_filters.G}' + -m ${options_type.input_filters.m} + -q ${options_type.input_filters.q} + -R ${options_type.input_filters.R} + -Y ${options_type.input_filters.Y} + -e ${options_type.input_filters.e} + -F ${options_type.input_filters.F} + -C ${options_type.input_filters.C} + -G ${options_type.input_filters.G} #if str( $options_type.input_filters.mismatch_filters.mismatch_filters_selector ) == "set": - -Q '${options_type.input_filters.mismatch_filters.Q}' - -U '${options_type.input_filters.mismatch_filters.U}' - -z '${options_type.input_filters.mismatch_filters.z}' + -Q ${options_type.input_filters.mismatch_filters.Q} + #if str($options_type.input_filters.mismatch_filters.U) + -U ${options_type.input_filters.mismatch_filters.U} + #end if + -z ${options_type.input_filters.mismatch_filters.z} - --read-snp-limit '${options_type.input_filters.mismatch_filters.read_snp_limit}' + --read-snp-limit ${options_type.input_filters.mismatch_filters.read_snp_limit} #end if - --min-coverage '${options_type.input_filters.min_coverage}' - --min-alternate-qsum "${options_type.input_filters.min_alternate_qsum}" + --min-coverage ${options_type.input_filters.min_coverage} + --min-alternate-qsum ${options_type.input_filters.min_alternate_qsum} #end if ## POPULATION AND MAPPABILITY PRIORS @@ -217,8 +205,8 @@ #if str( $options_type.genotype_likelihoods.genotype_likelihoods_selector ) == "set": ${$options_type.genotype_likelihoods.experimental_gls} - --base-quality-cap '${$options_type.genotype_likelihoods.base_quality_cap}' - --prob-contamination '${$options_type.genotype_likelihoods.prob_contamination}' + --base-quality-cap ${$options_type.genotype_likelihoods.base_quality_cap} + --prob-contamination ${$options_type.genotype_likelihoods.prob_contamination} #end if ## ALGORITHMIC FEATURES @@ -227,8 +215,8 @@ -W '${options_type.algorithmic_features.W}' -D '${options_type.algorithmic_features.D}' - #if str( $options_type.algorithmic_features.genotype_variant_threshold.genotype_variant_threshold_selector ) == "set": - -S '${options_type.algorithmic_features.genotype_variant_threshold.S}' + #if str($options_type.algorithmic_features.genotype_variant_threshold) + -S ${options_type.algorithmic_features.genotype_variant_threshold} #end if ${options_type.algorithmic_features.N} @@ -237,7 +225,7 @@ ${options_type.algorithmic_features.genotype_qualities} ${options_type.algorithmic_features.report_genotype_likelihood_max} - --genotyping-max-banddepth '${options_type.algorithmic_features.genotyping_max_banddepth}' + --genotyping-max-banddepth ${options_type.algorithmic_features.genotyping_max_banddepth} #end if #end if @@ -245,7 +233,7 @@ done > freebayes_commands.sh && cat freebayes_commands.sh | - parallel --no-notice -j \${GALAXY_SLOTS:-1} && + parallel --will-cite -j \${GALAXY_SLOTS:-1} && ## make VCF header grep "^#" "./vcf_output/part_\$i.vcf" > header.txt && @@ -292,7 +280,7 @@ - + @@ -303,24 +291,24 @@ - + - + - + - + - + help="Select how much control over the freebayes run you need"> + @@ -329,162 +317,159 @@ + help="Sets --samples, --populations, --cnv-map, --trace, --failed-alleles, --variant-input, --only-use-input-alleles, --haplotype-basis-alleles, --report-all-haplotype-alleles, --report-monomorphic options, --observation-bias, and --contamination-estimates"> - - - - - + + + + + - - - + + + - + - + - + - - - - + + + + - + - + - + - + + help="Sets --theta, --ploidy, --pooled-discrete, and --pooled-continuous options"> - - - - + + + + - + + help="Sets --use-reference-allele and --reference-quality options"> - - + + - + + help="Sets -I, i, -X, -u, -n, --haplotype-length, --min-repeat-size, --min-repeat-entropy, and --no-partial-observations options"> - - - - - - - - - + + + + + + + + + help="By default, FreeBayes uses all observations, dividing partial support across matching haplotypes when generating haplotypes" /> - + - + + help="Sets -4, -m, -q, -R, -Y, -Q, -U, -z, -$, -e, -0, -F, -C, -3, -G, and -! options"> - - - - - + + + + + @@ -492,86 +477,77 @@ - - - - + + + + - + - - + - - - - + help="Equivalent to -m 30 -q 20 -R 0 -S 0" /> + + + + - + + help="Sets -k, -w, -V, and -a options"> - - - - + + + + - + + help="Sets --base-quality-cap, --experimental-gls, and --prob-contamination options"> - - + - + help="Incorporate partial observations. This is the default when contamination estimates are provided. Optimized for diploid samples" /> + - + @@ -582,57 +558,42 @@ - - - - - - - - - - - - - - - - - + + + + + + + - - + help="By default, FreeBayes uses a minimum Base Quality in flanking sequence" /> + + - + - + - + - + @@ -695,7 +656,7 @@ **Description** -Privided BAM file(s) and a reference. FreeBayes will provide VCF output on standard out describing SNPs, indels, and complex variants in samples in the input alignments. +Provided some BAM dataset(s) and a reference sequence, FreeBayes will produce a VCF dataset describing SNPs, indels, and complex variants in samples in the input alignments. By default, FreeBayes will consider variants supported by at least 2 observations in a single sample (-C) and also by at least 20% of the reads from a single sample (-F). These settings are suitable to low to high depth sequencing in haploid and diploid samples, but users working with polyploid or pooled samples may wish to adjust them depending on the characteristics of their sequencing data. @@ -709,268 +670,20 @@ **Galaxy-specific options** -Galaxy allows five levels of control over FreeBayes options provided by **Choose parameter selection level** menu option. These are: - - 1. *Simple diploid calling*: The simples possible FreeBayes application. Equvalent of using FreeBayes with only a BAM input and no other parameter options. - 2. *Simple diploid calling with filtering and coverage*: Same as #1 plus two additional options: -0 (standard filters: --min-mapping-quality 30 --min-base-quality 20 --min-supporting-allele-qsum 0 --genotype-varinat-threshold 0) and --min-coverage. - 3. *Frequency-based pooled calling*: This is equivalent to using FreeBayes with the following options: --haplotype-length 0 --min-alternate-count 1 --min-alternate-fraction 0 --pooled-continuous --report-monomorphic. This is the best choice for calling varinats in mixtures such as viral, bacterial, or organellar genomes. - 4. *Frequency-based pooled calling with filtering and coverage*: Same as #3 but adds -0 and --min-coverage like in #2. - 5. *Complete list of all options*: Gives you full control by exposing all FreeBayes options as Galaxy widgets. - ------ - -**FreeBayes options** - -.. class:: infomark - -Note that each Galaxy parameter widget corresponding to command line flags listed below: - -Input and output:: - - -t --targets FILE - Limit analysis to targets listed in the BED-format FILE. - -r --region chrom:start_position-end_position - Limit analysis to the specified region, 0-base coordinates, - end_position included. Either '-' or '..' maybe used as a separator. - -s --samples FILE - Limit analysis to samples listed (one per line) in the FILE. - By default FreeBayes will analyze all samples in its input - BAM files. - --populations FILE - Each line of FILE should list a sample and a population which - it is part of. The population-based bayesian inference model - will then be partitioned on the basis of the populations. - -A --cnv-map FILE - Read a copy number map from the BED file FILE, which has - the format: - reference sequence, start, end, sample name, copy number - ... for each region in each sample which does not have the - default copy number as set by --ploidy. - --trace FILE Output an algorithmic trace to FILE. - --failed-alleles FILE - Write a BED file of the analyzed positions which do not - pass --pvar to FILE. - -@ --variant-input VCF - Use variants reported in VCF file as input to the algorithm. - Variants in this file will be treated as putative variants - even if there is not enough support in the data to pass - input filters. - -l --only-use-input-alleles - Only provide variant calls and genotype likelihoods for sites - and alleles which are provided in the VCF input, and provide - output in the VCF for all input alleles, not just those which - have support in the data. - --haplotype-basis-alleles VCF - When specified, only variant alleles provided in this input - VCF will be used for the construction of complex or haplotype - alleles. - --report-all-haplotype-alleles - At sites where genotypes are made over haplotype alleles, - provide information about all alleles in output, not only - those which are called. - --report-monomorphic - Report even loci which appear to be monomorphic, and report all - considered alleles, even those which are not in called genotypes. - Loci which do not have any potential alternates have '.' for ALT. - -Reporting:: - - -P --pvar N Report sites if the probability that there is a polymorphism - at the site is greater than N. default: 0.0. Note that post- - filtering is generally recommended over the use of this parameter. - -Population model:: - - -T --theta N The expected mutation rate or pairwise nucleotide diversity - among the population under analysis. This serves as the - single parameter to the Ewens Sampling Formula prior model - default: 0.001 - -p --ploidy N Sets the default ploidy for the analysis to N. default: 2 - -J --pooled-discrete - Assume that samples result from pooled sequencing. - Model pooled samples using discrete genotypes across pools. - When using this flag, set --ploidy to the number of - alleles in each sample or use the --cnv-map to define - per-sample ploidy. - -K --pooled-continuous - Output all alleles which pass input filters, regardles of - genotyping outcome or model. - -Reference allele:: - - -Z --use-reference-allele - This flag includes the reference allele in the analysis as - if it is another sample from the same population. - --reference-quality MQ,BQ - Assign mapping quality of MQ to the reference allele at each - site and base quality of BQ. default: 100,60 - -Allele scope:: - - -I --no-snps Ignore SNP alleles. - -i --no-indels Ignore insertion and deletion alleles. - -X --no-mnps Ignore multi-nuceotide polymorphisms, MNPs. - -u --no-complex Ignore complex events (composites of other classes). - -n --use-best-n-alleles N - Evaluate only the best N SNP alleles, ranked by sum of - supporting quality scores. (Set to 0 to use all; default: all) - -E --max-complex-gap N - --haplotype-length N - Allow haplotype calls with contiguous embedded matches of up - to this length. (default: 3) - --min-repeat-size N - When assembling observations across repeats, require the total repeat - length at least this many bp. (default: 5) - --min-repeat-entropy N - To detect interrupted repeats, build across sequence until it has - entropy > N bits per bp. (default: 0, off) - --no-partial-observations - Exclude observations which do not fully span the dynamically-determined - detection window. (default, use all observations, dividing partial - support across matching haplotypes when generating haplotypes.) - -Indel realignment:: - - -O --dont-left-align-indels - Turn off left-alignment of indels, which is enabled by default. - -Input filters:: +Galaxy allows five levels of control over FreeBayes options, provided by the **Choose parameter selection level** menu option. These are: - -4 --use-duplicate-reads - Include duplicate-marked alignments in the analysis. - default: exclude duplicates marked as such in alignments - -m --min-mapping-quality Q - Exclude alignments from analysis if they have a mapping - quality less than Q. default: 1 - -q --min-base-quality Q - Exclude alleles from analysis if their supporting base - quality is less than Q. default: 0 - -R --min-supporting-allele-qsum Q - Consider any allele in which the sum of qualities of supporting - observations is at least Q. default: 0 - -Y --min-supporting-mapping-qsum Q - Consider any allele in which and the sum of mapping qualities of - supporting reads is at least Q. default: 0 - -Q --mismatch-base-quality-threshold Q - Count mismatches toward --read-mismatch-limit if the base - quality of the mismatch is >= Q. default: 10 - -U --read-mismatch-limit N - Exclude reads with more than N mismatches where each mismatch - has base quality >= mismatch-base-quality-threshold. - default: ~unbounded - -z --read-max-mismatch-fraction N - Exclude reads with more than N [0,1] fraction of mismatches where - each mismatch has base quality >= mismatch-base-quality-threshold - default: 1.0 - -$ --read-snp-limit N - Exclude reads with more than N base mismatches, ignoring gaps - with quality >= mismatch-base-quality-threshold. - default: ~unbounded - -e --read-indel-limit N - Exclude reads with more than N separate gaps. - default: ~unbounded - -0 --standard-filters Use stringent input base and mapping quality filters - Equivalent to -m 30 -q 20 -R 0 -S 0 - -F --min-alternate-fraction N - Require at least this fraction of observations supporting - an alternate allele within a single individual in the - in order to evaluate the position. default: 0.2 - -C --min-alternate-count N - Require at least this count of observations supporting - an alternate allele within a single individual in order - to evaluate the position. default: 2 - -3 --min-alternate-qsum N - Require at least this sum of quality of observations supporting - an alternate allele within a single individual in order - to evaluate the position. default: 0 - -G --min-alternate-total N - Require at least this count of observations supporting - an alternate allele within the total population in order - to use the allele in analysis. default: 1 - -! --min-coverage N - Require at least this coverage to process a site. default: 0 - -Population priors:: - - -k --no-population-priors - Equivalent to --pooled-discrete --hwe-priors-off and removal of - Ewens Sampling Formula component of priors. - -Mappability priors:: - - -w --hwe-priors-off - Disable estimation of the probability of the combination - arising under HWE given the allele frequency as estimated - by observation frequency. - -V --binomial-obs-priors-off - Disable incorporation of prior expectations about observations. - Uses read placement probability, strand balance probability, - and read position (5'-3') probability. - -a --allele-balance-priors-off - Disable use of aggregate probability of observation balance between alleles - as a component of the priors. - -Genotype likelihoods:: - - --observation-bias FILE - Read length-dependent allele observation biases from FILE. - The format is [length] [alignment efficiency relative to reference] - where the efficiency is 1 if there is no relative observation bias. - --base-quality-cap Q - Limit estimated observation quality by capping base quality at Q. - --experimental-gls - Generate genotype likelihoods using 'effective base depth' metric - qual = 1-BaseQual * 1-MapQual. Incorporate partial observations. - This is the default when contamination estimates are provided. - Optimized for diploid samples. - --prob-contamination F - An estimate of contamination to use for all samples. default: 10e-9 - --contamination-estimates FILE - A file containing per-sample estimates of contamination, such as - those generated by VerifyBamID. The format should be: - sample p(read=R|genotype=AR) p(read=A|genotype=AA) - Sample '*' can be used to set default contamination estimates. - -Algorithmic features:: - - --report-genotype-likelihood-max - Report genotypes using the maximum-likelihood estimate provided - from genotype likelihoods. - -B --genotyping-max-iterations N - Iterate no more than N times during genotyping step. default: 1000. - --genotyping-max-banddepth N - Integrate no deeper than the Nth best genotype by likelihood when - genotyping. default: 6. - -W --posterior-integration-limits N,M - Integrate all genotype combinations in our posterior space - which include no more than N samples with their Mth best - data likelihood. default: 1,3. - -N --exclude-unobserved-genotypes - Skip sample genotypings for which the sample has no supporting reads. - -S --genotype-variant-threshold N - Limit posterior integration to samples where the second-best - genotype likelihood is no more than log(N) from the highest - genotype likelihood for the sample. default: ~unbounded - -j --use-mapping-quality - Use mapping quality of alleles when calculating data likelihoods. - -H --harmonic-indel-quality - Use a weighted sum of base qualities around an indel, scaled by the - distance from the indel. By default use a minimum BQ in flanking sequence. - -D --read-dependence-factor N - Incorporate non-independence of reads by scaling successive - observations by this factor during data likelihood - calculations. default: 0.9 - -= --genotype-qualities - Calculate the marginal probability of genotypes and report as GQ in - each sample field in the VCF output. - + 1. *Simple diploid calling*: The simplest possible FreeBayes application. Equivalent to using FreeBayes with only a BAM input and no other parameter options. + 2. *Simple diploid calling with filtering and coverage*: Same as #1 plus two additional options: -0 (standard filters: --min-mapping-quality 30 --min-base-quality 20 --min-supporting-allele-qsum 0 --genotype-variant-threshold 0) and --min-coverage. + 3. *Frequency-based pooled calling*: This is equivalent to using FreeBayes with the following options: --haplotype-length 0 --min-alternate-count 1 --min-alternate-fraction 0 --pooled-continuous --report-monomorphic. This is the best choice for calling variants in mixtures such as viral, bacterial, or organellar genomes. + 4. *Frequency-based pooled calling with filtering and coverage*: Same as #3 but adds -0 and --min-coverage like in #2. + 5. *Complete list of all options*: Gives you full control by exposing all FreeBayes options as Galaxy parameters. ------ **Acknowledgments** The initial version of the wrapper was produced by Dan Blankenberg and upgraded by Anton Nekrutenko. -TNG was developed by Bjoern Gruening +TNG was developed by Bjoern Gruening. diff -r 9f164587a92f -r 977a5301b66d leftalign.xml --- a/leftalign.xml Tue Jun 06 11:44:38 2017 -0400 +++ b/leftalign.xml Tue Jun 06 18:41:18 2017 -0400 @@ -1,17 +1,11 @@ - + indels in BAM datasets macros.xml - - freebayes - samtools - - - - - + 1.1.0 - + + + freebayes + samtools + + + @@ -20,24 +26,23 @@ - - + + - + - + - + diff -r 9f164587a92f -r 977a5301b66d test-data/left-align-output.bam Binary file test-data/left-align-output.bam has changed