view STACKS_population.xml @ 3:0e0ff9e9c761 default tip

fix inputs clean name
author cmonjeau
date Mon, 28 Sep 2015 13:21:35 +0000
parents d6ba40f6c824
children
line wrap: on
line source

<tool id="STACKSpopulation" name="STACKS : populations"  force_history_refresh="True">
  <description>Run the STACKS populations program</description>


<requirements>
  <!-- Declares the "stacks" package, version 1.18, as this tool's dependency. -->
  <requirement type="package" version="1.18">stacks</requirement>
</requirements>

<!--
  Cheetah template that builds the argument list for the STACKS_population.py
  wrapper. Only Cheetah "##" comments are used inside the template so that the
  command text is never split by XML comment nodes. The body is wrapped in
  CDATA so that a future "<" or "&" in the template cannot break the XML.
-->
<command interpreter="python"><![CDATA[

STACKS_population.py
-P $STACKS_archive
-b $batchid
-M $popmap

## kernel smoothing: the window size is only meaningful when smoothing is on
#if $options_kernel.kernel
  -k
  --window_size $options_kernel.window
#end if

## genomic output: the enzyme and genomic parameters only exist in the "Yes" branch
#if str( $options_enzyme.options_enzyme_selector ) == "1"
  -e $options_enzyme.enzyme
  --genomic $options_enzyme.genomic
#end if

## advanced options
--advanced_options_activate $advanced_options_activate
#if $advanced_options_activate
  -r $advanced_options.minperc
  -p $advanced_options.minpop
  -m $advanced_options.mindepth
  -a $advanced_options.minminor
  ## pcutoff is defined for every correction type except no_corr
  #if str( $advanced_options.correction_select.correction ) != "no_corr"
    -f $advanced_options.correction_select.correction
    --p_value_cutoff $advanced_options.correction_select.pcutoff
  #end if
  #if str( $advanced_options.blacklistselect.advanced_blackoptions_selector ) == "advanced"
    -B $advanced_options.blacklistselect.blacklist
  #end if
  #if str( $advanced_options.whitelistselect.advanced_whiteoptions_selector ) == "advanced"
    -W $advanced_options.whitelistselect.whitelist
  #end if
  #if str( $advanced_options.bootstrapresampling.advanced_bootoptions_selector ) == "advanced"
    --bootstrap $advanced_options.bootstrapresampling.bootstrap
    --bootstrap_reps $advanced_options.bootstrapresampling.bootstrapreps
  #end if
#end if

## output files that are always produced
--ss $sumstatssum
--s $sumstats
--fst_output $outfst

## optional output formats, one block per checkbox of the "Output options" section
#if $options_output.vcf
  --vcf
  --ov $outvcf
#end if
## phylip: the alignment (--op) and its log (--ol) are both tied to the phylip
## checkbox; --phylip_var is a modifier of that output and is driven by its own
## checkbox instead of being forced on whenever phylip is selected
#if $options_output.phylip
  --phylip
  --op $outphylip
  --ol $outphyliplog
  #if $options_output.phylip_var
    --phylip_var
  #end if
#end if
#if $options_output.fasta
  --fasta
  --of $outfasta
#end if
#if $options_output.structure
  --structure
  --os $outstructure
#end if
#if $options_output.plink
  --plink
  --oe $outplinkped
  --om=$outplinkmap
#end if
#if $options_output.phase
  --phase
  --phase_output $outphase
#end if
#if $options_output.beagle
  --beagle
  --unphased_output $outbeagle
#end if
## NOTE(review): passed unconditionally although the outmarkers dataset is
## filtered on the beagle checkbox; confirm the wrapper tolerates a missing dataset
--markers_output $outmarkers
#if $options_output.genepop
  --genepop
  --og=$outgenepop
#end if
#if $options_output.write_single_snp
  --write_single_snp
#end if
--logfile $output

]]></command>

<!--
  User-facing parameters. Names are referenced by the command template and by
  the output filters, so they must not be renamed.
  Boolean parameters rely on checked="false" for their default state.
-->
<inputs>
    <param name="STACKS_archive" format="zip,tar.gz" type="data" label="Archive from STACKS pipeline regrouping all outputs" />
    <param name="batchid" type="integer" value="1" label="Batch ID" help="Batch ID to examine when exporting from the catalog" />
    <param name="popmap" type="data" format="tabular,txt" label="Specify a population map" help="specify a population map" />

    <!-- One checkbox per optional export format; each one gates an output dataset. -->
    <section name="options_output" title="Output options" expanded="False">
        <param name="vcf" type="boolean" checked="false" label="output results in Variant Call Format (VCF)" />
        <param name="genepop" type="boolean" checked="false" label="output results in GenePop Format" />
        <param name="structure" type="boolean" checked="false" label="output results in Structure Format" />
        <param name="fasta" type="boolean" checked="false" label="output full sequence for each allele, from each sample locus in FASTA format" />
        <param name="phase" type="boolean" checked="false" label="output genotypes in PHASE/fastPHASE format" />
        <param name="beagle" type="boolean" checked="false" label="output genotypes in Beagle format" />
        <param name="plink" type="boolean" checked="false" label="output genotypes in PLINK format" />
        <param name="phylip" type="boolean" checked="false" label="output nucleotides that are fixed-within, and variant among populations in Phylip format for phylogenetic tree construction" />
        <param name="phylip_var" type="boolean" checked="false" label="include variable sites in the phylip output" />
        <param name="write_single_snp" type="boolean" checked="false" label="write only the first SNP per locus in Genepop and Structure outputs" />
    </section>

    <section name="options_kernel" title="Kernel options" expanded="False">
        <param name="kernel" type="boolean" checked="false" label="enable kernel-smoothed FIS, π, and FST calculations" />
        <param name="window" type="integer" value="150" label="window size" help="distance over which to average values (sigma, default 150Kb)" />
    </section>

    <!-- The enzyme and genomic parameters only exist when the selector is "1" (Yes). -->
    <conditional name="options_enzyme">
        <param name="options_enzyme_selector" type="select" label="Did you want to use the genomic output option?">
            <option value="1">Yes</option>
            <option value="2" selected="true">No</option>
        </param>
        <when value="1">
            <param name="enzyme" type="select" label="provide the restriction enzyme used" help="required if generating genomic output">
                <option value="apeKI">apeKI</option>
                <option value="bamHI">bamHI</option>
                <option value="claI">claI</option>
                <option value="dpnII">dpnII</option>
                <option value="eaeI">eaeI</option>
                <option value="ecoRI">ecoRI</option>
                <option value="ecoT22I">ecoT22I</option>
                <option value="hindIII">hindIII</option>
                <option value="mluCI">mluCI</option>
                <option value="mseI">mseI</option>
                <option value="mspI">mspI</option>
                <option value="ndeI">ndeI</option>
                <option value="nlaIII">nlaIII</option>
                <option value="notI">notI</option>
                <option value="nsiI">nsiI</option>
                <option value="pstI">pstI</option>
                <option value="sau3AI">sau3AI</option>
                <option value="sbfI">sbfI</option>
                <option value="sexAI">sexAI</option>
                <option value="sgrAI">sgrAI</option>
                <option value="sphI">sphI</option>
                <option value="taqI">taqI</option>
                <option value="xbaI">xbaI</option>
            </param>
            <param name="genomic" type="boolean" checked="false" label="output each nucleotide position (fixed or polymorphic) in all population members to a file" />
        </when>
        <when value="2">
        </when>
    </conditional>

    <!-- The values of the section below are only sent to the wrapper when this box is ticked. -->
    <param name="advanced_options_activate" type="boolean" label="Activate advanced options" help="advanced options are defined below" />
    <section name="advanced_options" title="Advanced options">
        <conditional name="whitelistselect">
            <param name="advanced_whiteoptions_selector" type="select" label="whitelist advanced options">
                <option value="default" selected="true">Default</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="default"></when>
            <when value="advanced">
                <param name="whitelist" format="txt, tabular" type="data" label="specify a file containing Whitelisted markers to include in the export" />
            </when>
        </conditional>
        <conditional name="blacklistselect">
            <param name="advanced_blackoptions_selector" type="select" label="blacklist advanced options">
                <option value="default" selected="true">Default</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="default"></when>
            <when value="advanced">
                <param name="blacklist" format="txt, tabular" type="data" label="specify a file containing Blacklisted markers to be excluded from the export" />
            </when>
        </conditional>
        <param name="minperc" type="float" value="0.5" min="0" max="1" label="min percentage of individuals by population" help="minimum percentage of individuals in a population required to process a locus for that population" />
        <param name="minpop" type="integer" value="2" label="min number of populations" help="minimum number of populations a locus must be present in to process a locus" />
        <param name="mindepth" type="integer" value="1" label="min stack depth" help="specify a minimum stack depth required for individuals at a locus" />
        <!-- min/max enforce the range that the help text already documents -->
        <param name="minminor" type="float" value="0.25" min="0" max="0.5" label="min minor allele frequency" help="specify a minimum minor allele frequency required before calculating Fst at a locus (between 0 and 0.5)" />
        <!-- pcutoff is repeated in every branch except no_corr, the only case in which the command template does not read it. -->
        <conditional name="correction_select">
            <param name="correction" type="select" label="Correction type" help="specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'">
                <option value="no_corr">No correction</option>
                <option value="p_value">p_value</option>
                <option value="bonferroni_win">bonferroni_win</option>
                <option value="bonferroni_gen">bonferroni_gen</option>
            </param>
            <when value="no_corr"></when>
            <when value="p_value">
                <param name="pcutoff" type="float" value="0.05" label="p-value" help="required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction" />
            </when>
            <when value="bonferroni_win">
                <param name="pcutoff" type="float" value="0.05" label="p-value" help="required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction" />
            </when>
            <when value="bonferroni_gen">
                <param name="pcutoff" type="float" value="0.05" label="p-value" help="required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction" />
            </when>
        </conditional>
        <conditional name="bootstrapresampling">
            <param name="advanced_bootoptions_selector" type="select" label="bootstrap resampling advanced options">
                <option value="default" selected="true">Default</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="default"></when>
            <when value="advanced">
                <param name="bootstrap" type="select" label="Bootstrap resampling" help="enable bootstrap resampling for population statistics (reference genome required)">
                    <option value="exact">exact</option>
                    <option value="approx">approx</option>
                </param>
                <param name="bootstrapreps" type="integer" value="100" label="number of resampling" help="number of bootstrap resamplings to calculate" />
            </when>
        </conditional>
    </section>
</inputs>
<!--
  Datasets created by the tool. Each optional dataset is filtered on the
  checkbox of the same format in the "options_output" section.
  The former second filter on options_output['options_output_selector'] was
  removed: no parameter of that name exists in this tool, so the expression
  could only fail and never influenced whether a dataset was created.
-->
<outputs>
    <!-- always produced -->
    <data format="txt" name="output" label="result.log with ${tool.name} on ${on_string}" />
    <data format="txt" name="additional" label="additional file with ${tool.name}" hidden="true">
        <discover_datasets pattern="__designation_and_ext__" directory="galaxy_outputs" visible="true" />
    </data>

    <data format="tabular" name="sumstatssum" label="sumstats_summary.tsv with ${tool.name} on ${on_string}" />
    <data format="tabular" name="sumstats" label="sumstats.tsv with ${tool.name} on ${on_string}" />
    <data format="zip" name="outfst" label="fst.zip with ${tool.name} on ${on_string}" />

    <!-- optional export formats -->
    <data format="vcf" name="outvcf" label="vcf file with ${tool.name} on ${on_string}">
        <filter>options_output['vcf']</filter>
    </data>
    <data format="phylip" name="outphylip" label="phylip file with ${tool.name} on ${on_string}">
        <filter>options_output['phylip']</filter>
    </data>
    <data format="txt" name="outphyliplog" label="phylip.log file with ${tool.name} on ${on_string}">
        <filter>options_output['phylip']</filter>
    </data>
    <!-- NOTE(review): the command template never references outunphasedlog; confirm how the wrapper fills this dataset. -->
    <data format="txt" name="outunphasedlog" label="unphased.log file with ${tool.name} on ${on_string}">
        <filter>options_output['beagle']</filter>
    </data>
    <data format="fasta" name="outfasta" label="fasta file with ${tool.name} on ${on_string}">
        <filter>options_output['fasta']</filter>
    </data>
    <data format="tabular" name="outstructure" label="structure file with ${tool.name} on ${on_string}">
        <filter>options_output['structure']</filter>
    </data>
    <!-- label corrected from "plink.bed": this dataset is the PLINK ped file paired with the map file below -->
    <data format="txt" name="outplinkped" label="plink.ped file with ${tool.name} on ${on_string}">
        <filter>options_output['plink']</filter>
    </data>
    <data format="txt" name="outplinkmap" label="plink.map file with ${tool.name} on ${on_string}">
        <filter>options_output['plink']</filter>
    </data>
    <data format="txt" name="outgenepop" label="genepop file with ${tool.name} on ${on_string}">
        <filter>options_output['genepop']</filter>
    </data>
    <data format="zip" name="outphase" label="phased.zip PHASE/fastPHASE genotype files with ${tool.name} on ${on_string}">
        <filter>options_output['phase']</filter>
    </data>
    <data format="zip" name="outbeagle" label="unphase.zip Beagle genotype files with ${tool.name} on ${on_string}">
        <filter>options_output['beagle']</filter>
    </data>
    <data format="zip" name="outmarkers" label="markers.zip Genotype files with ${tool.name} on ${on_string}">
        <filter>options_output['beagle']</filter>
    </data>
</outputs>

<stdio>
    <!-- Any non-zero exit status is a failure; matching only "1" let other error codes be reported as success. -->
    <exit_code range="1:" level="fatal" description="Error in Stacks population execution" />
</stdio>

<help>

.. class:: infomark

**What it does**

This program will be executed in place of the genotypes program when a population is being processed through the pipeline. A map specifying which individuals belong to which population is submitted to the program and the program will then calculate population genetics statistics, expected/observed heterozygosity, π, and FIS at each nucleotide position. The populations program will compare all populations pairwise to compute FST. If a set of data is reference aligned, then a kernel-smoothed FST will also be calculated.

--------

**Created by:**

Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko

--------

**Example:**

Input files:

FASTQ, FASTA, zip, tar.gz

- Population map::

    indv_01    1
    indv_02    1
    indv_03    1
    indv_04    2
    indv_05    2
    indv_06    2


Output files:

- XXX.tags.tsv file::

    Column    Name                     Description
    1         Sql ID                   This field will always be "0", however the MySQL database will assign an ID when it is loaded.
    2         Sample ID                Each sample passed through Stacks gets a unique id for that sample.
    3         Stack ID                 Each stack formed gets an ID.
    4         Chromosome               If aligned to a reference genome using pstacks, otherwise it is blank.
    5         Basepair                 If aligned to ref genome using pstacks.
    6         Strand                   If aligned to ref genome using pstacks.
    7         Sequence Type            Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms.
    8         Sequence ID              The individual sequence read that was merged into this stack.
    9         Sequence                 The raw sequencing read.
    10        Deleveraged Flag         If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack.
    11        Blacklisted Flag         If "1", this stack was still confounded despite processing by the deleveraging algorithm.
    12        Lumberjackstack Flag     If "1", this stack was set aside due to having an extreme depth of coverage.

Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence.


- XXX.snps.tsv file::

    Column    Name                     Description
    1         Sql ID                   This field will always be "0", however the MySQL database will assign an ID when it is loaded.
    2         Sample ID                
    3         Stack ID                 
    4         SNP Column               
    5         Likelihood ratio         From the SNP-calling model.
    6         Rank_1                   Majority nucleotide.
    7         Rank_2                   Alternative nucleotide.

Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one.


- XXX.alleles.tsv file::

    Column    Name                     Description
    1         Sql ID                   This field will always be "0", however the MySQL database will assign an ID when it is loaded.
    2         Sample ID                
    3         Stack ID                 
    4         Haplotype                The haplotype, as constructed from the called SNPs at each locus.
    5         Percent                  Percentage of reads that have this haplotype
    6         Count                    Raw number of reads that have this haplotype


- XXX.matches.tsv file::

    Column    Name                     Description
    1         Sql ID                   This field will always be "0", however the MySQL database will assign an ID when it is loaded.
    2         Batch ID                
    3         Catalog ID                 
    4         Sample ID               
    5         Stack ID         
    6         Haplotype 
    7         Stack Depth	

Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample.


- batch_X.sumstats.tsv Summary Statistics Output::

	Batch ID			The batch identifier for this data set.
	Locus ID			Catalog locus identifier.
	Chromosome			If aligned to a reference genome.
	Basepair			If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand).
	Column				The nucleotide site within the catalog locus.
	Population ID			The ID supplied to the populations program, as written in the population map file.
	P Nucleotide			The most frequent allele at this position in this population.
	Q Nucleotide			The alternative allele.
	Number of Individuals 		Number of individuals sampled in this population at this site.
	P 				Frequency of most frequent allele.
	Observed Heterozygosity		The proportion of individuals that are heterozygotes in this population.
	Observed Homozygosity 		The proportion of individuals that are homozygotes in this population.
	Expected Heterozygosity		Heterozygosity expected under Hardy-Weinberg equilibrium.
	Expected Homozygosity 		Homozygosity expected under Hardy-Weinberg equilibrium.
	pi				An estimate of nucleotide diversity.
	Smoothed pi			A weighted average of pi depending on the surrounding 3 sigma of sequence in both directions.
	Smoothed pi P-value 		If bootstrap resampling is enabled, a p-value ranking the significance of pi within this population.
	FIS 				The inbreeding coefficient of an individual (I) relative to the subpopulation (S).
	Smoothed FIS 			A weighted average of FIS depending on the surrounding 3 sigma of sequence in both directions.
	Smoothed FIS P-value		If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population.
	Private allele 			True (1) or false (0), depending on whether this allele occurs only in this population.

- batch_X.fst_Y-Z.tsv Pairwise FST Output::

	Batch ID 			The batch identifier for this data set.
	Locus ID 			Catalog locus identifier.
	Population ID 1 		The ID supplied to the populations program, as written in the population map file.
	Population ID 2 		The ID supplied to the populations program, as written in the population map file.
	Chromosome 			If aligned to a reference genome.
	Basepair 			If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand).
	Column 				The nucleotide site within the catalog locus.
	Overall pi 			An estimate of nucleotide diversity across the two populations.
	FST 				A measure of population differentiation.
	FET p-value 			P-value describing if the FST measure is statistically significant according to Fisher's Exact Test.
	Odds Ratio 			Fisher's Exact Test odds ratio
	CI High 			Fisher's Exact Test confidence interval.
	CI Low 				Fisher's Exact Test confidence interval.
	LOD Score 			Logarithm of odds score.
	Expected Heterozygosity		Heterozygosity expected under Hardy-Weinberg equilibrium.
	Expected Homozygosity 		Homozygosity expected under Hardy-Weinberg equilibrium.
	Corrected FST 			FST with either the FET p-value, or a window-size or genome size Bonferroni correction.
	Smoothed FST 			A weighted average of FST depending on the surrounding 3 sigma of sequence in both directions.
	Smoothed FST P-value 		If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations.


Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki &lt;https://www.e-biogenouest.org/wiki/ManArchiveGalaxy&gt;`_ .

--------

**Output type:**

- Output type details::

	No compression 			All files will be added in the current history.
	Compressed by categories	Files will be compressed by categories (snps, allele, matches and tags) into 4 zip archives. These archives and batch files will be added in the current history.
	Compressed all outputs 		All files will be compressed into a single zip archive. Batch files will be added in the current history with the archive.


--------

**Project links:**

`STACKS website &lt;http://creskolab.uoregon.edu/stacks/&gt;`_ .

`STACKS manual &lt;http://creskolab.uoregon.edu/stacks/stacks_manual.pdf&gt;`_ .

`STACKS google group &lt;https://groups.google.com/forum/#!forum/stacks-users&gt;`_ .

--------

**References:**

-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013.

-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013.

-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011.

-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799-808, 2011.

-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011.

-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Sciences, 107(37):16196-200, 2010.

--------

**Integrated by:**

Yvan Le Bras and Cyril Monjeaud 

GenOuest Bio-informatics Core Facility

UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France)

support@genouest.org

If you use this tool in Galaxy, please cite :

`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. &lt;https://www.e-biogenouest.org/resources/128&gt;`_


</help>
<citations>
    <!-- Stacks publications, listed in the same order as the References list of the help text. -->
    <citation type="doi">10.1111/mec.12354</citation>
    <citation type="doi">10.1111/mec.12330</citation>
    <citation type="doi">10.1534/g3.111.000240</citation>
    <citation type="doi">10.1534/genetics.111.127324</citation>
    <citation type="doi">10.1111/j.1755-0998.2010.02967.x</citation>
    <citation type="doi">10.1073/pnas.1006538107</citation>

    <!-- Publication of the integrators of this Galaxy wrapper (no DOI given, hence BibTeX). -->
    <citation type="bibtex">@INPROCEEDINGS{JOBIM2013,
    author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.},
    title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France},
    booktitle = {JOBIM 2013 Proceedings},
    year = {2013},
    url = {https://www.e-biogenouest.org/resources/128},
    pages = {97-106}
    }</citation>
</citations>
</tool>