#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Galaxy tool wrapper around the STACKS denovo_map.pl pipeline.

Reads the Galaxy-generated options, prepares the input read files
(decompression, extension normalisation, symlinks), runs denovo_map.pl
and dispatches the outputs back to Galaxy, optionally zipped.
"""

import sys
import re
import os
import tempfile
import shutil
import subprocess
import glob
import argparse
from os.path import basename
import zipfile
import tarfile
import gzip
from galaxy.datatypes.checkers import *
from stacks import *


def __main__():

    # arguments recuperation
    parser = argparse.ArgumentParser()
    parser.add_argument('-p')
    parser.add_argument('-b')
    parser.add_argument('-r')
    parser.add_argument('-s')
    parser.add_argument('-O')
    parser.add_argument('-m')
    parser.add_argument('-P')
    parser.add_argument('-M')
    parser.add_argument('-N')
    parser.add_argument('-n')
    parser.add_argument('-t')
    parser.add_argument('-H')
    parser.add_argument('--bound_low')
    parser.add_argument('--bound_high')
    parser.add_argument('--alpha')
    parser.add_argument('--logfile')
    parser.add_argument('--compress_output')
    parser.add_argument('--catalogsnps')
    parser.add_argument('--catalogalleles')
    parser.add_argument('--catalogtags')

    # additional outputs
    parser.add_argument('--total_output')
    parser.add_argument('--tags_output')
    parser.add_argument('--snps_output')
    parser.add_argument('--alleles_output')
    parser.add_argument('--matches_output')

    options = parser.parse_args()

    # create working directories
    os.mkdir('inputs')
    os.mkdir('job_outputs')
    os.mkdir('galaxy_outputs')

    cmd_line = ['denovo_map.pl']

    # genetic map mode: parent sequence files
    if options.p:

        # parse the Galaxy config file into {name: path}
        tab_parent_files = galaxy_config_to_tabfiles_for_STACKS(options.p)

        # decompress any archived entries and update the tab accordingly
        extract_compress_files_from_tabfiles(tab_parent_files, 'inputs')

        # check file extensions (denovo_map.pl needs .fq or .fasta files)
        check_fastq_extension_and_add(tab_parent_files, 'inputs')

        # create symlinks into the input dir
        create_symlinks_from_tabfiles(tab_parent_files, 'inputs')

        # collect the input files, skipping directories created by a
        # decompression step
        fastq_files = [f for f in glob.glob('inputs/*') if os.path.isfile(f)]
        fastq_files.sort()

        if options.b == 'true':
            # paired-end: files are sorted so mates are adjacent; only the
            # first file of each pair is passed with -p
            for n in range(0, len(fastq_files), 2):
                cmd_line.extend(['-p', fastq_files[n]])
        else:
            for myfastqfile in fastq_files:
                cmd_line.extend(['-p', myfastqfile])

    # genetic map mode with progeny files
    if options.r:

        # parse config file
        tab_progeny_files = galaxy_config_to_tabfiles_for_STACKS(options.r)

        # decompress any archived entries and update the tab accordingly
        extract_compress_files_from_tabfiles(tab_progeny_files, 'inputs')

        # check file extensions (denovo_map.pl needs .fq or .fasta files)
        check_fastq_extension_and_add(tab_progeny_files, 'inputs')

        # create symlinks into the input dir
        create_symlinks_from_tabfiles(tab_progeny_files, 'inputs')

        for key in tab_progeny_files:
            # skip directories created by a decompression step
            if os.path.isfile('inputs/' + key):
                cmd_line.extend(['-r', 'inputs/' + key])

    # population mode: individual sample files
    if options.s:

        tab_individual_files = galaxy_config_to_tabfiles_for_STACKS(options.s)

        # decompress any archived entries and update the tab accordingly
        extract_compress_files_from_tabfiles(tab_individual_files, 'inputs')

        # check file extensions (denovo_map.pl needs .fq or .fasta files)
        check_fastq_extension_and_add(tab_individual_files, 'inputs')

        # create symlinks into the input dir
        create_symlinks_from_tabfiles(tab_individual_files, 'inputs')

        for key in tab_individual_files:
            # skip directories created by a decompression step
            if os.path.isfile('inputs/' + key):
                cmd_line.extend(['-s', 'inputs/' + key])

    # fixed denovo_map.pl options: SQL output disabled, batch 1, 4 threads
    cmd_line.extend([
        '-S',
        '-b',
        '1',
        '-T',
        '4',
        '-o',
        'job_outputs/',
    ])

    # optional parameters; '-1' is the Galaxy sentinel for "not set"
    if options.O:
        cmd_line.extend(['-O', options.O])

    if options.m and options.m != '-1':
        cmd_line.extend(['-m', options.m])

    if options.P and options.P != '-1':
        cmd_line.extend(['-P', options.P])

    if options.M and options.M != '-1':
        cmd_line.extend(['-M', options.M])

    if options.N and options.N != '-1':
        cmd_line.extend(['-N', options.N])

    if options.n and options.n != '-1':
        cmd_line.extend(['-n', options.n])

    if options.t and options.t == 'true':
        cmd_line.append('-t')

    if options.H and options.H == 'true':
        cmd_line.append('-H')

    # SNP model: bounded model passes both bounds, both models pass alpha
    if options.bound_low:
        cmd_line.extend(['--bound_low', options.bound_low])
        cmd_line.extend(['--bound_high', options.bound_high])

    if options.alpha:
        cmd_line.extend(['--alpha', options.alpha])

    # launch the command line (parenthesized print: valid on py2 and py3)
    print('[CMD_LINE] : ' + ' '.join(cmd_line))

    subprocess.call(cmd_line)

    # denovo_map.log is the main Galaxy output; its absence means the
    # pipeline failed, so report and abort
    try:
        shutil.move('job_outputs/denovo_map.log', options.logfile)
    except Exception:
        sys.stderr.write('Error in denovo_map execution; Please read the additional output (stdout)\n')
        sys.exit(1)

    # go inside the outputs dir
    os.chdir('job_outputs')

    # copy the catalog files to their dedicated Galaxy outputs
    # (dots escaped: the original patterns let '.' match any character)
    for i in glob.glob('*'):
        if re.search(r'catalog\.snps\.tsv$', i):
            shutil.copy(i, options.catalogsnps)
        if re.search(r'catalog\.alleles\.tsv$', i):
            shutil.copy(i, options.catalogalleles)
        if re.search(r'catalog\.tags\.tsv$', i):
            shutil.copy(i, options.catalogtags)

    list_files = glob.glob('*')

    if options.compress_output == 'total':
        # single archive holding every output file
        mytotalzipfile = zipfile.ZipFile('total.zip.temp', 'w',
                                         allowZip64=True)

        for i in list_files:
            mytotalzipfile.write(os.path.basename(i))

        # close before moving, otherwise the archive lacks its central
        # directory and is unreadable
        mytotalzipfile.close()

        # return the unique archive
        shutil.move('total.zip.temp', options.total_output)
    elif options.compress_output == 'categories':

        # one archive per file category
        mytagszip = zipfile.ZipFile('tags.zip.temp', 'w', allowZip64=True)
        mysnpszip = zipfile.ZipFile('snps.zip.temp', 'w', allowZip64=True)
        myalleleszip = zipfile.ZipFile('alleles.zip.temp', 'w', allowZip64=True)
        mymatcheszip = zipfile.ZipFile('matches.zip.temp', 'w', allowZip64=True)

        for i in list_files:
            # batch files never go into the archives; they fall through to
            # the else branch and land in the Galaxy history
            if re.search(r'tags\.tsv$', i) and not re.search('batch', i):
                mytagszip.write(os.path.basename(i))
                os.remove(i)
            elif re.search(r'snps\.tsv$', i) and not re.search('batch', i):
                mysnpszip.write(os.path.basename(i))
                os.remove(i)
            elif re.search(r'alleles\.tsv$', i) and not re.search('batch', i):
                myalleleszip.write(os.path.basename(i))
                os.remove(i)
            elif re.search(r'matches\.tsv$', i) and not re.search('batch', i):
                mymatcheszip.write(os.path.basename(i))
                os.remove(i)
            else:
                shutil.move(os.path.basename(i), '../galaxy_outputs')

        # close every archive before moving it (see above)
        mytagszip.close()
        mysnpszip.close()
        myalleleszip.close()
        mymatcheszip.close()

        # return archives
        shutil.move('tags.zip.temp', options.tags_output)
        shutil.move('snps.zip.temp', options.snps_output)
        shutil.move('alleles.zip.temp', options.alleles_output)
        shutil.move('matches.zip.temp', options.matches_output)
    else:
        # no compression: everything goes to the Galaxy history
        for i in list_files:
            shutil.move(os.path.basename(i), '../galaxy_outputs')


if __name__ == '__main__':
    __main__()
$snp_options.select_model.boundlow +--bound_high $snp_options.select_model.boundhigh +--alpha $snp_options.select_model.alpha +#else +--alpha $snp_options.select_model.alpha +#end if +## outputs +--catalogsnps $catalogsnps +--catalogalleles $catalogalleles +--catalogtags $catalogtags +--logfile $output +--compress_output $output_compress +##additionnal outputs +--total_output $total_output +--tags_output $tags_output +--snps_output $snps_output +--alleles_output $alleles_output +--matches_output $matches_output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+ +
+ + + + + + + + + + + + + + +
+ + + + + + +
+ + + + + + + + + + + + output_compress == "total" + + + output_compress == "categories" + + + output_compress == "categories" + + + output_compress == "categories" + + + output_compress == "categories" + + + + + + + + +.. class:: infomark + +**What it does** + +This program will run each of the Stacks components: first, running ustacks on each of the samples specified, building loci and calling SNPs in each. Second, cstacks will be run to create a catalog of all loci that were marked as 'parents' or 'samples' on the command line, and finally, sstacks will be executed to match each sample against the catalog. A bit more detail on this process can be found in the FAQ. The denovo_map.pl program will also load the results of each stage of the analysis: individual loci, the catalog, and matches against the catalog into the database (although this can be disabled). After matching, the program will build a database index to speed up access (index_radtags.pl) and enable web-based filtering. + +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +FASTQ, FASTA, zip, tar.gz + +- Population map:: + + indv_01 1 + indv_02 1 + indv_03 1 + indv_04 2 + indv_05 2 + indv_06 2 + + +Output files: + +- XXX.tags.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID Each sample passed through Stacks gets a unique id for that sample. + 3 Stack ID Each stack formed gets an ID. + 4 Chromosome If aligned to a reference genome using pstacks, otherwise it is blank. + 5 Basepair If aligned to ref genome using pstacks. + 6 Strand If aligned to ref genome using pstacks. + 7 Sequence Type Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms. + 8 Sequence ID The individual sequence read that was merged into this stack. 
+ 9 Sequence The raw sequencing read. + 10 Deleveraged Flag If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack. + 11 Blacklisted Flag If "1", this stack was still confounded depsite processing by the deleveraging algorithm. + 12 Lumberja ckstack Flag If "1", this stack was set aside due to having an extreme depth of coverage. + +Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence. + + +- XXX.snps.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 SNP Column + 5 Likelihood ratio From the SNP-calling model. + 6 Rank_1 Majority nucleotide. + 7 Rank_2 Alternative nucleotide. + +Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one. + + +- XXX.alleles.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 Haplotype The haplotype, as constructed from the called SNPs at each locus. + 5 Percent Percentage of reads that have this haplotype + 6 Count Raw number of reads that have this haplotype + + +- XXX.matches.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Batch ID + 3 Catalog ID + 4 Sample ID + 5 Stack ID + 6 Haplotype + 7 Stack Depth + +Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. 
The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample. + + +- batch_X.sumstats.tsv Summary Statistics Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Population ID The ID supplied to the populations program, as written in the population map file. + P Nucleotide The most frequent allele at this position in this population. + Q Nucleotide The alternative allele. + Number of Individuals Number of individuals sampled in this population at this site. + P Frequency of most frequent allele. + Observed Heterozygosity The proportion of individuals that are heterozygotes in this population. + Observed Homozygosity The proportion of individuals that are homozygotes in this population. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + pi An estimate of nucleotide diversity. + Smoothed pi A weighted average of p depending on the surrounding 3s of sequence in both directions. + Smoothed pi P-value If bootstrap resampling is enabled, a p-value ranking the significance of p within this population. + FIS The inbreeding coefficient of an individual (I) relative to the subpopulation (S). + Smoothed FIS A weighted average of FIS depending on the surrounding 3s of sequence in both directions. + Smoothed FIS P-value If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population. 
+ Private allele True (1) or false (0), depending on if this allele is only occurs in this population. + +- batch_X.fst_Y-Z.tsv Pairwise FST Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Population ID 1 The ID supplied to the populations program, as written in the population map file. + Population ID 2 The ID supplied to the populations program, as written in the population map file. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Overall pi An estimate of nucleotide diversity across the two populations. + FST A measure of population differentiation. + FET p-value P-value describing if the FST measure is statistically significant according to Fisher's Exact Test. + Odds Ratio Fisher's Exact Test odds ratio + CI High Fisher's Exact Test confidence interval. + CI Low Fisher's Exact Test confidence interval. + LOD Score Logarithm of odds score. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + Corrected FST FST with either the FET p-value, or a window-size or genome size Bonferroni correction. + Smoothed FST A weighted average of FST depending on the surrounding 3s of sequence in both directions. + Smoothed FST P-value If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + +**Output type:** + +- Output type details:: + + No compression All files will be added in the current history. 
+ Compressed by categories Files will be compressed by categories (snps, allele, matches and tags) into 4 zip archives. These archives and batch files will be added in the current history. + Compressed all outputs All files will be compressed in an unique zip archive. Batch files will be added in the current history with the archive. + + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. 
+ +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + + + + 10.1111/mec.12354 + 10.1111/mec.12330 + 10.1534/g3.111.000240 + 10.1534/genetics.111.127324 + 10.1111/j.1755-0998.2010.02967.x + 10.1073/pnas.1006538107 + + @INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + } + +
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Galaxy tool wrapper around the STACKS genotypes program.

Unpacks a STACKS output archive, builds and runs the genotypes command
line, then returns the selected result files to Galaxy, optionally as a
single zip archive.
"""

import sys
import re
import os
import tempfile
import shutil
import subprocess
import glob
import argparse
from os.path import basename
import zipfile
import tarfile
import gzip
from galaxy.datatypes.checkers import *
from stacks import *


def _is_wanted_output(path):
    """Return True for files that must be returned to Galaxy.

    Keeps non-tsv batch files, numbered tsv files (*_N.tsv) and any file
    whose name contains 'genotypes'.  Parentheses make the original
    and/or precedence explicit: (batch AND not tsv) OR numbered-tsv OR
    genotypes.
    """
    name = os.path.basename(path)
    return ((re.search(r'^batch', name) and not re.search(r'\.tsv$', name))
            or re.search(r'.*_[0-9]*\.tsv$', name)
            or re.search(r'.*genotypes.*', name))


def __main__():

    # arguments recuperation
    parser = argparse.ArgumentParser()
    parser.add_argument('-P')
    parser.add_argument('-b')
    parser.add_argument('-c')
    parser.add_argument('-t')
    parser.add_argument('-o')
    parser.add_argument('-e')
    parser.add_argument('--active_advanced')
    parser.add_argument('-r')
    parser.add_argument('-m')
    parser.add_argument('-B')
    parser.add_argument('-W')
    parser.add_argument('--active_autocorrect')
    parser.add_argument('--min_hom_seqs')
    parser.add_argument('--min_het_seqs')
    parser.add_argument('--max_het_seqs')

    # multifile management
    parser.add_argument('--logfile')
    parser.add_argument('--compress_output')

    # additional outputs
    parser.add_argument('--total_output')

    options = parser.parse_args()

    # create the working dirs and run from inside job_outputs
    os.mkdir('job_outputs')
    os.mkdir('galaxy_outputs')

    os.chdir('job_outputs')

    # edit the command line
    cmd_line = ['genotypes']

    # STACKS archive: decompress it into the current working dir
    extract_compress_files(options.P, os.getcwd())

    # genotypes reads the unpacked STACKS files from the working dir
    cmd_line.extend(['-b', options.b, '-P', os.getcwd()])

    # optional genotypes parameters
    if options.e:
        cmd_line.extend(['-e', options.e])
    if options.c == 'true':
        cmd_line.append('-c')
    if options.t:
        cmd_line.extend(['-t', options.t])
    if options.o:
        cmd_line.extend(['-o', options.o])

    # advanced options, only when activated in the Galaxy form
    if options.active_advanced == 'true':
        cmd_line.extend(['-r', options.r])
        cmd_line.extend(['-m', options.m])
        if options.B:
            cmd_line.extend(['-B', options.B])
        if options.W:
            cmd_line.extend(['-W', options.W])

    # automated-correction options, only when activated in the Galaxy form
    if options.active_autocorrect == 'true':
        cmd_line.extend(['--min_hom_seqs', options.min_hom_seqs])
        cmd_line.extend(['--min_het_seqs', options.min_het_seqs])
        cmd_line.extend(['--max_het_seqs', options.max_het_seqs])

    # launch the command (parenthesized print: valid on py2 and py3)
    print('[CMD]:' + ' '.join(cmd_line))
    subprocess.call(cmd_line)

    # batch_1.haplotypes_1.tsv is the main output; its absence means the
    # run failed, so report and abort
    try:
        shutil.copy('batch_1.haplotypes_1.tsv', options.logfile)
    except Exception:
        sys.stderr.write('Error in genotypes execution; Please read the additional output (stdout)\n')
        sys.exit(1)

    list_files = glob.glob('*')

    if options.compress_output == 'total':
        # single archive with every selected output file
        mytotalzipfile = zipfile.ZipFile('total.zip.temp', 'w')

        for i in list_files:
            if _is_wanted_output(i):
                mytotalzipfile.write(i, os.path.basename(i))

        # close before moving, otherwise the archive lacks its central
        # directory and is unreadable
        mytotalzipfile.close()

        # return the unique archive
        shutil.move('total.zip.temp', options.total_output)

    # default: move the selected files to the Galaxy history uncompressed
    if options.compress_output == 'default':
        for i in list_files:
            if _is_wanted_output(i):
                shutil.move(i, '../galaxy_outputs')


if __name__ == '__main__':
    __main__()
$STACKS_archive +-b $batchid +-c $corrections +#if str( $options_output.options_output_selector ) == "1": +-t $options_output.map +-o $options_output.filetype +#end if +#if str( $options_enzyme.options_enzyme_selector ) == "1": +-e $options_enzyme.enzyme +#end if +--active_advanced $active_advanced +-r $advanced_options.minprogeny +-m $advanced_options.mindepth +#if str( $advanced_options.blacklistselect.advanced_blackoptions_selector) == "advanced" +-B $advanced_options.blacklistselect.blacklist +#end if +#if str( $advanced_options.whitelistselect.advanced_whiteoptions_selector) == "advanced" +-W $advanced_options.whitelistselect.whitelist +#end if +--active_autocorrect $active_autocorrect +--min_hom_seqs $options_autocorrect.hom +--min_het_seqs $options_autocorrect.het +--max_het_seqs $options_autocorrect.hetmax +--logfile $output +--compress_output $output_compress +##additionnal outputs +--total_output $total_output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + + + + + + + + + output_compress == "total" + + + + + + + + + +.. class:: infomark + +**What it does** + +This program exports a Stacks data set either as a set of observed haplotypes at each locus in the population, or with the haplotypes encoded into genotypes. The -r option allows only loci that exist in a certain number of population individuals to be exported. In a mapping context, raising or lowering this limit is an effective way to control the quality level of markers exported as genuine markers will be found in a large number of progeny. If exporting a set of observed haplotypes in a population, the "min stack depth" option can be used to restict exported loci to those that have a minimum depth of reads. + +By default, when executing the pipeline (either denovo_map or ref_map) the genotypes program will be executed last and will identify mappable markers in the population and export both a set of observed haplotypes and a set of generic genotypes with "min number of progeny" option = 1. + + +Making Corrections + +If enabled with the "make automated corrections to the data" option, the genotypes program will make automated corrections to the data. Since loci are matched up in the population, the script can correct false-negative heterozygote alleles since it knows the existence of alleles at a particular locus in the other individuals. For example, the program will identify loci with SNPs that didn’t have high enough coverage to be identified by the SNP caller. It will also check that homozygous tags have a minimum depth of coverage, since a low-coverage polymorphic locus may appear homozygous simply because the other allele wasn’t sequenced. 
+ + +Correction Thresholds + +The thresholds for automatic corrections can be modified by using the "automated corrections option" and changing the default values for the "min number of reads for homozygous genotype", "homozygote minor minimum allele frequency" and "heterozygote minor minimum allele frequency" parameters to genotypes. "min number of reads for homozygous genotype" is the minimum number of reads required to consider a stack homozygous (default of 5). The "homozygote minor minimum allele frequency" and "heterozygote minor minimum allele frequency" variables represent fractions. If the ratio of the depth of the the smaller allele to the bigger allele is greater than "heterozygote minor minimum allele frequency" (default of 1/10) a stack is called a het. If the ratio is less than homozygote minor minimum allele frequency (default of 1/20) a stack is called homozygous. If the ratio is in between the two values it is unknown and a genotype will not be assigned. + +Automated corrections made by the program are shown in the output file in capital letters. + +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +FASTQ, FASTA, zip, tar.gz + +Output files: + +- XXX.tags.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID Each sample passed through Stacks gets a unique id for that sample. + 3 Stack ID Each stack formed gets an ID. + 4 Chromosome If aligned to a reference genome using pstacks, otherwise it is blank. + 5 Basepair If aligned to ref genome using pstacks. + 6 Strand If aligned to ref genome using pstacks. + 7 Sequence Type Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms. + 8 Sequence ID The individual sequence read that was merged into this stack. 
+ 9 Sequence The raw sequencing read. + 10 Deleveraged Flag If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack. + 11 Blacklisted Flag If "1", this stack was still confounded depsite processing by the deleveraging algorithm. + 12 Lumberja ckstack Flag If "1", this stack was set aside due to having an extreme depth of coverage. + +Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence. + + +- XXX.snps.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 SNP Column + 5 Likelihood ratio From the SNP-calling model. + 6 Rank_1 Majority nucleotide. + 7 Rank_2 Alternative nucleotide. + +Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one. + + +- XXX.alleles.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 Haplotype The haplotype, as constructed from the called SNPs at each locus. + 5 Percent Percentage of reads that have this haplotype + 6 Count Raw number of reads that have this haplotype + + +- XXX.matches.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Batch ID + 3 Catalog ID + 4 Sample ID + 5 Stack ID + 6 Haplotype + 7 Stack Depth + +Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. 
The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample. + + +- batch_X.sumstats.tsv Summary Statistics Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Population ID The ID supplied to the populations program, as written in the population map file. + P Nucleotide The most frequent allele at this position in this population. + Q Nucleotide The alternative allele. + Number of Individuals Number of individuals sampled in this population at this site. + P Frequency of most frequent allele. + Observed Heterozygosity The proportion of individuals that are heterozygotes in this population. + Observed Homozygosity The proportion of individuals that are homozygotes in this population. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + pi An estimate of nucleotide diversity. + Smoothed pi A weighted average of p depending on the surrounding 3s of sequence in both directions. + Smoothed pi P-value If bootstrap resampling is enabled, a p-value ranking the significance of p within this population. + FIS The inbreeding coefficient of an individual (I) relative to the subpopulation (S). + Smoothed FIS A weighted average of FIS depending on the surrounding 3s of sequence in both directions. + Smoothed FIS P-value If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population. 
+ Private allele True (1) or false (0), depending on if this allele is only occurs in this population. + +- batch_X.fst_Y-Z.tsv Pairwise FST Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Population ID 1 The ID supplied to the populations program, as written in the population map file. + Population ID 2 The ID supplied to the populations program, as written in the population map file. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Overall pi An estimate of nucleotide diversity across the two populations. + FST A measure of population differentiation. + FET p-value P-value describing if the FST measure is statistically significant according to Fisher's Exact Test. + Odds Ratio Fisher's Exact Test odds ratio + CI High Fisher's Exact Test confidence interval. + CI Low Fisher's Exact Test confidence interval. + LOD Score Logarithm of odds score. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + Corrected FST FST with either the FET p-value, or a window-size or genome size Bonferroni correction. + Smoothed FST A weighted average of FST depending on the surrounding 3s of sequence in both directions. + Smoothed FST P-value If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + +**Output type:** + +- Output type details:: + + No compression All files will be added in the current history. 
+ Compressed by categories Files will be compressed by categories (snps, allele, matches and tags) into 4 zip archives. These archives and batch files will be added in the current history. + Compressed all outputs All files will be compressed in an unique zip archive. Batch files will be added in the current history with the archive. + + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. 
+ +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + + + + 10.1111/mec.12354 + 10.1111/mec.12330 + 10.1534/g3.111.000240 + 10.1534/genetics.111.127324 + 10.1111/j.1755-0998.2010.02967.x + 10.1073/pnas.1006538107 + + @INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + } + +
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Galaxy wrapper for the STACKS ``populations`` program.

Builds the ``populations`` command line from the tool's options, unpacks
the STACKS archive input, runs the program, then moves the result files
(and zip archives of the marker/phase/unphased/fst files) into the Galaxy
output locations.
"""

import sys
import re
import os
import tempfile
import shutil
import subprocess
import glob
import argparse
from os.path import basename
import zipfile
import tarfile
import gzip
from galaxy.datatypes.checkers import *
from stacks import *


def __main__():

    # arguments recuperation
    parser = argparse.ArgumentParser()
    parser.add_argument('-P')
    parser.add_argument('-M')
    parser.add_argument('-b')
    parser.add_argument('--vcf', action='store_true')
    parser.add_argument('--genepop', action='store_true')
    parser.add_argument('--structure', action='store_true')
    parser.add_argument('-e')
    parser.add_argument('--genomic', action='store_true')
    parser.add_argument('--fasta', action='store_true')
    parser.add_argument('--phase', action='store_true')
    parser.add_argument('--beagle', action='store_true')
    parser.add_argument('--plink', action='store_true')
    parser.add_argument('--phylip', action='store_true')
    parser.add_argument('--phylip_var', action='store_true')
    parser.add_argument('--write_single_snp', action='store_true')
    parser.add_argument('-k', action='store_true')

    # advanced options
    parser.add_argument('--advanced_options_activate')
    parser.add_argument('-B')
    parser.add_argument('-W')
    parser.add_argument('-r')
    parser.add_argument('-p')
    parser.add_argument('-m')
    parser.add_argument('-a')
    parser.add_argument('-f')
    parser.add_argument('--p_value_cutoff')
    parser.add_argument('--window_size')
    parser.add_argument('--bootstrap')
    parser.add_argument('--bootstrap_reps')

    # multifile management
    parser.add_argument('--logfile')

    # outputs
    parser.add_argument('--ss')
    parser.add_argument('--s')

    # optional outputs
    parser.add_argument('--ov')
    parser.add_argument('--op')
    parser.add_argument('--ol')
    parser.add_argument('--of')
    parser.add_argument('--os')
    parser.add_argument('--oe')
    parser.add_argument('--om')
    parser.add_argument('--og')

    parser.add_argument('--unphased_output')
    parser.add_argument('--markers_output')
    parser.add_argument('--phase_output')
    parser.add_argument('--fst_output')

    options = parser.parse_args()

    # create the working dirs
    os.mkdir('job_outputs')
    os.mkdir('galaxy_outputs')

    os.chdir('job_outputs')

    # STACKS_archive: unpack the compressed input files into the work dir
    extract_compress_files(options.P, os.getcwd())

    # create the populations command input line
    cmd_line = ['populations']
    cmd_line.extend(['-b', options.b, '-P', os.getcwd(), '-M', options.M])

    if options.e:
        cmd_line.extend(['-e', options.e])
        # BUG FIX: '--genomic' is a boolean flag (action='store_true');
        # the original code appended the bool object itself to the argv
        # list (subprocess rejects non-strings) instead of passing the
        # flag name to the populations program.
        if options.genomic:
            cmd_line.append('--genomic')

    # output format options
    if options.vcf:
        cmd_line.append('--vcf')
    if options.genepop:
        cmd_line.append('--genepop')
    if options.structure:
        cmd_line.append('--structure')
    if options.fasta:
        cmd_line.append('--fasta')
    if options.phase:
        cmd_line.append('--phase')
    if options.beagle:
        cmd_line.append('--beagle')
    if options.plink:
        cmd_line.append('--plink')
    if options.phylip:
        cmd_line.append('--phylip')
    # --phylip_var and --write_single_snp only make sense with their
    # parent format flags
    if options.phylip_var and options.phylip:
        cmd_line.append('--phylip_var')
    if options.write_single_snp and (options.genepop or options.structure):
        cmd_line.append('--write_single_snp')

    # kernel smoothing
    if options.k:
        cmd_line.extend(['-k', '--window_size', options.window_size])

    # advanced options
    if options.advanced_options_activate == 'true':
        if options.B:
            cmd_line.extend(['-B', options.B])
        if options.W:
            cmd_line.extend(['-W', options.W])

        cmd_line.extend(['-r', options.r])
        cmd_line.extend(['-p', options.p])
        cmd_line.extend(['-m', options.m])
        cmd_line.extend(['-a', options.a])

        if options.f:
            cmd_line.extend(['-f', options.f,
                             '--p_value_cutoff', options.p_value_cutoff])
        if options.bootstrap:
            cmd_line.extend(['--bootstrap', options.bootstrap,
                             '--bootstrap_reps', options.bootstrap_reps])

    print('[CMD]:' + ' '.join(cmd_line))
    subprocess.call(cmd_line)

    # postprocesses: if the log file is missing, populations failed
    try:
        shutil.copy('batch_1.populations.log', options.logfile)
    except (IOError, OSError):
        sys.stderr.write('Error in population execution; Please read the additional output (stdout)\n')
        sys.exit(1)

    try:
        shutil.move(glob.glob('*.sumstats_summary.tsv')[0], options.ss)
    except (IndexError, IOError, OSError):
        print('No sumstats summary file')

    try:
        shutil.move(glob.glob('*.sumstats.tsv')[0], options.s)
    except (IndexError, IOError, OSError):
        print('No sumstats file')

    # move additional output files
    if options.vcf:
        try:
            shutil.move(glob.glob('*.vcf')[0], options.ov)
        except (IndexError, IOError, OSError):
            print('No VCF files')

    if options.phylip:
        try:
            shutil.move(glob.glob('*.phylip')[0], options.op)
            shutil.move(glob.glob('*.phylip.log')[0], options.ol)
        except (IndexError, IOError, OSError):
            print('No phylip file')

    if options.fasta:
        try:
            shutil.move(glob.glob('*.fa')[0], options.of)
        except (IndexError, IOError, OSError):
            print('No fasta files')

    if options.structure:
        try:
            shutil.move(glob.glob('*.structure.tsv')[0], options.os)
        except (IndexError, IOError, OSError):
            print('No structure file')

    if options.plink:
        try:
            shutil.move(glob.glob('*.ped')[0], options.oe)
            shutil.move(glob.glob('*.map')[0], options.om)
        except (IndexError, IOError, OSError):
            print('No ped and map file')

    if options.genepop:
        try:
            shutil.move(glob.glob('*.genepop')[0], options.og)
        except (IndexError, IOError, OSError):
            print('No genepop file')

    # copy all remaining files into galaxy_outputs or into an archive
    list_files = glob.glob('*')

    markerszip = zipfile.ZipFile('markers.zip.temp', 'w', allowZip64=True)
    phasezip = zipfile.ZipFile('phase.zip.temp', 'w', allowZip64=True)
    unphasedzip = zipfile.ZipFile('unphased.zip.temp', 'w', allowZip64=True)
    fstzip = zipfile.ZipFile('fst.zip.temp', 'w', allowZip64=True)

    for i in list_files:
        # dispatch each file on its name pattern
        if re.search(r"\.markers$", i):
            markerszip.write(i)
        elif re.search(r"phase\.inp$", i):
            phasezip.write(i)
        elif re.search(r"unphased\.bgl$", i):
            unphasedzip.write(i)
        elif re.search('fst', i):
            fstzip.write(i)
        else:
            # batch_* files (but not the per-sample tsv) go straight to
            # the history
            if re.search('^batch', os.path.basename(i)) \
                and not re.search(r"\.tsv$", os.path.basename(i)) \
                or re.search(r".*_[0-9]*\.tsv$", os.path.basename(i)):
                shutil.move(i, '../galaxy_outputs')

    # close zip files
    markerszip.close()
    phasezip.close()
    unphasedzip.close()
    fstzip.close()

    # return archives
    shutil.move('fst.zip.temp', options.fst_output)
    if options.beagle:
        shutil.move('markers.zip.temp', options.markers_output)
        shutil.move('unphased.zip.temp', options.unphased_output)
    if options.phase:
        shutil.move('phase.zip.temp', options.phase_output)


if __name__ == '__main__':
    __main__()
$advanced_options.minminor + #if str( $advanced_options.correction_select.correction ) != "no_corr": + -f $advanced_options.correction_select.correction + --p_value_cutoff $advanced_options.correction_select.pcutoff + #end if + #if str( $advanced_options.blacklistselect.advanced_blackoptions_selector) == "advanced" + -B $advanced_options.blacklistselect.blacklist + #end if + #if str( $advanced_options.whitelistselect.advanced_whiteoptions_selector) == "advanced" + -W $advanced_options.whitelistselect.whitelist + #end if + #if str( $advanced_options.bootstrapresampling.advanced_bootoptions_selector) == "advanced" + --bootstrap $advanced_options.bootstrapresampling.bootstrap + --bootstrap_reps $advanced_options.bootstrapresampling.bootstrapreps + #end if +#end if + +## output files +--ss $sumstatssum +--s $sumstats +--fst_output $outfst + +## output section +#if $options_output.vcf +--vcf +--ov $outvcf +#end if +#if $options_output.phylip +--phylip +--op $outphylip +#end if +#if $options_output.phylip +--phylip_var +--ol $outphyliplog +#end if +#if $options_output.fasta +--fasta +--of $outfasta +#end if +#if $options_output.structure +--structure +--os $outstructure +#end if +#if $options_output.plink +--plink +--oe $outplinkped +--om=$outplinkmap +#end if +#if $options_output.phase +--phase +--phase_output $outphase +#end if +#if $options_output.beagle +--beagle +--unphased_output $outbeagle +#end if +--markers_output $outmarkers +#if $options_output.genepop +--genepop +--og=$outgenepop +#end if +#if $options_output.write_single_snp +--write_single_snp +#end if +--logfile $output + + + + + + + +
+ + + + + + + + + + +
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + + + + options_output['vcf'] + options_output['options_output_selector'] == '1' + + + options_output['phylip'] + options_output['options_output_selector'] == '1' + + + options_output['phylip'] + options_output['options_output_selector'] == '1' + + + options_output['beagle'] + options_output['options_output_selector'] == '1' + + + options_output['fasta'] + options_output['options_output_selector'] == '1' + + + options_output['structure'] + options_output['options_output_selector'] == '1' + + + options_output['plink'] + options_output['options_output_selector'] == '1' + + + options_output['plink'] + options_output['options_output_selector'] == '1' + + + options_output['genepop'] + options_output['options_output_selector'] == '1' + + + options_output['phase'] + options_output['options_output_selector'] == '1' + + + options_output['beagle'] + options_output['options_output_selector'] == '1' + + + options_output['beagle'] + options_output['options_output_selector'] == '1' + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This program will be executed in place of the genotypes program when a population is being processed through the pipeline. A map specifiying which individuals belong to which population is submitted to the program and the program will then calculate population genetics statistics, expected/observed heterzygosity, π, and FIS at each nucleotide position. The populations program will compare all populations pairwise to compute FST. If a set of data is reference aligned, then a kernel-smoothed FST will also be calculated. 
+ +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +FASTQ, FASTA, zip, tar.gz + +- Population map:: + + indv_01 1 + indv_02 1 + indv_03 1 + indv_04 2 + indv_05 2 + indv_06 2 + + +Output files: + +- XXX.tags.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID Each sample passed through Stacks gets a unique id for that sample. + 3 Stack ID Each stack formed gets an ID. + 4 Chromosome If aligned to a reference genome using pstacks, otherwise it is blank. + 5 Basepair If aligned to ref genome using pstacks. + 6 Strand If aligned to ref genome using pstacks. + 7 Sequence Type Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms. + 8 Sequence ID The individual sequence read that was merged into this stack. + 9 Sequence The raw sequencing read. + 10 Deleveraged Flag If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack. + 11 Blacklisted Flag If "1", this stack was still confounded depsite processing by the deleveraging algorithm. + 12 Lumberja ckstack Flag If "1", this stack was set aside due to having an extreme depth of coverage. + +Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence. + + +- XXX.snps.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 SNP Column + 5 Likelihood ratio From the SNP-calling model. + 6 Rank_1 Majority nucleotide. + 7 Rank_2 Alternative nucleotide. 
+ +Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one. + + +- XXX.alleles.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 Haplotype The haplotype, as constructed from the called SNPs at each locus. + 5 Percent Percentage of reads that have this haplotype + 6 Count Raw number of reads that have this haplotype + + +- XXX.matches.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Batch ID + 3 Catalog ID + 4 Sample ID + 5 Stack ID + 6 Haplotype + 7 Stack Depth + +Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample. + + +- batch_X.sumstats.tsv Summary Statistics Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Population ID The ID supplied to the populations program, as written in the population map file. + P Nucleotide The most frequent allele at this position in this population. + Q Nucleotide The alternative allele. + Number of Individuals Number of individuals sampled in this population at this site. + P Frequency of most frequent allele. + Observed Heterozygosity The proportion of individuals that are heterozygotes in this population. 
+ Observed Homozygosity The proportion of individuals that are homozygotes in this population. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + pi An estimate of nucleotide diversity. + Smoothed pi A weighted average of p depending on the surrounding 3s of sequence in both directions. + Smoothed pi P-value If bootstrap resampling is enabled, a p-value ranking the significance of p within this population. + FIS The inbreeding coefficient of an individual (I) relative to the subpopulation (S). + Smoothed FIS A weighted average of FIS depending on the surrounding 3s of sequence in both directions. + Smoothed FIS P-value If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population. + Private allele True (1) or false (0), depending on if this allele is only occurs in this population. + +- batch_X.fst_Y-Z.tsv Pairwise FST Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Population ID 1 The ID supplied to the populations program, as written in the population map file. + Population ID 2 The ID supplied to the populations program, as written in the population map file. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Overall pi An estimate of nucleotide diversity across the two populations. + FST A measure of population differentiation. + FET p-value P-value describing if the FST measure is statistically significant according to Fisher's Exact Test. + Odds Ratio Fisher's Exact Test odds ratio + CI High Fisher's Exact Test confidence interval. 
+ CI Low Fisher's Exact Test confidence interval. + LOD Score Logarithm of odds score. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + Corrected FST FST with either the FET p-value, or a window-size or genome size Bonferroni correction. + Smoothed FST A weighted average of FST depending on the surrounding 3s of sequence in both directions. + Smoothed FST P-value If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + +**Output type:** + +- Output type details:: + + No compression All files will be added in the current history. + Compressed by categories Files will be compressed by categories (snps, allele, matches and tags) into 4 zip archives. These archives and batch files will be added in the current history. + Compressed all outputs All files will be compressed in an unique zip archive. Batch files will be added in the current history with the archive. + + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. 
Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + + + + 10.1111/mec.12354 + 10.1111/mec.12330 + 10.1534/g3.111.000240 + 10.1534/genetics.111.127324 + 10.1111/j.1755-0998.2010.02967.x + 10.1073/pnas.1006538107 + + @INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + } + +
#!/usr/bin/env python
"""Prepare a STACKS population-map file for denovo_map or ref_map.

Created by Cyril Monjeaud
Cyril.Monjeaud@irisa.fr

Last modifications : 01/10/2014

WARNING :

STACKS_denovomap.py needs:

- STACKS scripts in your $PATH

These scripts are available after compiling the sources of STACKS :

http://creskolab.uoregon.edu/stacks/

or with the galaxy_stacks package in the Genouest toolshed
(http://toolshed.genouest.org)
"""

import sys, re
import os
import tempfile
import shutil, subprocess, glob
import optparse
from os.path import basename
import zipfile, tarfile, gzip
from galaxy.datatypes.checkers import *
from stacks import *


def __main__():

    # arguments recuperation
    parser = optparse.OptionParser()
    parser.add_option("-f")
    parser.add_option("-s")
    parser.add_option("-t")
    parser.add_option("-o")
    parser.add_option("-d")
    (options, args) = parser.parse_args()

    # create the working dir
    tmp_dir = tempfile.mkdtemp(dir=options.d)
    print(tmp_dir)

    try:
        # parse the fastq config file
        tab_fq_files = galaxy_config_to_tabfiles_for_STACKS(options.f)

        # check if zipped files are into the tab and change tab content
        extract_compress_files_from_tabfiles(tab_fq_files, tmp_dir)

        if not options.s:
            # denovo map: only fastq files are needed
            generate_popmap_for_denovo(tab_fq_files, options.t, options.o)
        else:
            # ref map: also parse the SAM config file
            tab_sam_files = galaxy_config_to_tabfiles_for_STACKS(options.s)
            extract_compress_files_from_tabfiles(tab_sam_files, tmp_dir)
            generate_popmap_for_refmap(tab_fq_files, tab_sam_files,
                                       options.t, options.o)
    finally:
        # BUG FIX: the original cleanup ran outside any try/finally, so a
        # failure while building the popmap leaked the temp directory.
        shutil.rmtree(tmp_dir)


if __name__ == "__main__":
    __main__()
+0000 +++ b/STACKS_prepare_population_map.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,168 @@ + + for STACKS denovomap and refmap + + + +#for $input in $fastq_file: +${input.display_name}::${input} +#end for + + +#if str( $options_target.options_target_selector ) == "refmap": +#for $input in $options_target.sam_file: +${input.display_name}::${input} +#end for +#end if + + + + +STACKS_prepare_population_map.py +-f $fastq_files +#if str( $options_target.options_target_selector ) == "refmap": +-s $sam_files +#end if +-t $info_file +-o $output +-d $__new_file_path__ + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This program will prepare a population map dataset from a 2 columns file containing relation between barcode and population. + +-------- + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +- FASTQ, FASTA, zip, tar.gz + + +- File with population informations: + +This file must have exactly 2 columns, separated by a tab, the first with barcode, second with population name or ID :: + + CGATA pop1 + CGGCG pop1 + GAAGC pop1 + GAGAT pop1 + CGATA pop2 + CGGCG pop2 + GAAGC pop2 + GAGAT pop2 + + +Output file: + +- Population map:: + + indv_01 1 + indv_02 1 + indv_03 1 + indv_04 2 + indv_05 2 + indv_06 2 + +WARNING : the file name in the population map output may be different from the history file name. Don't worry about this, it's safe. + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. 
Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. 
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Galaxy wrapper for the STACKS ``process_radtags`` cleaning script.

Builds the ``process_radtags`` command line from the tool's options,
symlinks the input fastq files into a work dir, runs the program, then
renames and moves/archives the demultiplexed outputs for Galaxy.
"""

import sys
import os
import re
import tempfile
import subprocess
import glob
import shutil
import argparse
from os.path import basename
import zipfile
import tarfile
import gzip
from stacks import *


def __main__():

    # arguments recuperation
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_type')
    parser.add_argument('--input_enzyme')
    parser.add_argument('--input_single')
    parser.add_argument('--input_paired1')
    parser.add_argument('--input_paired2')
    parser.add_argument('--inputype')
    parser.add_argument('--sample_name')
    parser.add_argument('--barcode')
    parser.add_argument('--output_choice')
    parser.add_argument('--output_archive')
    parser.add_argument('--enzyme1')
    parser.add_argument('--enzyme2')
    parser.add_argument('--outype')
    parser.add_argument('--qualitenc')
    parser.add_argument('-D', action='store_true')
    parser.add_argument('-t')
    parser.add_argument('-q', action='store_true')
    parser.add_argument('--activate_advanced_options')
    parser.add_argument('-r', action='store_true')
    parser.add_argument('-w', default='0.15')
    parser.add_argument('-s', default='10')
    parser.add_argument('-c', action='store_true')
    parser.add_argument('--inline_null', action='store_true')
    parser.add_argument('--index_null', action='store_true')
    parser.add_argument('--inline_inline', action='store_true')
    parser.add_argument('--index_index', action='store_true')
    parser.add_argument('--inline_index', action='store_true')
    parser.add_argument('--index_inline', action='store_true')
    # BUG FIX: options.discard_file is read after the run, but the option
    # was never declared, so argparse's Namespace raised AttributeError.
    parser.add_argument('--discard_file')
    parser.add_argument('--logfile')
    options = parser.parse_args()

    # create the working dirs
    os.mkdir('inputs')
    os.mkdir('job_outputs')
    os.mkdir('galaxy_outputs')

    cmd_line = ['process_radtags']
    cmd_line.extend(['-p', 'inputs'])
    cmd_line.extend(['-i', options.inputype])
    cmd_line.extend(['-b', options.barcode])

    # parse config files and create symlinks into the work dir
    if options.input_type == 'single':
        # parse the input_file to extract filenames and filepaths
        tab_files = galaxy_config_to_tabfiles(options.input_single)
        create_symlinks_from_tabfiles(tab_files, 'inputs')
    else:
        tab_files_paired1 = galaxy_config_to_tabfiles(options.input_paired1)
        tab_files_paired2 = galaxy_config_to_tabfiles(options.input_paired2)
        create_symlinks_from_tabfiles(tab_files_paired1, 'inputs')
        create_symlinks_from_tabfiles(tab_files_paired2, 'inputs')
        # paired-end mode
        cmd_line.append('-P')

    # number of restriction enzymes
    if options.input_enzyme == '1':
        cmd_line.extend(['-e', options.enzyme1])
    if options.input_enzyme == '2':
        # BUG FIX: '---renz_1' (three dashes) was a typo the program
        # rejects; the process_radtags flag is '--renz_1'.
        cmd_line.extend(['--renz_1', options.enzyme1,
                         '--renz_2', options.enzyme2])

    # quality encoding
    cmd_line.extend(['-E', options.qualitenc])

    # outputs
    cmd_line.extend(['-o', 'job_outputs/'])
    cmd_line.extend(['-y', options.outype])

    # capture discarded reads
    if options.D:
        cmd_line.append('-D')

    # optional options
    if options.activate_advanced_options == "true":
        if options.q:
            cmd_line.append('-q')
        if options.r:
            cmd_line.append('-r')

        cmd_line.extend(['-w', options.w, '-s', options.s])

        if options.c:
            cmd_line.append('-c')
        if options.t != '-1':
            cmd_line.extend(['-t', options.t])
        if options.inline_null:
            cmd_line.append('--inline_null')
        if options.index_null:
            cmd_line.append('--index_null')
        if options.inline_inline:
            cmd_line.append('--inline_inline')
        if options.index_index:
            cmd_line.append('--index_index')
        if options.inline_index:
            cmd_line.append('--inline_index')
        if options.index_inline:
            cmd_line.append('--index_inline')

    print('[CMD_LINE] : ' + ' '.join(cmd_line))
    subprocess.call(cmd_line)

    # postprocesses: if the log file is missing, process_radtags failed
    try:
        shutil.move('job_outputs/process_radtags.log', options.logfile)
    except (IOError, OSError, shutil.Error):
        sys.stderr.write('Error in process_radtags execution; Please read the additional output (stdout)\n')
        sys.exit(1)

    if options.discard_file:
        discards_file_name = glob.glob('job_outputs/*.discards')[0]
        shutil.move(discards_file_name, options.discard_file)

    # manage outputs names
    change_outputs_procrad_name(os.getcwd() + '/job_outputs',
                                options.sample_name)

    # generate the additional output archive file
    if options.output_choice != '1':
        generate_additional_archive_file(os.getcwd() + '/job_outputs',
                                         options.output_archive)

    # if the user has not chosen the zip-archive-only output
    if options.output_choice != '3':
        for i in glob.glob('job_outputs/*'):
            shutil.move(i, 'galaxy_outputs')


if __name__ == '__main__':
    __main__()
) == "single": +#for $input in $options_type.inputs_single: +${input.display_name}::${input} +#end for +#end if + + +#if str( $options_type.options_type_selector ) == "paired": +#for $input in $options_type.inputs_paired1: +${input.display_name}::${input} +#end for +#end if + + +#if str( $options_type.options_type_selector ) == "paired": +#for $input in $options_type.inputs_paired2: +${input.display_name}::${input} +#end for +#end if + + + + stacks + + + +STACKS_procrad.py + --input_type $options_type.options_type_selector + #if str( $options_type.options_type_selector ) == "single": + --input_single $input_single + #else + --input_paired1 $input_paired1 + --input_paired2 $input_paired2 + #end if + --inputype $inputype + --barcode $barcode + --sample_name $sample_name + --output_choice $options_output_infos_selector + #if str( $options_output_infos_selector ) != "1": + --output_archive $output_archive + #end if + --input_enzyme $options_enzyme.options_enzyme_selector + --enzyme1 $options_enzyme.enzyme + #if str( $options_enzyme.options_enzyme_selector ) == "2": + --enzyme2 $options_enzyme.enzyme2 + #end if + --outype $outype + --qualitenc $options_quality.qualitenc + #if $capture: + -D + #end if + --activate_advanced_options $activate_advanced_options + -t $options_advanced.truncate + #if $options_advanced.discard: + -q + #end if + #if $options_advanced.rescue: + -r + #end if + -w $options_advanced.sliding + -s $options_advanced.score + #if $options_advanced.remove: + -c + #end if + #if $options_advanced.inline: + --inline_null + #end if + #if $options_advanced.index: + --index_null + #end if + #if $options_advanced.inlinein: + --inline_inline + #end if + #if $options_advanced.indexind: + --index_index + #end if + #if $options_advanced.inlineind: + --inline_index + #end if + #if $options_advanced.indexin: + --index_inline + #end if + --logfile $output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ +
+ + + + + + + + + + + + +
+ + + + + + + + + + + +
+ + + + + + options_output_infos_selector != "1" + + + capture + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This program examines raw reads from an Illumina sequencing run and first, checks that the barcode and the RAD cutsite are intact, and demultiplexes the data. If there are +errors in the barcode or the RAD site within a certain allowance, process_radtags can correct them. Second, it slides a window down the length of the read and checks the +average quality score within the window. If the score drops below 90% probability of being correct (a raw phred score of 10), the read is discarded. This allows for some +sequencing errors while eliminating reads where the sequence is degrading as it is being sequenced. By default the sliding window is 15% of the length of the read, but can be +specified on the command line (the threshold and window size can be adjusted). +The process_radtags program can: +handle data that is barcoded, either inline or using an index, or unbarcoded. +use combinatorial barcodes. +check and correct for a restriction enzyme cutsite for single or double-digested +data. +filter adapter sequence while allowing for sequencing error in the adapter pattern. +process individual files or whole directories of files. +directly read gzipped data +filter reads based on Illumina's Chastity filter + +-------- + +**Help** + +Input files: + +- FASTQ, FASTA, zip, tar.gz + +- Barcode File Format + +The barcode file is a very simple format: one barcode per line. + + CGATA + CGGCG + GAAGC + GAGAT + CGATA + CGGCG + GAAGC + GAGAT + +Combinatorial barcodes are specified, one per column, separated by a tab:: + + CGATA ACGTA + CGGCG CGTA + GAAGC CGTA + GAGAT CGTA + CGATA AGCA + CGGCG AGCA + GAAGC AGCA + GAGAT AGCA + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . 
+ +-------- + + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. 
Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + + + + + 10.1111/mec.12354 + 10.1111/mec.12330 + 10.1534/g3.111.000240 + 10.1534/genetics.111.127324 + 10.1111/j.1755-0998.2010.02967.x + 10.1073/pnas.1006538107 + + @INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + } + +
+ diff -r 000000000000 -r d6ba40f6c824 STACKS_refmap.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_refmap.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,258 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import sys +import re +import os +import tempfile +import shutil +import subprocess +import glob +import optparse +from os.path import basename +import zipfile +import tarfile +import gzip +from galaxy.datatypes.checkers import * +from stacks import * + + +def __main__(): + + # arguments recuperation + + parser = optparse.OptionParser() + parser.add_option('-p') + parser.add_option('-r') + parser.add_option('-s') + parser.add_option('-O') + parser.add_option('-n') + parser.add_option('-m') + parser.add_option('--bound_low') + parser.add_option('--bound_high') + parser.add_option('--alpha') + parser.add_option('--logfile') + parser.add_option('--compress_output') + parser.add_option('--catalogsnps') + parser.add_option('--catalogalleles') + parser.add_option('--catalogtags') + + # additionnal outputs + + parser.add_option('--total_output') + parser.add_option('--tags_output') + parser.add_option('--snps_output') + parser.add_option('--alleles_output') + parser.add_option('--matches_output') + (options, args) = parser.parse_args() + + # create working directories + + os.mkdir('inputs') + os.mkdir('job_outputs') + os.mkdir('galaxy_outputs') + + cmd_line = [] + cmd_line.append('ref_map.pl') + + # if genetic map + + if options.p: + + # parse config files + + tab_parent_files = galaxy_config_to_tabfiles_for_STACKS(options.p) + + # check if zipped files are into the tab and change tab content + + extract_compress_files_from_tabfiles(tab_parent_files, 'inputs') + + # check files extension (important to have .sam files) + + check_sam_extension_and_add(tab_parent_files, 'inputs') + + # create symlink into the temp dir + + create_symlinks_from_tabfiles(tab_parent_files, 'inputs') + + # create the command input line + + for key in tab_parent_files: + + # if is 
a file (skip repository created after a decompression) + + if os.path.isfile('inputs/'+key): + cmd_line.extend(['-p', os.path.normpath('inputs/'+key)]) + + # if genetic map with progeny files + + if options.r: + + # parse config files + + tab_progeny_files = galaxy_config_to_tabfiles_for_STACKS(options.r) + + # check if zipped files are into the tab and change tab content + + extract_compress_files_from_tabfiles(tab_progeny_files, 'inputs') + + # check files extension (important to have .sam files) + + check_sam_extension_and_add(tab_progeny_files, 'inputs') + + # create symlink into the temp dir + + create_symlinks_from_tabfiles(tab_progeny_files, 'inputs') + + for key in tab_progeny_files: + + # if is a file (skip repository created after a decompression) + + if os.path.isfile('inputs/' + key): + cmd_line.extend(['-r', 'inputs/' + key]) + + # parse config files and create symlink if individual files are selected + + if options.s: + + # parse config files + + tab_individual_files = galaxy_config_to_tabfiles_for_STACKS(options.s) + + # check if zipped files are into the tab and change tab content + + extract_compress_files_from_tabfiles(tab_individual_files, 'inputs') + + # check files extension (important to have .sam files) + + check_sam_extension_and_add(tab_individual_files, 'inputs') + + # create symlink into the temp dir + + create_symlinks_from_tabfiles(tab_individual_files, 'inputs') + + # create the command input line + + for key in tab_individual_files: + cmd_line.extend(['-s', 'inputs/' + key]) + + # create the options command line + + cmd_line.extend([ + '-S', + '-b', '1', + '-T', '4', + '-o', 'job_outputs', + '-n', options.n, + '-m', options.m, + ]) + + if options.O: + cmd_line.extend(['-O', options.O]) + + if options.bound_low: + cmd_line.extend(['--bound_low', options.bound_low]) + + if options.bound_high: + cmd_line.extend(['--bound_high', options.bound_high]) + + if options.alpha: + cmd_line.extend(['--alpha', options.alpha]) + + # execute job + + 
print '[COMMAND LINE]' + ' '.join(cmd_line) + + p = subprocess.Popen(cmd_line, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + (stdoutput, stderror) = p.communicate() + + print stdoutput + print stderror + + # postprocesses + + try: + shutil.move('job_outputs/ref_map.log', options.logfile) + except: + sys.stderr.write('Error in ref_map execution; Please read the additional output (stdout)\n') + + # go inside the outputs dir + + os.chdir('job_outputs') + + # move files + + for i in glob.glob('*'): + if re.search('catalog.snps.tsv$', i): + shutil.copy(i, options.catalogsnps) + if re.search('catalog.alleles.tsv$', i): + shutil.copy(i, options.catalogalleles) + if re.search('catalog.tags.tsv$', i): + shutil.copy(i, options.catalogtags) + + # copy all files inside tmp_dir into workdir + + list_files = glob.glob('*') + + # if compress output is total + + if options.compress_output == 'total': + + mytotalzipfile = zipfile.ZipFile('total.zip.temp', 'w', + allowZip64=True) + + for i in list_files: + + mytotalzipfile.write(os.path.basename(i)) + + # return the unique archive + + shutil.move('total.zip.temp', options.total_output) + elif options.compress_output == 'categories': + + # if compress output is by categories + + mytagszip = zipfile.ZipFile('tags.zip.temp', 'w', allowZip64=True) + mysnpszip = zipfile.ZipFile('snps.zip.temp', 'w', allowZip64=True) + myalleleszip = zipfile.ZipFile('alleles.zip.temp', 'w', allowZip64=True) + mymatcheszip = zipfile.ZipFile('matches.zip.temp', 'w', allowZip64=True) + + for i in list_files: + + # for each type of files + + if re.search("tags\.tsv$", i) and not re.search('batch', i): + mytagszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("snps\.tsv$", i) and not re.search('batch', i): + mysnpszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("alleles\.tsv$", i) and not re.search('batch', i): + myalleleszip.write(os.path.basename(i)) + os.remove(i) + elif re.search("matches\.tsv$", i) and not 
re.search('batch', i): + mymatcheszip.write(os.path.basename(i)) + os.remove(i) + else: + shutil.move(os.path.basename(i), '../galaxy_outputs') + + # return archives.... + + shutil.move('tags.zip.temp', options.tags_output) + shutil.move('snps.zip.temp', options.snps_output) + shutil.move('alleles.zip.temp', options.alleles_output) + shutil.move('matches.zip.temp', options.matches_output) + else: + + # else no compression + + for i in list_files: + shutil.move(os.path.basename(i), '../galaxy_outputs') + + +if __name__ == '__main__': + __main__() + + diff -r 000000000000 -r d6ba40f6c824 STACKS_refmap.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_refmap.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,347 @@ + + Run the STACKS ref_map.pl wrapper + + + +#if str( $options_usage.options_usage_selector ) == "genetic" +#for $input in $options_usage.parent_sequence: +${input.display_name}::${input} +#end for +#end if + + +#if str( $options_usage.options_usage_selector ) == "genetic" and str( $options_usage.options_progeny.options_progeny_selector ) == "yes" +#for $input in $options_usage.options_progeny.progeny_sequence: +${input.display_name}::${input} +#end for +#end if + + +#if str( $options_usage.options_usage_selector ) == "population" +#for $input in $options_usage.individual_sample: +${input.display_name}::${input} +#end for +#end if + + + + stacks + + + +STACKS_refmap.py +#if str( $options_usage.options_usage_selector ) == "genetic" + -p $parent_sequences + #if str( $options_usage.options_progeny.options_progeny_selector ) == "yes" + -r $progeny_sequences + #end if +#else + -s $individual_samples + #if str( $options_usage.options_popmap.popmap_selector) == "yes" + -O $options_usage.options_popmap.popmap + #end if +#end if +-n $mismatchbetlocibuild +-m $mincov +--bound_low $snp_options.boundlow +--bound_high $snp_options.boundhigh +--alpha $snp_options.alpha +--catalogsnps $catalogsnps +--catalogalleles $catalogalleles +--catalogtags $catalogtags 
+--logfile $output +--compress_output $output_compress +##additionnal outputs +--total_output $total_output +--tags_output $tags_output +--snps_output $snps_output +--alleles_output $alleles_output +--matches_output $matches_output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + + + +
+ + + + + + + + + + + + output_compress == "total" + + + output_compress == "categories" + + + output_compress == "categories" + + + output_compress == "categories" + + + output_compress == "categories" + + + + +.. class:: infomark + +**What it does** + +This program expects data that have been aligned to a reference genome, and can accept data directly from Bowtie, or from any aligner that can produce SAM format. To avoid dataset name problems, we recommend the use of the *Map with BWA for STACKS tool*. This program will execute each of the Stacks components: first, running pstacks on each of the samples specified, building loci (based on the reference alignment) and calling SNPs in each. Second, cstacks will be run to create a catalog of all loci specified as 'parents' or 'samples' on the command line, again using alignment to match loci in the catalog. Finally, sstacks will be executed to match each sample against the catalog. The ref_map.pl program will also load the results of each stage of the analysis: individual loci, the catalog, and matches against the catalog into the database (although this can be disabled). After matching the program will build a database index to speed up access (index_radtags.pl) and enable web-based filtering. + + +-------- + + +**Created by:** + +Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko + +-------- + +**Example:** + +Input files: + +- SAM, zip, tar.gz + +- Population map:: + + indv_01 1 + indv_02 1 + indv_03 1 + indv_04 2 + indv_05 2 + indv_06 2 + +Output files: + +- XXX.tags.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID Each sample passed through Stacks gets a unique id for that sample. + 3 Stack ID Each stack formed gets an ID. + 4 Chromosome If aligned to a reference genome using pstacks, otherwise it is blank. 
+ 5 Basepair If aligned to ref genome using pstacks. + 6 Strand If aligned to ref genome using pstacks. + 7 Sequence Type Either 'consensus', 'primary' or 'secondary', see the Stacks paper for definitions of these terms. + 8 Sequence ID The individual sequence read that was merged into this stack. + 9 Sequence The raw sequencing read. + 10 Deleveraged Flag If "1", this stack was processed by the deleveraging algorithm and was broken down from a larger stack. + 11 Blacklisted Flag If "1", this stack was still confounded despite processing by the deleveraging algorithm. + 12 Lumberjackstack Flag If "1", this stack was set aside due to having an extreme depth of coverage. + +Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence. + + +- XXX.snps.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 SNP Column + 5 Likelihood ratio From the SNP-calling model. + 6 Rank_1 Majority nucleotide. + 7 Rank_2 Alternative nucleotide. + +Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one. + + +- XXX.alleles.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. + 2 Sample ID + 3 Stack ID + 4 Haplotype The haplotype, as constructed from the called SNPs at each locus. + 5 Percent Percentage of reads that have this haplotype + 6 Count Raw number of reads that have this haplotype + + +- XXX.matches.tsv file:: + + Column Name Description + 1 Sql ID This field will always be "0", however the MySQL database will assign an ID when it is loaded. 
+ 2 Batch ID + 3 Catalog ID + 4 Sample ID + 5 Stack ID + 6 Haplotype + 7 Stack Depth + +Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample. + + +- batch_X.sumstats.tsv Summary Statistics Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Population ID The ID supplied to the populations program, as written in the population map file. + P Nucleotide The most frequent allele at this position in this population. + Q Nucleotide The alternative allele. + Number of Individuals Number of individuals sampled in this population at this site. + P Frequency of most frequent allele. + Observed Heterozygosity The proportion of individuals that are heterozygotes in this population. + Observed Homozygosity The proportion of individuals that are homozygotes in this population. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + pi An estimate of nucleotide diversity. + Smoothed pi A weighted average of p depending on the surrounding 3s of sequence in both directions. + Smoothed pi P-value If bootstrap resampling is enabled, a p-value ranking the significance of p within this population. + FIS The inbreeding coefficient of an individual (I) relative to the subpopulation (S). 
+ Smoothed FIS A weighted average of FIS depending on the surrounding 3s of sequence in both directions. + Smoothed FIS P-value If bootstrap resampling is enabled, a p-value ranking the significance of FIS within this population. + Private allele True (1) or false (0), depending on if this allele is only occurs in this population. + +- batch_X.fst_Y-Z.tsv Pairwise FST Output:: + + Batch ID The batch identifier for this data set. + Locus ID Catalog locus identifier. + Population ID 1 The ID supplied to the populations program, as written in the population map file. + Population ID 2 The ID supplied to the populations program, as written in the population map file. + Chromosome If aligned to a reference genome. + Basepair If aligned to a reference genome. This is the alignment of the whole catalog locus. The exact basepair reported is aligned to the location of the RAD site (depending on whether alignment is to the positive or negative strand). + Column The nucleotide site within the catalog locus. + Overall pi An estimate of nucleotide diversity across the two populations. + FST A measure of population differentiation. + FET p-value P-value describing if the FST measure is statistically significant according to Fisher's Exact Test. + Odds Ratio Fisher's Exact Test odds ratio + CI High Fisher's Exact Test confidence interval. + CI Low Fisher's Exact Test confidence interval. + LOD Score Logarithm of odds score. + Expected Heterozygosity Heterozygosity expected under Hardy-Weinberg equilibrium. + Expected Homozygosity Homozygosity expected under Hardy-Weinberg equilibrium. + Corrected FST FST with either the FET p-value, or a window-size or genome size Bonferroni correction. + Smoothed FST A weighted average of FST depending on the surrounding 3s of sequence in both directions. + Smoothed FST P-value If bootstrap resampling is enabled, a p-value ranking the significance of FST within this pair of populations. 
+ + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + + +-------- + +**Project links:** + +`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ . + +`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ . + +`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ . + +-------- + +**References:** + +-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013. + +-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013. + +-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011. + +-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799'808, 2011. + +-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. 
+ +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + + + + 10.1111/mec.12354 + 10.1111/mec.12330 + 10.1534/g3.111.000240 + 10.1534/genetics.111.127324 + 10.1111/j.1755-0998.2010.02967.x + 10.1073/pnas.1006538107 + + @INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + } + +
+ diff -r 000000000000 -r d6ba40f6c824 STACKS_sort_read_pairs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_sort_read_pairs.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,109 @@ +#!/usr/bin/env python + +import sys, re +import os +import tempfile +import shutil, subprocess, glob +import optparse +from os.path import basename +import zipfile, tarfile, gzip +from galaxy.datatypes.checkers import * +from stacks import * + +""" + +Created by Yvan Le Bras +yvan.le_bras@irisa.fr + +Last modifications : 02/17/2014 + +WARNING : + +STACKS_sort_read_pairs.py needs: + +- STACKS scripts in your $PATH + +These scripts are available after compiling the sources of STACKS : + +http://creskolab.uoregon.edu/stacks/ + +or with the galaxy_stacks package in the Genouest toolshed + + +""" + +def __main__(): + + + # create the working dir + os.mkdir("sort_read_outputs") + os.mkdir("assembly_outputs") + os.mkdir("samples_inputs") + os.mkdir("stacks_inputs") + + # arguments recuperation + parser = optparse.OptionParser() + parser.add_option("-a") + parser.add_option("-e") + parser.add_option("-b") + parser.add_option("-c") + parser.add_option("-d") + parser.add_option("-o") + (options, args) = parser.parse_args() + + # edit the command line + cmd_line1 = ["sort_read_pairs.pl"] + + #parse config files and create symlink if individual files are selected + + # STACKS_archive + # check if zipped files are into the tab + extract_compress_files(options.a, os.getcwd()+"/stacks_inputs") + + # samples_archive + # check if zipped files are into the tab and change tab content + extract_compress_files(options.e, os.getcwd()+"/samples_inputs") + + # create the sort_read_pairs command input line + cmd_line1.extend(["-p", "stacks_inputs", "-s", "samples_inputs", "-o", "sort_read_outputs"]) + + if options.b: + cmd_line1.extend(["-w", options.b]) + if options.c: + cmd_line1.extend(["-r", options.c]) + + # exec command line 1 + p1 = subprocess.Popen(cmd_line1) + p1.communicate() + + # parse all 
files list and remove empty files from the output dir + for fasta_file in glob.glob("sort_read_outputs/*"): + if os.stat(fasta_file).st_size==0: + print "File "+fasta_file+" is empty" + os.remove(fasta_file) + + + # create the exec_velvet command input line + cmd_line2 = ["exec_velvet.pl"] + cmd_line2.extend(["-s", "sort_read_outputs", "-o", "assembly_outputs"]) + cmd_line2.append("-c") + + if options.d: + cmd_line2.extend(["-M", options.d]) + + # version + #cmd = 'sort_read_pairs.pl'+cmd_files+" "+cmd_options+" 2>&1" + #cmd2 = 'exec_velvet.pl'+cmd_files2+" -c -e /softs/local/velvet/velvet_1.2.03/ "+cmd_options2+" 2>&1" + + # launch the command line 2 + p2 = subprocess.Popen(cmd_line2) + p2.communicate() + + # get collated.fa file + try: + shutil.copy("assembly_outputs/collated.fa", options.o) + except: + print "No result file" + sys.exit(1) + +if __name__ == "__main__": __main__() diff -r 000000000000 -r d6ba40f6c824 STACKS_sort_read_pairs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/STACKS_sort_read_pairs.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,164 @@ + + Run the STACKS sort_read_pairs.pl and exec_velvet.pl wrappers + + + stacks + velvet + + + +STACKS_sort_read_pairs.py +-a $STACKS_archive +-e $samples_archive +#if str( $options_whitelist.whitelist_selector) == "yes" +-b $whitelist +#end if +#if str( $options_filter.reads_selector) == "yes" +-c $options_filter.threshold +#end if +#if str( $options_filter2.length_selector) == "yes" +-d $options_filter2.threshold2 +#end if +-o $output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. 
class:: infomark
+
+**What it does**
+
+This program will run each of the Stacks sort_read_pairs.pl and exec_velvet.pl utilities to assemble paired-end reads from STACKS pipeline results
+
+--------
+
+**Created by:**
+
+Stacks was developed by Julian Catchen with contributions from Angel Amores, Paul Hohenlohe, and Bill Cresko
+
+--------
+
+**Example:**
+
+Input file:
+
+Output archives of STACKS : Reference map or STACKS : De novo map, in zip or tar.gz format
+
+
+Output file:
+
+A collated.fa file containing assembled contigs for each locus
+
+
+Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ .
+
+--------
+
+
+**Project links:**
+
+`STACKS website <http://creskolab.uoregon.edu/stacks/>`_ .
+
+`STACKS manual <http://creskolab.uoregon.edu/stacks/stacks_manual.pdf>`_ .
+
+`STACKS google group <https://groups.google.com/forum/#!forum/stacks-users>`_ .
+
+--------
+
+**References:**
+
+-J. Catchen, P. Hohenlohe, S. Bassham, A. Amores, and W. Cresko. Stacks: an analysis tool set for population genomics. Molecular Ecology. 2013.
+
+-J. Catchen, S. Bassham, T. Wilson, M. Currey, C. O'Brien, Q. Yeates, and W. Cresko. The population structure and recent colonization history of Oregon threespine stickleback determined using restriction-site associated DNA-sequencing. Molecular Ecology. 2013.
+
+-J. Catchen, A. Amores, P. Hohenlohe, W. Cresko, and J. Postlethwait. Stacks: building and genotyping loci de novo from short-read sequences. G3: Genes, Genomes, Genetics, 1:171-182, 2011.
+
+-A. Amores, J. Catchen, A. Ferrara, Q. Fontenot and J. Postlethwait. Genome evolution and meiotic maps by massively parallel DNA sequencing: Spotted gar, an outgroup for the teleost genome duplication. Genetics, 188:799-808, 2011.
+
+-P. Hohenlohe, S. Amish, J. Catchen, F. Allendorf, G. Luikart. 
RAD sequencing identifies thousands of SNPs for assessing hybridization between rainbow trout and westslope cutthroat trout. Molecular Ecology Resources, 11(s1):117-122, 2011. + +-K. Emerson, C. Merz, J. Catchen, P. Hohenlohe, W. Cresko, W. Bradshaw, C. Holzapfel. Resolving postglacial phylogeography using high-throughput sequencing. Proceedings of the National Academy of Science, 107(37):16196-200, 2010. + +-------- + +**Integrated by:** + +Yvan Le Bras and Cyril Monjeaud + +GenOuest Bio-informatics Core Facility + +UMR 6074 IRISA INRIA-CNRS-UR1 Rennes (France) + +support@genouest.org + +If you use this tool in Galaxy, please cite : + +`Y. Le Bras, A. Roult, C. Monjeaud, M. Bahin, O. Quenez, C. Heriveau, A. Bretaudeau, O. Sallou, O. Collin, Towards a Life Sciences Virtual Research Environment : an e-Science initiative in Western France. JOBIM 2013. <https://www.e-biogenouest.org/resources/128>`_ + + + + + 10.1111/mec.12354 + 10.1111/mec.12330 + 10.1534/g3.111.000240 + 10.1534/genetics.111.127324 + 10.1111/j.1755-0998.2010.02967.x + 10.1073/pnas.1006538107 + + @INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + } + + diff -r 000000000000 -r d6ba40f6c824 bwa_index.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa_index.loc.sample Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,38 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of BWA indexed sequences data files. 
You will need +#to create these data files and then create a bwa_index.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bwa_index.loc +#file has this format (longer white space characters are TAB characters): +# +# +# +#So, for example, if you had phiX indexed stored in +#/depot/data2/galaxy/phiX/base/, +#then the bwa_index.loc entry would look like this: +# +#phiX174 phiX phiX Pretty /depot/data2/galaxy/phiX/base/phiX.fa +# +#and your /depot/data2/galaxy/phiX/base/ directory +#would contain phiX.fa.* files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 phiX.fa.amb +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 phiX.fa.ann +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 phiX.fa.bwt +#...etc... +# +#Your bwa_index.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#phiX174 phiX phiX174 /depot/data2/galaxy/phiX/base/phiX.fa +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/hg18/base/hg18canon.fa +#hg18full hg18 hg18 Full /depot/data2/galaxy/hg18/base/hg18full.fa +#/orig/path/hg19.fa hg19 hg19 /depot/data2/galaxy/hg19/base/hg19.fa +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +# diff -r 000000000000 -r d6ba40f6c824 bwa_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa_wrapper.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,358 @@ +#!/usr/bin/env python + +""" +Runs BWA on single-end or paired-end data. +Produces a SAM file containing the mappings. +Works with BWA version 0.5.9. 
+ +usage: bwa_wrapper.py [options] + +See below for options +""" + +import optparse, os, shutil, subprocess, sys, tempfile +import glob +import gzip, zipfile, tarfile + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def check_is_double_encoded( fastq ): + # check that first read is bases, not one base followed by numbers + bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ] + nums = [ '0', '1', '2', '3' ] + for line in file( fastq, 'rb'): + if not line.strip() or line.startswith( '@' ): + continue + if len( [ b for b in line.strip() if b in nums ] ) > 0: + return False + elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ): + return True + else: + raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space' + raise Exception, 'There is no non-comment and non-blank line in your FASTQ file' + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' ) + parser.add_option( '-c', '--color-space', dest='color_space', action='store_true', help='If the input files are SOLiD format' ) + parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' ) + parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' ) + parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' ) + parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' ) + parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' ) + parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' ) + parser.add_option( '-m', '--fracMissingAligns', 
dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' ) + parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' ) + parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' ) + parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' ) + parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' ) + parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' ) + parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' ) + parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' ) + parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' ) + parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' ) + parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', default=None, help='Proceed with suboptimal alignments even if the top hit is a repeat' ) + parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' ) + parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Maximum number of alignments to output in the XA tag for reads paired properly' ) + parser.add_option( '', '--outputTopNDisc', dest='outputTopNDisc', help='Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons)' ) + parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' ) + parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' ) + parser.add_option( '', '--rgid', dest='rgid', help='Read group identifier' 
) + parser.add_option( '', '--rgcn', dest='rgcn', help='Sequencing center that produced the read' ) + parser.add_option( '', '--rgds', dest='rgds', help='Description' ) + parser.add_option( '', '--rgdt', dest='rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' ) + parser.add_option( '', '--rgfo', dest='rgfo', help='Flow order' ) + parser.add_option( '', '--rgks', dest='rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' ) + parser.add_option( '', '--rglb', dest='rglb', help='Library name' ) + parser.add_option( '', '--rgpg', dest='rgpg', help='Programs used for processing the read group' ) + parser.add_option( '', '--rgpi', dest='rgpi', help='Predicted median insert size' ) + parser.add_option( '', '--rgpl', dest='rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT' and 'PACBIO' ], help='Platform/technology used to produce the reads' ) + parser.add_option( '', '--rgpu', dest='rgpu', help='Platform unit (e.g. 
flowcell-barcode.lane for Illumina or slide for SOLiD)' ) + parser.add_option( '', '--rgsm', dest='rgsm', help='Sample' ) + parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' ) + parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" ) + parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' ) + parser.add_option( '-I', '--illumina1.3', dest='illumina13qual', help='Input FASTQ files have Illuina 1.3 quality scores' ) + (options, args) = parser.parse_args() + + tmp_input_dir = tempfile.mkdtemp() + tmp_output_dir= tempfile.mkdtemp() + + + myarchive = zipfile.ZipFile(options.fastq, 'r', allowZip64=True) + myarchive.extractall(tmp_input_dir) + + for fastq in glob.glob(tmp_input_dir+'/*'): + + sam_output_file=tmp_output_dir+'/'+os.path.splitext(os.path.basename(fastq))[0]+'.sam' + create_sam=open(sam_output_file, "w") + create_sam.close() + + # output version # of tool + try: + tmp = tempfile.NamedTemporaryFile().name + tmp_stdout = open( tmp, 'wb' ) + proc = subprocess.Popen( args='bwa 2>&1', shell=True, stdout=tmp_stdout ) + tmp_stdout.close() + returncode = proc.wait() + stdout = None + for line in open( tmp_stdout.name, 'rb' ): + if line.lower().find( 'version' ) >= 0: + stdout = line.strip() + break + if stdout: + sys.stdout.write( 'BWA %s\n' % stdout ) + else: + raise Exception + except: + sys.stdout.write( 'Could not determine BWA version\n' ) + + # check for color space fastq that's not double-encoded and exit if appropriate + if options.color_space: + if not check_is_double_encoded( options.fastq ): + stop_err( 'Your file must be double-encoded (it must be converted from "numbers" to "bases"). 
See the help section for details' ) + #if options.genAlignType == 'paired': + #if not check_is_double_encoded( options.rfastq ): + #stop_err( 'Your reverse reads file must also be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' ) + + #fastq = options.fastq + #if options.rfastq: + #rfastq = options.rfastq + + # set color space variable + if options.color_space: + color_space = '-c' + else: + color_space = '' + + # make temp directory for placement of indices + tmp_index_dir = tempfile.mkdtemp() + tmp_dir = tempfile.mkdtemp() + # index if necessary + if options.fileSource == 'history' and not options.do_not_build_index: + ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir ) + ref_file_name = ref_file.name + ref_file.close() + os.symlink( options.ref, ref_file_name ) + # determine which indexing algorithm to use, based on size + try: + size = os.stat( options.ref ).st_size + if size <= 2**30: + indexingAlg = 'is' + else: + indexingAlg = 'bwtsw' + except: + indexingAlg = 'is' + indexing_cmds = '%s -a %s' % ( color_space, indexingAlg ) + cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name ) + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + # clean up temp dirs + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + stop_err( 'Error indexing reference sequence. 
' + str( e ) ) + else: + ref_file_name = options.ref + if options.illumina13qual: + illumina_quals = "-I" + else: + illumina_quals = "" + + # set up aligning and generate aligning command options + if options.params == 'pre_set': + aligning_cmds = '-t %s %s %s' % ( options.threads, color_space, illumina_quals ) + gen_alignment_cmds = '' + else: + if options.maxEditDist != '0': + editDist = options.maxEditDist + else: + editDist = options.fracMissingAligns + if options.seed != '-1': + seed = '-l %s' % options.seed + else: + seed = '' + if options.suboptAlign: + suboptAlign = '-R "%s"' % ( options.suboptAlign ) + else: + suboptAlign = '' + if options.noIterSearch == 'true': + noIterSearch = '-N' + else: + noIterSearch = '' + aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s %s %s' % \ + ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel, + options.disallowIndel, seed, options.maxEditDistSeed, options.threads, + options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty, + suboptAlign, noIterSearch, color_space, illumina_quals ) + #if options.genAlignType == 'paired': + #gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing ) + #if options.outputTopNDisc: + #gen_alignment_cmds += ' -N %s' % options.outputTopNDisc + + gen_alignment_cmds = '' + if options.rgid: + if not options.rglb or not options.rgpl or not options.rgsm: + stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' 
) + readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm ) + if options.rgcn: + readGroup += '\tCN:%s' % options.rgcn + if options.rgds: + readGroup += '\tDS:%s' % options.rgds + if options.rgdt: + readGroup += '\tDT:%s' % options.rgdt + if options.rgfo: + readGroup += '\tFO:%s' % options.rgfo + if options.rgks: + readGroup += '\tKS:%s' % options.rgks + if options.rgpg: + readGroup += '\tPG:%s' % options.rgpg + if options.rgpi: + readGroup += '\tPI:%s' % options.rgpi + if options.rgpu: + readGroup += '\tPU:%s' % options.rgpu + gen_alignment_cmds += ' -r "%s"' % readGroup + if options.outputTopN: + gen_alignment_cmds += ' -n %s' % options.outputTopN + # set up output files + tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir ) + tmp_align_out_name = tmp_align_out.name + tmp_align_out.close() + tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir ) + tmp_align_out2_name = tmp_align_out2.name + tmp_align_out2.close() + # prepare actual aligning and generate aligning commands + cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, fastq, tmp_align_out_name ) + cmd2b = '' + cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, fastq, sam_output_file ) + # perform alignments + buffsize = 1048576 + try: + # need to nest try-except in try-finally to handle 2.4 + try: + # align + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise 
Exception, 'Error aligning sequence. ' + str( e ) + # and again if paired data + try: + if cmd2b: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error aligning second sequence. ' + str( e ) + # generate align + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error generating alignments. ' + str( e ) + # remove header if necessary + if options.suppressHeader == 'true': + tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir) + tmp_out_name = tmp_out.name + tmp_out.close() + try: + shutil.move( sam_output_file, tmp_out_name ) + except Exception, e: + raise Exception, 'Error moving output file before removing headers. 
' + str( e ) + fout = file( sam_output_file, 'w' ) + for line in file( tmp_out.name, 'r' ): + if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ): + fout.write( line ) + fout.close() + # check that there are results in the output file + if os.path.getsize( sam_output_file ) > 0: + sys.stdout.write( 'BWA run on single-end data') + else: + raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.' + except Exception, e: + stop_err( 'The alignment failed.\n' + str( e ) ) + finally: + # clean up temp dir + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + if os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + + # put all in an archive + mytotalzipfile=zipfile.ZipFile(options.output, 'w', allowZip64=True) + os.chdir(tmp_output_dir) + for samfile in glob.glob(tmp_output_dir+'/*'): + mytotalzipfile.write(os.path.basename(samfile)) + + +if __name__=="__main__": __main__() diff -r 000000000000 -r d6ba40f6c824 bwa_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa_wrapper.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,339 @@ + + from zip file with fastqsanger files + + bwa + + + + + bwa_wrapper.py + --threads="4" + + #if $input1.ext == "fastqillumina": + --illumina1.3 + #end if + + ## reference source + --fileSource="${genomeSource.refGenomeSource}" + #if $genomeSource.refGenomeSource == "history": + ##build index on the fly + --ref="${genomeSource.ownFile}" + --dbkey="${dbkey}" + #else: + ##use precomputed indexes + --ref="${genomeSource.indices.fields.path}" + --do_not_build_index + #end if + + ## input file(s) + --input1="${paired.input1}" + + ## output file + --output="${output}" + + ## run parameters + --params="${params.source_select}" + #if $params.source_select != "pre_set": + --maxEditDist="${params.maxEditDist}" + 
--fracMissingAligns="${params.fracMissingAligns}" + --maxGapOpens="${params.maxGapOpens}" + --maxGapExtens="${params.maxGapExtens}" + --disallowLongDel="${params.disallowLongDel}" + --disallowIndel="${params.disallowIndel}" + --seed="${params.seed}" + --maxEditDistSeed="${params.maxEditDistSeed}" + --mismatchPenalty="${params.mismatchPenalty}" + --gapOpenPenalty="${params.gapOpenPenalty}" + --gapExtensPenalty="${params.gapExtensPenalty}" + --suboptAlign="${params.suboptAlign}" + --noIterSearch="${params.noIterSearch}" + --outputTopN="${params.outputTopN}" + --outputTopNDisc="${params.outputTopNDisc}" + --maxInsertSize="${params.maxInsertSize}" + --maxOccurPairing="${params.maxOccurPairing}" + #if $params.readGroup.specReadGroup == "yes" + --rgid="${params.readGroup.rgid}" + --rgcn="${params.readGroup.rgcn}" + --rgds="${params.readGroup.rgds}" + --rgdt="${params.readGroup.rgdt}" + --rgfo="${params.readGroup.rgfo}" + --rgks="${params.readGroup.rgks}" + --rglb="${params.readGroup.rglb}" + --rgpg="${params.readGroup.rgpg}" + --rgpi="${params.readGroup.rgpi}" + --rgpl="${params.readGroup.rgpl}" + --rgpu="${params.readGroup.rgpu}" + --rgsm="${params.readGroup.rgsm}" + #end if + #end if + + ## suppress output SAM header + --suppressHeader="${suppressHeader}" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +BWA is a fast light-weighted tool that aligns relatively short sequences (queries) to a sequence database (large), such as the human reference genome. It is developed by Heng Li at the Sanger Insitute. Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-60. + +------ + +**Know what you are doing** + +.. class:: warningmark + +There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. 
In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. + + .. __: http://bio-bwa.sourceforge.net/ + + +Instructions to add the functionality of archives management in Galaxy on the `eBiogenouest HUB wiki <https://www.e-biogenouest.org/wiki/ManArchiveGalaxy>`_ . + +------ + +**Input formats** + +BWA accepts files in either Sanger FASTQ format (galaxy type *fastqsanger*) or Illumina FASTQ format (galaxy type *fastqillumina*). Use the FASTQ Groomer to prepare your files. + +------ + +**A Note on Built-in Reference Genomes** + +The default variant for all genomes is "Full", defined as all primary chromosomes (or scaffolds/contigs) including mitochondrial plus associated unmapped, plasmid, and other segments. When only one version of a genome is available in this tool, it represents the default "Full" variant. Some genomes will have more than one variant available. The "Canonical Male" or sometimes simply "Canonical" variant contains the primary chromosomes for a genome. For example a human "Canonical" variant contains chr1-chr22, chrX, chrY, and chrM. The "Canonical Female" variant contains the primary chromosomes excluding chrY. 
+ +------ + +**Outputs** + +The output is in SAM format, and has the following columns:: + + Column Description + -------- -------------------------------------------------------- + 1 QNAME Query (pair) NAME + 2 FLAG bitwise FLAG + 3 RNAME Reference sequence NAME + 4 POS 1-based leftmost POSition/coordinate of clipped sequence + 5 MAPQ MAPping Quality (Phred-scaled) + 6 CIGAR extended CIGAR string + 7 MRNM Mate Reference sequence NaMe ('=' if same as RNAME) + 8 MPOS 1-based Mate POSition + 9 ISIZE Inferred insert SIZE + 10 SEQ query SEQuence on the same strand as the reference + 11 QUAL query QUALity (ASCII-33 gives the Phred base quality) + 12 OPT variable OPTional fields in the format TAG:VTYPE:VALU + +The flags are as follows:: + + Flag Description + ------ ------------------------------------- + 0x0001 the read is paired in sequencing + 0x0002 the read is mapped in a proper pair + 0x0004 the query sequence itself is unmapped + 0x0008 the mate is unmapped + 0x0010 strand of the query (1 for reverse) + 0x0020 strand of the mate + 0x0040 the read is the first read in a pair + 0x0080 the read is the second read in a pair + 0x0100 the alignment is not primary + +It looks like this (scroll sideways to see the entire example):: + + QNAME FLAG RNAME POS MAPQ CIAGR MRNM MPOS ISIZE SEQ QUAL OPT + HWI-EAS91_1_30788AAXX:1:1:1761:343 4 * 0 0 * * 0 0 AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh + HWI-EAS91_1_30788AAXX:1:1:1578:331 4 * 0 0 * * 0 0 GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh + +------- + +**BWA settings** + +All of the options have a default value. You can change any of them. All of the options in BWA have been implemented here. 
+ +------ + +**BWA parameter list** + +This is an exhaustive list of BWA options: + +For **aln**:: + + -n NUM Maximum edit distance if the value is INT, or the fraction of missing + alignments given 2% uniform base error rate if FLOAT. In the latter + case, the maximum edit distance is automatically chosen for different + read lengths. [0.04] + -o INT Maximum number of gap opens [1] + -e INT Maximum number of gap extensions, -1 for k-difference mode + (disallowing long gaps) [-1] + -d INT Disallow a long deletion within INT bp towards the 3'-end [16] + -i INT Disallow an indel within INT bp towards the ends [5] + -l INT Take the first INT subsequence as seed. If INT is larger than the + query sequence, seeding will be disabled. For long reads, this option + is typically ranged from 25 to 35 for '-k 2'. [inf] + -k INT Maximum edit distance in the seed [2] + -t INT Number of threads (multi-threading mode) [1] + -M INT Mismatch penalty. BWA will not search for suboptimal hits with a score + lower than (bestScore-misMsc). [3] + -O INT Gap open penalty [11] + -E INT Gap extension penalty [4] + -c Reverse query but not complement it, which is required for alignment + in the color space. + -R Proceed with suboptimal alignments even if the top hit is a repeat. By + default, BWA only searches for suboptimal alignments if the top hit is + unique. Using this option has no effect on accuracy for single-end + reads. It is mainly designed for improving the alignment accuracy of + paired-end reads. However, the pairing procedure will be slowed down, + especially for very short reads (~32bp). + -N Disable iterative search. All hits with no more than maxDiff + differences will be found. This mode is much slower than the default. + +For **samse**:: + + -n INT Maximum number of alignments to output in the XA tag for reads paired + properly. If a read has more than INT hits, the XA tag will not be + written. 
[3] + -r STR Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null] + +For **sampe**:: + + -a INT Maximum insert size for a read pair to be considered as being mapped + properly. Since version 0.4.5, this option is only used when there + are not enough good alignment to infer the distribution of insert + sizes. [500] + -n INT Maximum number of alignments to output in the XA tag for reads paired + properly. If a read has more than INT hits, the XA tag will not be + written. [3] + -N INT Maximum number of alignments to output in the XA tag for disconcordant + read pairs (excluding singletons). If a read has more than INT hits, + the XA tag will not be written. [10] + -o INT Maximum occurrences of a read for pairing. A read with more + occurrences will be treated as a single-end read. Reducing this + parameter helps faster pairing. [100000] + -r STR Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null] + +For specifying the read group in **samse** or **sampe**, use the following:: + + @RG Read group. Unordered multiple @RG lines are allowed. + ID Read group identifier. Each @RG line must have a unique ID. The value of + ID is used in the RG tags of alignment records. Must be unique among all + read groups in header section. Read group IDs may be modified when + merging SAM files in order to handle collisions. + CN Name of sequencing center producing the read. + DS Description. + DT Date the run was produced (ISO8601 date or date/time). + FO Flow order. The array of nucleotide bases that correspond to the + nucleotides used for each flow of each read. Multi-base flows are encoded + in IUPAC format, and non-nucleotide flows by various other characters. + Format : /\*|[ACMGRSVTWYHKDBN]+/ + KS The array of nucleotide bases that correspond to the key sequence of each read. + LB Library. + PG Programs used for processing the read group. + PI Predicted median insert size. + PL Platform/technology used to produce the reads. 
Valid values : CAPILLARY, + LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT and PACBIO. + PU Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for + SOLiD). Unique identifier. + SM Sample. Use pool name where a pool is being sequenced. + + + + 10.1111/mec.12354 + 10.1111/mec.12330 + 10.1534/g3.111.000240 + 10.1534/genetics.111.127324 + 10.1111/j.1755-0998.2010.02967.x + 10.1073/pnas.1006538107 + + @INPROCEEDINGS{JOBIM2013, + author = {Le Bras, Y. and ROULT, A. and Monjeaud, C. and Bahin, M. and Quenez, O. and Heriveau, C. and Bretaudeau, A. and Sallou, O. and Collin, O.}, + title = {Towards a Life Sciences Virtual Research Environment: An e-Science initiative in Western France}, + booktitle = {JOBIM 2013 Proceedings}, + year = {2013}, + url = {https://www.e-biogenouest.org/resources/128}, + pages = {97-106} + } + + + + diff -r 000000000000 -r d6ba40f6c824 stacks.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/stacks.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,367 @@ +""" + +STACKS METHODS FOR GALAXY + +Created by Cyril Monjeaud & Yvan Le Bras +Cyril.Monjeaud@irisa.fr +yvan.le_bras@irisa.fr + +Last modifications : 01/22/2014 + + +""" + +import os, sys, re, shutil +import glob +import collections +import gzip, zipfile, tarfile +import subprocess +from galaxy.datatypes.checkers import * + + +""" + +STACKS COMMON METHODS + +galaxy_config_to_tabfiles(input_config) +galaxy_config_to_tabfiles_for_STACKS(input_config) +extract_compress_files_from_tabfiles(tab_files, tmp_input_dir) +create_symlinks_from_tabfiles(tab_files, tmp_input_dir) + +""" +def galaxy_config_to_tabfiles(input_config): + + tab_files=collections.OrderedDict() + for line in open(input_config, "r").readlines(): + if line.strip() != '': + extract=line.strip().split("::") + tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] + + # tabfiles[name]-> path + return tab_files + + +def 
def galaxy_config_to_tabfiles_for_STACKS(input_config):
    """Parse a Galaxy '::'-separated config file into an OrderedDict.

    Same as galaxy_config_to_tabfiles(), but Galaxy dataset names matching
    'STACKS... (<sample>.<barcode>.fq)' are shortened to the
    '<sample>.<barcode>.fq' part before sanitizing.
    Returns an OrderedDict mapping sanitized name -> path.
    """
    tab_files = collections.OrderedDict()
    # 'with' guarantees the config file handle is released (it was left open)
    with open(input_config, "r") as config_file:
        for line in config_file:
            if line.strip() != '':
                extract = line.strip().split("::")
                parse_name = re.search(r"^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0])
                # rename the long galaxy name into a short sample name
                if parse_name:
                    extract[0] = parse_name.groups(1)[0]
                tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")] = extract[1]

    # tabfiles[name] -> path
    return tab_files


def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
    """Replace zip / tar(.gz) entries of tab_files by their extracted members.

    Archives are extracted into tmp_input_dir; the archive's own entry is
    removed from the dict and one entry per extracted member is added.
    Mutates tab_files in place.
    """
    # iterate over a snapshot of the keys: the dict is mutated inside the loop
    # (iterating .keys() directly breaks under Python 3)
    for key in list(tab_files.keys()):
        # test if it is a zip file
        if check_zip(tab_files[key]):
            myarchive = zipfile.ZipFile(tab_files[key], 'r')
            try:
                # record every member name, then extract them all
                for member in myarchive.namelist():
                    tab_files[member] = tmp_input_dir + "/" + member
                myarchive.extractall(tmp_input_dir)
            finally:
                # BUGFIX: release the archive handle (was never closed)
                myarchive.close()
            # remove the compressed file from the tab
            del tab_files[key]

        # test if it is a tar.gz file
        elif tarfile.is_tarfile(tab_files[key]) and check_gzip(tab_files[key]):
            mygzfile = tarfile.open(tab_files[key], 'r')
            try:
                for member in mygzfile.getnames():
                    tab_files[member] = tmp_input_dir + "/" + member
                mygzfile.extractall(tmp_input_dir)
            finally:
                # BUGFIX: release the archive handle (was never closed)
                mygzfile.close()
            del tab_files[key]


def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):
    """Create a symlink in tmp_input_dir for every name -> path of tab_files."""
    for key in list(tab_files.keys()):
        # skip names that already exist (e.g. members extracted from archives)
        if not os.path.exists(tmp_input_dir + '/' + key):
            os.symlink(tab_files[key], tmp_input_dir + '/' + key)


# ---------------------------------------------------------------------------
# PROCESS RADTAGS METHODS
#
# change_outputs_procrad_name(tmp_output_dir, sample_name)
# generate_additional_archive_file(tmp_output_dir, output_archive)
# ---------------------------------------------------------------------------

def change_outputs_procrad_name(tmp_output_dir, sample_name):
    """Rename process_radtags outputs in tmp_output_dir.

    '_' is normalized to '.', the literal 'sample' token is replaced by
    sample_name, and extensions are expanded: .fa -> .fasta, anything
    else -> .fastq (process_radtags only emits .fa / .fq files).
    """
    # 'current_file' instead of 'file': don't shadow the builtin
    for current_file in glob.glob(tmp_output_dir + '/*'):
        # change sample name
        new_file_name = os.path.basename(current_file.replace("_", ".").replace("sample", sample_name))

        # transform .fa -> .fasta ; anything else is assumed to be .fq -> .fastq
        if os.path.splitext(new_file_name)[1] == ".fa":
            new_file_name = os.path.splitext(new_file_name)[0] + '.fasta'
        else:
            new_file_name = os.path.splitext(new_file_name)[0] + '.fastq'

        shutil.move(tmp_output_dir + '/' + os.path.basename(current_file), tmp_output_dir + '/' + new_file_name)


def generate_additional_archive_file(tmp_output_dir, output_archive):
    """Zip every file of tmp_output_dir into output_archive."""
    myzip = zipfile.ZipFile("archive.zip.temp", 'w', allowZip64=True)
    try:
        # add each output file to the archive
        for fastq_file in glob.glob(tmp_output_dir + '/*'):
            myzip.write(fastq_file, os.path.basename(fastq_file))
    finally:
        # BUGFIX: close BEFORE moving — the zip central directory is only
        # written on close(); moving first could leave a corrupt archive
        # when the move is a cross-device copy instead of a rename
        myzip.close()

    shutil.move("archive.zip.temp", output_archive)


# ---------------------------------------------------------------------------
# DENOVOMAP METHODS
#
# check_fastq_extension_and_add(tab_files, tmp_input_dir)
# ---------------------------------------------------------------------------

def check_fastq_extension_and_add(tab_files, tmp_input_dir):
    """Sniff extension-less entries of tab_files and re-key them with .fq/.fasta.

    Files whose name lacks a .fq/.fastq/.fa/.fasta extension are opened and
    classified by their first character ('>' = fasta, '@' = fastq); their
    dict key gains the matching extension. Mutates tab_files in place.
    """
    # snapshot of the keys: the dict is mutated inside the loop
    for key in list(tab_files.keys()):
        if not re.search(r"\.fq$|\.fastq$|\.fa$|\.fasta$", key):
            # read only the header line; 'with' closes the handle (was leaked)
            with open(tab_files[key], 'r') as myfastxfile:
                line = myfastxfile.readline().strip()

            # fasta rapid test
            if line.startswith('>'):
                tab_files[key + ".fasta"] = tab_files[key]
                del tab_files[key]
            # fastq rapid test
            elif line.startswith('@'):
                tab_files[key + ".fq"] = tab_files[key]
                del tab_files[key]
            else:
                print("[WARNING] : your input file " + key + " was not extension and is not recognize as a Fasta or Fastq file")


# ---------------------------------------------------------------------------
# REFMAP METHODS
# ---------------------------------------------------------------------------

def check_sam_extension_and_add(tab_files, tmp_input_dir):
    """Append a .sam extension to every tab_files key that lacks one."""
    # snapshot of the keys: the dict is mutated inside the loop
    for key in list(tab_files.keys()):
        if not re.search(r"\.sam$", key):
            # add the extension
            tab_files[key + ".sam"] = tab_files[key]
            del tab_files[key]


# ---------------------------------------------------------------------------
# PREPARE POPULATION MAP METHODS
# generate_popmap_for_denovo(tab_files, infos_file, pop_map)
# generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)
# ---------------------------------------------------------------------------

def generate_popmap_for_denovo(tab_files, infos_file, pop_map):
    """Write the denovo_map population map: '<fq name without ext>\\t<pop id>'.

    tab_files : name -> path dict of demultiplexed fastq files; each name is
                expected to embed its barcode, e.g. 'sample.ACGT.fq'.
    infos_file: text file of '<barcode>\\t<population name>' lines.
    pop_map   : path of the population map file to write.
    Population names are converted to integer ids in order of first appearance.
    """
    # initiate the dict : barcode -> fastq file name
    fq_name_for_barcode = {}
    for name in tab_files:
        single_barcode = re.search(r"([ATCG]*)(\.fq|\.fastq)", name).groups(0)[0]
        fq_name_for_barcode[single_barcode] = name

    # conversion tab for population to integer (index == integer id)
    pop_to_int = []

    # 'with' closes both files even if a line is malformed
    with open(infos_file, 'r') as my_open_info_file, open(pop_map, 'w') as my_output_file:
        # write infos into the final output
        for line in my_open_info_file:
            parse_line = re.search(r"(^[ATCG]+)\t(.*)", line.strip())

            if not parse_line:
                print("[WARNING] Wrong input infos file structure : " + line)
            else:
                barcode = parse_line.groups(1)[0]
                population_name = parse_line.groups(1)[1]

                # first time this population is seen -> give it the next id
                if population_name not in pop_to_int:
                    pop_to_int.append(population_name)

                # the population map must reference the file WITHOUT extension
                fq_name = fq_name_for_barcode[barcode]
                if re.search(r"(\.fq$|\.fastq$)", fq_name):
                    fqfile = os.path.splitext(fq_name)[0]
                else:
                    fqfile = fq_name

                # write in the file
                my_output_file.write(fqfile + "\t" + str(pop_to_int.index(population_name)) + "\n")


def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):
    """Write the ref_map population map: '<sam name without ext>\\t<pop id>'.

    Each sam file is matched back to its barcode by looking up its first
    read id among the read ids of every demultiplexed fastq file.
    tab_fq_files / tab_sam_files: name -> path dicts; infos_file is
    '<barcode>\\t<population name>' lines; pop_map is the output path.
    """
    # initiate the dict : barcode -> list of sequence ids in the fastq
    seq_id_for_barcode = {}

    # initiate the dict : barcode -> sam file name
    sam_name_for_barcode = {}

    ### Parse fastq files ###
    for fastq_file in tab_fq_files:
        single_barcode = re.search(r"([ATCG]*)(\.fq|\.fastq)", fastq_file).groups(0)[0]

        # collect every seq id of this fastq
        # BUGFIX: close each fastq after reading (handles used to leak)
        tab_seq_id = []
        with open(tab_fq_files[fastq_file], 'r') as open_fastqfile:
            for line in open_fastqfile:
                my_match_seqID = re.search(r"^@([A-Z0-9]+\.[0-9]+)\s.*", line)
                if my_match_seqID:
                    tab_seq_id.append(my_match_seqID.groups(0)[0])

        seq_id_for_barcode[single_barcode] = tab_seq_id

    ### Parse sam files and get the first seq id ###
    for sam_file in tab_sam_files:
        # BUGFIX: close each sam after reading (handles used to leak)
        first_seq_id = ''
        with open(tab_sam_files[sam_file], 'r') as open_samfile:
            for line in open_samfile:
                # skip the sam header (@-prefixed lines)
                if not re.search("^@", line):
                    first_seq_id = line.split("\t")[0]
                    break

        # map every barcode whose fastq contains this first read id
        for barcode in seq_id_for_barcode:
            if first_seq_id in seq_id_for_barcode[barcode]:
                sam_name_for_barcode[barcode] = sam_file

    # conversion tab for population to integer (index == integer id)
    pop_to_int = []

    # 'with' closes both files even if a line is malformed
    with open(infos_file, 'r') as my_open_info_file, open(pop_map, 'w') as my_output_file:
        # write infos into the final output
        for line in my_open_info_file:
            parse_line = re.search(r"(^[ATCG]+)\t(.*)", line)

            if not parse_line:
                print("[WARNING] Wrong input infos file structure : " + line)
            else:
                barcode = parse_line.groups(1)[0]
                population_name = parse_line.groups(1)[1]

                # first time this population is seen -> give it the next id
                if population_name not in pop_to_int:
                    pop_to_int.append(population_name)

                # the population map must reference the file WITHOUT extension
                sam_name = sam_name_for_barcode[barcode]
                if re.search(r"\.sam", sam_name):
                    samfile = os.path.splitext(sam_name)[0]
                else:
                    samfile = sam_name

                # write in the file
                my_output_file.write(samfile + "\t" + str(pop_to_int.index(population_name)) + "\n")


# ---------------------------------------------------------------------------
# STACKS POPULATION
# ---------------------------------------------------------------------------
extract_compress_files(myfile, tmp_input_dir): + + #test if is zip file + if (check_zip( myfile )): + + # extract all files names and added it in the tab + myarchive = zipfile.ZipFile(myfile, 'r') + + # extract all files + myarchive.extractall(tmp_input_dir) + + + #test if is tar.gz file + else: + # extract all files names and added it in the tab + mygzfile = tarfile.open(myfile, 'r') + + # extract all files + mygzfile.extractall(tmp_input_dir) + + diff -r 000000000000 -r d6ba40f6c824 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,8 @@ + + + + + value, dbkey, name, path + +
+
diff -r 000000000000 -r d6ba40f6c824 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,13 @@ + + + + + + + + + + + + +