Previous changeset 13:cfc86c3fc5c8 (2017-02-17) Next changeset 15:aff38ea879f1 (2018-02-28) |
Commit message:
planemo upload for repository https://github.com/blankenberg/tools-blankenberg/tree/master/tools/naive_variant_caller commit a1f39a3e28911591f6a1ed58a43e95e0baf5e750 |
modified:
README.rst |
added:
naive_variant_caller.xml tool_data_table_conf.xml.sample |
removed:
dependency_configs/tool_dependencies.xml tool-data/tool_data_table_conf.xml.sample tools/naive_variant_caller.py tools/naive_variant_caller.xml |
b |
diff -r cfc86c3fc5c8 -r 5c852eca82e0 README.rst --- a/README.rst Fri Feb 17 11:42:07 2017 -0500 +++ b/README.rst Wed Feb 28 15:54:57 2018 -0500 |
b |
@@ -1,4 +1,4 @@ -This repository contains the **Naive Variant Caller** tool. +This repository contains the **Naive Variant Caller** tool (NVC). ------ |
b |
diff -r cfc86c3fc5c8 -r 5c852eca82e0 dependency_configs/tool_dependencies.xml --- a/dependency_configs/tool_dependencies.xml Fri Feb 17 11:42:07 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,12 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="numpy" version="1.7.1"> - <repository changeset_revision="300877695495" name="package_numpy_1_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> - <package name="pyBamParser" version="0.0.1"> - <repository changeset_revision="144681ee972c" name="package_pybamparser_0_0_1" owner="blankenberg" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> - <package name="pyBamTools" version="0.0.2"> - <repository changeset_revision="6819855ac2e8" name="package_pybamtools_0_0_2" owner="blankenberg" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency> |
b |
diff -r cfc86c3fc5c8 -r 5c852eca82e0 naive_variant_caller.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/naive_variant_caller.xml Wed Feb 28 15:54:57 2018 -0500 |
b |
b'@@ -0,0 +1,232 @@\n+<tool id="naive_variant_caller" name="Naive Variant Caller" version="0.0.3">\n+ <description> - tabulate variable sites from BAM datasets</description>\n+ <requirements>\n+ <requirement type="package" version="0.0.3">nvc</requirement>\n+ </requirements>\n+ <stdio>\n+ <exit_code range="1:" />\n+ <exit_code range=":-1" />\n+ </stdio>\n+ <version_command>naive_variant_caller.py --version</version_command>\n+ <command>naive_variant_caller.py\n+ -o "${output_vcf}"\n+ \n+ #for $input_bam in $reference_source.input_bams:\n+ -b \'${input_bam.input_bam}\'\n+ -i \'${input_bam.input_bam.metadata.bam_index}\'\n+ #end for\n+ \n+ #if $reference_source.reference_source_selector != "history":\n+ -r \'${reference_source.ref_file.fields.path}\'\n+ #elif $reference_source.ref_file:\n+ -r \'${reference_source.ref_file}\'\n+ #end if\n+ \n+ #for $region in $regions:\n+ --region \'${region.chromosome}:${region.start}-${region.end}\'\n+ #end for\n+ \n+ #for $region_file in $region_files:\n+ --regions_filename \'${region_file.input_region}\'\n+ --regions_file_columns \'${int($region_file.input_region.metadata.chromCol)-1},${int($region_file.input_region.metadata.startCol)-1},${int($region_file.input_region.metadata.endCol)-1}\'\n+ #end for\n+\n+ ${variants_only}\n+ \n+ ${use_strand}\n+ \n+ --ploidy \'${$ploidy}\'\n+ \n+ --min_support_depth \'${min_support_depth}\'\n+ \n+ #if str($min_base_quality):\n+ --min_base_quality \'${min_base_quality}\'\n+ #end if\n+ \n+ #if str($min_mapping_quality):\n+ --min_mapping_quality \'${min_mapping_quality}\'\n+ #end if\n+ \n+ --allow_out_of_bounds_positions\n+ \n+ #if str( $advanced_options.advanced_options_selector ) == "advanced":\n+ #if str( $advanced_options.coverage_dtype ) != "guess":\n+ --coverage_dtype \'${advanced_options.coverage_dtype}\'\n+ #end if\n+ ${advanced_options.safe}\n+ #end if \n+ </command>\n+ <inputs>\n+ <conditional name="reference_source">\n+ <param name="reference_source_selector" type="select" label="Choose 
the source for the reference list">\n+ <option value="cached">Locally cached</option>\n+ <option value="history">History</option>\n+ </param>\n+ <when value="cached">\n+ <repeat name="input_bams" title="BAM file" min="1" >\n+ <param name="input_bam" type="data" format="bam" label="BAM file">\n+ <validator type="unspecified_build" />\n+ <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="value" message="Sequences are not currently available for the specified build." /> <!-- fixme!!! this needs to be a select -->\n+ </param>\n+ </repeat>\n+ <param name="ref_file" type="select" label="Using reference genome" >\n+ <options from_data_table="sam_fa_indexes">\n+ <!-- <filter type="data_meta" key="dbkey" ref="input_bam" column="dbkey"/> does not yet work in a repeat...--> \n+ </options>\n+ <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>\n+ </param>\n+ </when>\n+ <when value="history"> <!-- FIX ME!!!! -->\n+ <repeat name="input_bams" title="BAM file" min="1" >\n+ <param name="input_bam" type="data" format="bam" label="BAM file" >\n+ </param>\n+ </repeat>\n+ <param name="ref_file" type="data" format="fasta" label="Using reference file" optional="True" />\n+ </when>\n+ </conditional>\n+\n+ <repeat name="regions" title="Restrict to regions" min="0" >\n+ <param name="chromosome" type="text" value="" optional="False" label="Chromosome" />\n+ <param name="start" type="integer" value="" optional="True" label="Start" help="0-based, closed. (BED style)" />\n+ <param n'..b'ligned sequencing reads from the BAM format and produces a VCF file containing per position variant calls. This tool allows multiple BAM files to be provided as input and utilizes read group information to make calls for individual samples. 
\n+\n+User configurable options allow filtering reads that do not pass mapping or base quality thresholds and minimum per base read depth; users can also specify the ploidy and whether to consider each strand separately. \n+\n+In addition to calling alternate alleles based upon simple ratios of nucleotides at a position, per base nucleotide counts are also provided. A custom tag, NC, is used within the Genotype fields. The NC field is a comma-separated listing of nucleotide counts in the form of <nucleotide>=<count>, where a plus or minus character is prepended to indicate strand, if the strandedness option was specified.\n+ \n+\n+------\n+\n+**Inputs**\n+\n+Accepts one or more BAM input files and a reference genome from the built-in list or from a FASTA file in your history.\n+\n+\n+**Outputs**\n+\n+The output is in VCF format.\n+\n+Example VCF output line, without reporting by strand:\n+ ``chrM\t16029\t.\tT\tG,A,C\t.\t.\tAC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095\tGT:AC:AF:NC\t0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:A=9,C=5,T=9629,G=15,``\n+\n+Example VCF output line, when reporting by strand:\n+ ``chrM\t16029\t.\tT\tG,A,C\t.\t.\tAC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095\tGT:AC:AF:NC\t0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:+T=3972,-A=9,-C=5,-T=5657,-G=15,``\n+\n+**Options**\n+\n+Reference Genome:\n+\n+ Ensure that you have selected the correct reference genome, either from the list of built-in genomes or by selecting the corresponding FASTA file from your history.\n+\n+Restrict to regions:\n+\n+ You can specify any number of regions on which you would like to receive results. You can specify just a chromosome name, or a chromosome name and start position, or a chromosome name and start and end position for the set of desired regions. 
\n+\n+Minimum number of reads needed to consider a REF/ALT:\n+\n+ This value declares the minimum number of reads containing a particular base at each position in order to list and use said allele in genotyping calls. Default is 0.\n+\n+Minimum base quality:\n+\n+ The minimum base quality score needed for the position in a read to be used for nucleotide counts and genotyping. Default is no filter.\n+\n+Minimum mapping quality:\n+\n+ The minimum mapping quality score needed to consider a read for nucleotide counts and genotyping. Default is no filter.\n+\n+Ploidy:\n+\n+ The number of genotype calls to make at each reported position.\n+\n+Only write out positions with possible alternate alleles:\n+\n+ When set, only positions which have at least one non-reference nucleotide which passes the declared filters will be present in the output.\n+\n+Report counts by strand:\n+\n+ When set, nucleotide counts (NC) will be reported in reference to the aligned read\'s source strand. Reported as: <strand><BASE>=<COUNT>.\n+\n+Choose the dtype to use for storing coverage information:\n+\n+ This controls the maximum depth value for each nucleotide/position/strand (when specified). Smaller values require the least amount of memory, but have smaller maximal limits.\n+\n+ +--------+----------------------------+\n+ | name | maximum coverage value |\n+ +========+============================+\n+ | uint8 | 255 |\n+ +--------+----------------------------+\n+ | uint16 | 65,535 |\n+ +--------+----------------------------+\n+ | uint32 | 4,294,967,295 |\n+ +--------+----------------------------+\n+ | uint64 | 18,446,744,073,709,551,615 |\n+ +--------+----------------------------+\n+\n+\n+ </help>\n+ <citations>\n+ <citation type="doi">10.1186/gb4161</citation>\n+ </citations>\n+\n+</tool>\n' |
b |
diff -r cfc86c3fc5c8 -r 5c852eca82e0 tool-data/tool_data_table_conf.xml.sample --- a/tool-data/tool_data_table_conf.xml.sample Fri Feb 17 11:42:07 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,7 +0,0 @@ -<tables> - <!-- Location of SAMTools indexes and other files --> - <table name="sam_fa_indexes" comment_char="#"> - <columns>line_type, value, path</columns> - <file path="tool-data/sam_fa_indices.loc" /> - </table> -</tables> |
b |
diff -r cfc86c3fc5c8 -r 5c852eca82e0 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed Feb 28 15:54:57 2018 -0500 |
b |
@@ -0,0 +1,7 @@ +<tables> + <!-- Location of SAMTools indexes and other files --> + <table name="sam_fa_indexes" comment_char="#"> + <columns>line_type, value, path</columns> + <file path="tool-data/sam_fa_indices.loc" /> + </table> +</tables> |
b |
diff -r cfc86c3fc5c8 -r 5c852eca82e0 tools/naive_variant_caller.py --- a/tools/naive_variant_caller.py Fri Feb 17 11:42:07 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,72 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -import sys -import optparse - -from pyBamParser.bam import Reader -from pyBamTools.genotyping.naive import VCFReadGroupGenotyper, PROGRAM_NAME, PROGRAM_VERSION - -def main(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-b', '--bam', dest='bam_file', action='append', type="string", default=[], help='BAM filename, optionally index filename. Multiple allowed.' ) - parser.add_option( '-i', '--index', dest='index_file', action='append', type="string", default=[], help='optionally index filename. Multiple allowed.' ) - parser.add_option( '-o', '--output_vcf_filename', dest='output_vcf_filename', action='store', default = None, type="string", help='Output VCF filename' ) - parser.add_option( '-r', '--reference_genome_filename', dest='reference_genome_filename', action='store', default = None, type="string", help='Input reference file' ) - parser.add_option( '-v', '--variants_only', dest='variants_only', action='store_true', default = False, help='Report only sites with a possible variant allele.' ) - parser.add_option( '-s', '--use_strand', dest='use_strand', action='store_true', default = False, help='Report counts by strand' ) - parser.add_option( '-p', '--ploidy', dest='ploidy', action='store', type="int", default=2, help='Ploidy. Default=2.' ) - parser.add_option( '-d', '--min_support_depth', dest='min_support_depth', action='store', type="int", default=0, help='Minimum number of reads needed to consider a REF/ALT. Default=0.' ) - parser.add_option( '-q', '--min_base_quality', dest='min_base_quality', action='store', type="int", default=None, help='Minimum base quality.' ) - parser.add_option( '-m', '--min_mapping_quality', dest='min_mapping_quality', action='store', type="int", default=None, help='Minimum mapping.' 
) - parser.add_option( '-t', '--coverage_dtype', dest='coverage_dtype', action='store', type="string", default=None, help='dtype to use for coverage array' ) - parser.add_option( '--allow_out_of_bounds_positions', dest='allow_out_of_bounds_positions', action='store_true', default = False, help='Allows out of bounds positions to not throw fatal errors' ) - parser.add_option( '--safe', dest='safe', action='store_true', default = False, help='Perform checks to prevent certain errors. Is slower.' ) - parser.add_option( '--region', dest='region', action='append', type="string", default=[], help='region' ) - parser.add_option( '', '--version', dest='version', action='store_true', default = False, help='Report version and quit' ) - (options, args) = parser.parse_args() - - if options.version: - print "%s version %s" % ( PROGRAM_NAME, PROGRAM_VERSION ) - sys.exit(0) - - if len( options.bam_file ) == 0: - print >>sys.stderr, 'You must provide at least one bam (-b) file.' - parser.print_help( sys.stderr ) - sys.exit( 1 ) - if options.index_file: - assert len( options.index_file ) == len( options.bam_file ), "If you provide a name for an index file, you must provide the index name for all bam files." - bam_files = zip( options.bam_file, options.index_file ) - else: - bam_files = [ ( x, ) for x in options.bam_file ] - if not options.reference_genome_filename: - print >> sys.stderr, "Warning: Reference file has not been specified. Providing a reference genome is highly recommended." - if options.output_vcf_filename: - out = open( options.output_vcf_filename, 'wb' ) - else: - out = sys.stdout - - regions = [] - if options.region: - for region in options.region: - region_split = region.split( ":" ) - region = region_split.pop( 0 ) - if region_split: - region_split = filter( bool, region_split[0].split( '-' ) ) - if region_split: - if len( region_split ) != 2: - print >> sys.stderr, "You must specify both a start and an end, or only a chromosome when specifying regions." 
- cleanup_before_exit( tmp_dir ) - sys.exit( 1 ) - region = tuple( [ region ] + map( int, region_split ) ) - regions.append( region ) - - coverage = VCFReadGroupGenotyper( map( lambda x: Reader( *x ), bam_files ), options.reference_genome_filename, dtype=options.coverage_dtype, - min_support_depth=options.min_support_depth, min_base_quality=options.min_base_quality, - min_mapping_quality=options.min_mapping_quality, restrict_regions=regions, use_strand=options.use_strand, - allow_out_of_bounds_positions=options.allow_out_of_bounds_positions, safe=options.safe ) - for line in coverage.iter_vcf( ploidy=options.ploidy, variants_only=options.variants_only ): - out.write( "%s\n" % line ) - out.close() - -if __name__ == "__main__": main() |
b |
diff -r cfc86c3fc5c8 -r 5c852eca82e0 tools/naive_variant_caller.xml --- a/tools/naive_variant_caller.xml Fri Feb 17 11:42:07 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,226 +0,0 @@\n-<tool id="naive_variant_caller" name="Naive Variant Caller" version="0.0.2">\n- <description> - tabulate variable sites from BAM datasets</description>\n- <requirements>\n- <requirement type="package" version="1.7.1">numpy</requirement>\n- <requirement type="package" version="0.0.1">pyBamParser</requirement>\n- <requirement type="package" version="0.0.2">pyBamTools</requirement>\n- </requirements>\n- <stdio>\n- <exit_code range="1:" err_level="fatal" />\n- <exit_code range=":-1" err_level="fatal" />\n- </stdio>\n- <command interpreter="python">naive_variant_caller.py\n- -o "${output_vcf}"\n- \n- #for $input_bam in $reference_source.input_bams:\n- -b "${input_bam.input_bam}"\n- -i "${input_bam.input_bam.metadata.bam_index}"\n- #end for\n- \n- #if $reference_source.reference_source_selector != "history":\n- -r "${reference_source.ref_file.fields.path}"\n- #elif $reference_source.ref_file:\n- -r "${reference_source.ref_file}"\n- #end if\n- \n- #for $region in $regions:\n- --region "${region.chromosome}:${region.start}-${region.end}"\n- #end for\n- \n- ${variants_only}\n- \n- ${use_strand}\n- \n- --ploidy "${$ploidy}"\n- \n- --min_support_depth "${min_support_depth}"\n- \n- #if str($min_base_quality):\n- --min_base_quality "${min_base_quality}"\n- #end if\n- \n- #if str($min_mapping_quality):\n- --min_mapping_quality "${min_mapping_quality}"\n- #end if\n- \n- --allow_out_of_bounds_positions\n- \n- #if str( $advanced_options.advanced_options_selector ) == "advanced":\n- #if str( $advanced_options.coverage_dtype ) != "guess":\n- --coverage_dtype "${advanced_options.coverage_dtype}"\n- #end if\n- ${advanced_options.safe}\n- #end if \n- </command>\n- <version_command interpreter="python">naive_variant_caller.py --version</version_command>\n- <inputs>\n- <conditional name="reference_source">\n- <param name="reference_source_selector" type="select" label="Choose the source for the reference list">\n- <option value="cached">Locally cached</option>\n- 
<option value="history">History</option>\n- </param>\n- <when value="cached">\n- <repeat name="input_bams" title="BAM file" min="1" >\n- <param name="input_bam" type="data" format="bam" label="BAM file">\n- <validator type="unspecified_build" />\n- <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="value" message="Sequences are not currently available for the specified build." /> <!-- fixme!!! this needs to be a select -->\n- </param>\n- </repeat>\n- <param name="ref_file" type="select" label="Using reference genome" >\n- <options from_data_table="sam_fa_indexes">\n- <!-- <filter type="data_meta" key="dbkey" ref="input_bam" column="dbkey"/> does not yet work in a repeat...--> \n- </options>\n- <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>\n- </param>\n- </when>\n- <when value="history"> <!-- FIX ME!!!! -->\n- <repeat name="input_bams" title="BAM file" min="1" >\n- <param name="input_bam" type="data" format="bam" label="BAM file" >\n- </param>\n- </repeat>\n- <param name="ref_file" type="data" format="fasta" label="Using reference file" optional="True" />\n- </when>\n- </conditional>\n-\n- <repeat name="regions" title="Restrict to regions" min="0" >\n- <param name="chromosome" type="text" value="" optional="False" label="Chromosome" />\n- <param name="start" type="integer" value="" optional="True" label="Start" />\n- <param name="end" type="integer" value="" optional="True" label="End" />\n- </repeat>\n-\n- <!-- TODO: enhance filtering -->\n'..b'.\n- \n-\n-------\n-\n-**Inputs**\n-\n-Accepts one or more BAM input files and a reference genome from the built-in list or from a FASTA file in your history.\n-\n-\n-**Outputs**\n-\n-The output is in VCF format.\n-\n-Example VCF output line, without reporting by strand:\n- 
``chrM\t16029\t.\tT\tG,A,C\t.\t.\tAC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095\tGT:AC:AF:NC\t0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:A=9,C=5,T=9629,G=15,``\n-\n-Example VCF output line, when reporting by strand:\n- ``chrM\t16029\t.\tT\tG,A,C\t.\t.\tAC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095\tGT:AC:AF:NC\t0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:+T=3972,-A=9,-C=5,-T=5657,-G=15,``\n-\n-**Options**\n-\n-Reference Genome:\n-\n- Ensure that you have selected the correct reference genome, either from the list of built-in genomes or by selecting the corresponding FASTA file from your history.\n-\n-Restrict to regions:\n-\n- You can specify any number of regions on which you would like to receive results. You can specify just a chromosome name, or a chromosome name and start postion, or a chromosome name and start and end position for the set of desired regions. \n-\n-Minimum number of reads needed to consider a REF/ALT:\n-\n- This value declares the minimum number of reads containing a particular base at each position in order to list and use said allele in genotyping calls. Default is 0.\n-\n-Minimum base quality:\n-\n- The minimum base quality score needed for the position in a read to be used for nucleotide counts and genotyping. Default is no filter.\n-\n-Minimum mapping quality:\n-\n- The minimum mapping quality score needed to consider a read for nucleotide counts and genotyping. Default is no filter.\n-\n-Ploidy:\n-\n- The number of genotype calls to make at each reported position.\n-\n-Only write out positions with possible alternate alleles:\n-\n- When set, only positions which have at least one non-reference nucleotide which passes declare filters will be present in the output.\n-\n-Report counts by strand:\n-\n- When set, nucleotide counts (NC) will be reported in reference to the aligned read\'s source strand. 
Reported as: <strand><BASE>=<COUNT>.\n-\n-Choose the dtype to use for storing coverage information:\n-\n- This controls the maximum depth value for each nucleotide/position/strand (when specified). Smaller values require the least amount of memory, but have smaller maximal limits.\n-\n- +--------+----------------------------+\n- | name | maximum coverage value |\n- +========+============================+\n- | uint8 | 255 |\n- +--------+----------------------------+\n- | uint16 | 65,535 |\n- +--------+----------------------------+\n- | uint32 | 4,294,967,295 |\n- +--------+----------------------------+\n- | uint64 | 18,446,744,073,709,551,615 |\n- +--------+----------------------------+\n-\n-\n- </help>\n- <tests>\n- <test>\n- <param name="reference_source_selector" value="history" />\n- <param name="input_bam" value="fake_phiX174_reads_1.bam" ftype="bam" /> \n- <param name="ref_file" value="phiX174.fasta" ftype="fasta" />\n- <param name="regions" value="0" />\n- <param name="min_support_depth" value="0" />\n- <param name="min_base_quality" value="" />\n- <param name="min_mapping_quality" value="" />\n- <param name="ploidy" value="2" />\n- <param name="variants_only" value="False" />\n- <param name="use_strand" value="False" />\n- <param name="advanced_options_selector" value="advanced" />\n- <param name="coverage_dtype" value="uint8" />\n- <output name="output_vcf" file="fake_phiX174_reads_1_test_out_1.vcf" compare="contains" />\n- </test>\n- </tests>\n-\n- <citations>\n- <citation type="doi">10.1186/gb4161</citation>\n- </citations>\n-\n-</tool>\n' |