Galaxy |

Changeset 14:5c852eca82e0 (2018-02-28)

Previous changeset 13:cfc86c3fc5c8 (2017-02-17) Next changeset 15:aff38ea879f1 (2018-02-28)

Commit message:
planemo upload for repository https://github.com/blankenberg/tools-blankenberg/tree/master/tools/naive_variant_caller commit a1f39a3e28911591f6a1ed58a43e95e0baf5e750

modified:
README.rst

added:
naive_variant_caller.xml
tool_data_table_conf.xml.sample

removed:
dependency_configs/tool_dependencies.xml
tool-data/tool_data_table_conf.xml.sample
tools/naive_variant_caller.py
tools/naive_variant_caller.xml

diff -r cfc86c3fc5c8 -r 5c852eca82e0 README.rst
--- a/README.rst Fri Feb 17 11:42:07 2017 -0500
+++ b/README.rst Wed Feb 28 15:54:57 2018 -0500

@@ -1,4 +1,4 @@
-This repository contains the **Naive Variant Caller** tool.
+This repository contains the **Naive Variant Caller** tool (NVC).

------

diff -r cfc86c3fc5c8 -r 5c852eca82e0 dependency_configs/tool_dependencies.xml
--- a/dependency_configs/tool_dependencies.xml Fri Feb 17 11:42:07 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,12 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency> <!-- Tool Shed package repositories, each pinned to a changeset revision -->
-    <package name="numpy" version="1.7.1">
-        <repository name="package_numpy_1_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="300877695495"/>
-    </package>
-    <package name="pyBamParser" version="0.0.1">
-        <repository name="package_pybamparser_0_0_1" owner="blankenberg" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="144681ee972c"/>
-    </package>
-    <package name="pyBamTools" version="0.0.2">
-        <repository name="package_pybamtools_0_0_2" owner="blankenberg" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="6819855ac2e8"/>
-    </package>
-</tool_dependency>

diff -r cfc86c3fc5c8 -r 5c852eca82e0 naive_variant_caller.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/naive_variant_caller.xml Wed Feb 28 15:54:57 2018 -0500

b'@@ -0,0 +1,232 @@\n+<tool id="naive_variant_caller" name="Naive Variant Caller" version="0.0.3">\n+ <description> - tabulate variable sites from BAM datasets</description>\n+ <requirements>\n+ <requirement type="package" version="0.0.3">nvc</requirement>\n+ </requirements>\n+ <stdio>\n+ <exit_code range="1:" />\n+ <exit_code range=":-1" />\n+ </stdio>\n+ <version_command>naive_variant_caller.py --version</version_command>\n+ <command>naive_variant_caller.py\n+ -o "${output_vcf}"\n+ \n+ #for $input_bam in $reference_source.input_bams:\n+ -b \'${input_bam.input_bam}\'\n+ -i \'${input_bam.input_bam.metadata.bam_index}\'\n+ #end for\n+ \n+ #if $reference_source.reference_source_selector != "history":\n+ -r \'${reference_source.ref_file.fields.path}\'\n+ #elif $reference_source.ref_file:\n+ -r \'${reference_source.ref_file}\'\n+ #end if\n+ \n+ #for $region in $regions:\n+ --region \'${region.chromosome}:${region.start}-${region.end}\'\n+ #end for\n+ \n+ #for $region_file in $region_files:\n+ --regions_filename \'${region_file.input_region}\'\n+ --regions_file_columns \'${int($region_file.input_region.metadata.chromCol)-1},${int($region_file.input_region.metadata.startCol)-1},${int($region_file.input_region.metadata.endCol)-1}\'\n+ #end for\n+\n+ ${variants_only}\n+ \n+ ${use_strand}\n+ \n+ --ploidy \'${$ploidy}\'\n+ \n+ --min_support_depth \'${min_support_depth}\'\n+ \n+ #if str($min_base_quality):\n+ --min_base_quality \'${min_base_quality}\'\n+ #end if\n+ \n+ #if str($min_mapping_quality):\n+ --min_mapping_quality \'${min_mapping_quality}\'\n+ #end if\n+ \n+ --allow_out_of_bounds_positions\n+ \n+ #if str( $advanced_options.advanced_options_selector ) == "advanced":\n+ #if str( $advanced_options.coverage_dtype ) != "guess":\n+ --coverage_dtype \'${advanced_options.coverage_dtype}\'\n+ #end if\n+ ${advanced_options.safe}\n+ #end if \n+ </command>\n+ <inputs>\n+ <conditional name="reference_source">\n+ <param name="reference_source_selector" type="select" label="Choose 
the source for the reference list">\n+ <option value="cached">Locally cached</option>\n+ <option value="history">History</option>\n+ </param>\n+ <when value="cached">\n+ <repeat name="input_bams" title="BAM file" min="1" >\n+ <param name="input_bam" type="data" format="bam" label="BAM file">\n+ <validator type="unspecified_build" />\n+ <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="value" message="Sequences are not currently available for the specified build." /> \n+ </param>\n+ </repeat>\n+ <param name="ref_file" type="select" label="Using reference genome" >\n+ <options from_data_table="sam_fa_indexes">\n+  \n+ </options>\n+ <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>\n+ </param>\n+ </when>\n+ <when value="history"> \n+ <repeat name="input_bams" title="BAM file" min="1" >\n+ <param name="input_bam" type="data" format="bam" label="BAM file" >\n+ </param>\n+ </repeat>\n+ <param name="ref_file" type="data" format="fasta" label="Using reference file" optional="True" />\n+ </when>\n+ </conditional>\n+\n+ <repeat name="regions" title="Restrict to regions" min="0" >\n+ <param name="chromosome" type="text" value="" optional="False" label="Chromosome" />\n+ <param name="start" type="integer" value="" optional="True" label="Start" help="0-based, closed. (BED style)" />\n+ <param n'..b'ligned sequencing reads from the BAM format and produces a VCF file containing per position variant calls. This tool allows multiple BAM files to be provided as input and utilizes read group information to make calls for individual samples. \n+\n+User configurable options allow filtering reads that do not pass mapping or base quality thresholds and minimum per base read depth; user\'s can also specify the ploidy and whether to consider each strand separately. 
\n+\n+In addition to calling alternate alleles based upon simple ratios of nucleotides at a position, per base nucleotide counts are also provided. A custom tag, NC, is used within the Genotype fields. The NC field is a comma-separated listing of nucleotide counts in the form of <nucleotide>=<count>, where a plus or minus character is prepended to indicate strand, if the strandedness option was specified.\n+ \n+\n+------\n+\n+**Inputs**\n+\n+Accepts one or more BAM input files and a reference genome from the built-in list or from a FASTA file in your history.\n+\n+\n+**Outputs**\n+\n+The output is in VCF format.\n+\n+Example VCF output line, without reporting by strand:\n+ ``chrM\t16029\t.\tT\tG,A,C\t.\t.\tAC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095\tGT:AC:AF:NC\t0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:A=9,C=5,T=9629,G=15,``\n+\n+Example VCF output line, when reporting by strand:\n+ ``chrM\t16029\t.\tT\tG,A,C\t.\t.\tAC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095\tGT:AC:AF:NC\t0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:+T=3972,-A=9,-C=5,-T=5657,-G=15,``\n+\n+**Options**\n+\n+Reference Genome:\n+\n+ Ensure that you have selected the correct reference genome, either from the list of built-in genomes or by selecting the corresponding FASTA file from your history.\n+\n+Restrict to regions:\n+\n+ You can specify any number of regions on which you would like to receive results. You can specify just a chromosome name, or a chromosome name and start postion, or a chromosome name and start and end position for the set of desired regions. \n+\n+Minimum number of reads needed to consider a REF/ALT:\n+\n+ This value declares the minimum number of reads containing a particular base at each position in order to list and use said allele in genotyping calls. 
Default is 0.\n+\n+Minimum base quality:\n+\n+ The minimum base quality score needed for the position in a read to be used for nucleotide counts and genotyping. Default is no filter.\n+\n+Minimum mapping quality:\n+\n+ The minimum mapping quality score needed to consider a read for nucleotide counts and genotyping. Default is no filter.\n+\n+Ploidy:\n+\n+ The number of genotype calls to make at each reported position.\n+\n+Only write out positions with possible alternate alleles:\n+\n+ When set, only positions which have at least one non-reference nucleotide which passes declare filters will be present in the output.\n+\n+Report counts by strand:\n+\n+ When set, nucleotide counts (NC) will be reported in reference to the aligned read\'s source strand. Reported as: <strand><BASE>=<COUNT>.\n+\n+Choose the dtype to use for storing coverage information:\n+\n+ This controls the maximum depth value for each nucleotide/position/strand (when specified). Smaller values require the least amount of memory, but have smaller maximal limits.\n+\n+ +--------+----------------------------+\n+ | name | maximum coverage value |\n+ +========+============================+\n+ | uint8 | 255 |\n+ +--------+----------------------------+\n+ | uint16 | 65,535 |\n+ +--------+----------------------------+\n+ | uint32 | 4,294,967,295 |\n+ +--------+----------------------------+\n+ | uint64 | 18,446,744,073,709,551,615 |\n+ +--------+----------------------------+\n+\n+\n+ </help>\n+ <citations>\n+ <citation type="doi">10.1186/gb4161</citation>\n+ </citations>\n+\n+</tool>\n'

diff -r cfc86c3fc5c8 -r 5c852eca82e0 tool-data/tool_data_table_conf.xml.sample
--- a/tool-data/tool_data_table_conf.xml.sample Fri Feb 17 11:42:07 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,7 +0,0 @@
-<tables>
-    <!-- Data table "sam_fa_indexes" (columns line_type, value, path), loaded from tool-data/sam_fa_indices.loc; "#" is the comment character. The tool's cached-reference selector reads this table. -->
-    <table name="sam_fa_indexes" comment_char="#">
-        <columns>line_type, value, path</columns>
-        <file path="tool-data/sam_fa_indices.loc" />
-    </table>
-</tables>

diff -r cfc86c3fc5c8 -r 5c852eca82e0 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Wed Feb 28 15:54:57 2018 -0500

@@ -0,0 +1,7 @@
+<tables>
+    <!-- Data table "sam_fa_indexes" (columns line_type, value, path), loaded from tool-data/sam_fa_indices.loc; "#" is the comment character. The tool's cached-reference selector reads this table. -->
+    <table name="sam_fa_indexes" comment_char="#">
+        <columns>line_type, value, path</columns>
+        <file path="tool-data/sam_fa_indices.loc" />
+    </table>
+</tables>

diff -r cfc86c3fc5c8 -r 5c852eca82e0 tools/naive_variant_caller.py
--- a/tools/naive_variant_caller.py Fri Feb 17 11:42:07 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,72 +0,0 @@
-#!/usr/bin/env python
-#Dan Blankenberg
-import sys
-import optparse
-
-from pyBamParser.bam import Reader
-from pyBamTools.genotyping.naive import VCFReadGroupGenotyper, PROGRAM_NAME, PROGRAM_VERSION
-
-def main():
-    #Parse Command Line
-    parser = optparse.OptionParser()
-    parser.add_option( '-b', '--bam', dest='bam_file', action='append', type="string", default=[], help='BAM filename, optionally index filename. Multiple allowed.' )
-    parser.add_option( '-i', '--index', dest='index_file', action='append', type="string", default=[], help='optionally index filename. Multiple allowed.' )
-    parser.add_option( '-o', '--output_vcf_filename', dest='output_vcf_filename', action='store', default = None, type="string", help='Output VCF filename' )
-    parser.add_option( '-r', '--reference_genome_filename', dest='reference_genome_filename', action='store', default = None, type="string", help='Input reference file' )
-    parser.add_option( '-v', '--variants_only', dest='variants_only', action='store_true', default = False, help='Report only sites with a possible variant allele.' )
-    parser.add_option( '-s', '--use_strand', dest='use_strand', action='store_true', default = False, help='Report counts by strand' )
-    parser.add_option( '-p', '--ploidy', dest='ploidy', action='store', type="int", default=2, help='Ploidy. Default=2.' )
-    parser.add_option( '-d', '--min_support_depth', dest='min_support_depth', action='store', type="int", default=0, help='Minimum number of reads needed to consider a REF/ALT. Default=0.' )
-    parser.add_option( '-q', '--min_base_quality', dest='min_base_quality', action='store', type="int", default=None, help='Minimum base quality.' )
-    parser.add_option( '-m', '--min_mapping_quality', dest='min_mapping_quality', action='store', type="int", default=None, help='Minimum mapping.' )
-    parser.add_option( '-t', '--coverage_dtype', dest='coverage_dtype', action='store', type="string", default=None, help='dtype to use for coverage array' )
-    parser.add_option( '--allow_out_of_bounds_positions', dest='allow_out_of_bounds_positions', action='store_true', default = False, help='Allows out of bounds positions to not throw fatal errors' )
-    parser.add_option( '--safe', dest='safe', action='store_true', default = False, help='Perform checks to prevent certain errors. Is slower.' )
-    parser.add_option( '--region', dest='region', action='append', type="string", default=[], help='region' )
-    parser.add_option( '', '--version', dest='version', action='store_true', default = False, help='Report version and quit' )
-    (options, args) = parser.parse_args()
-
-    if options.version:
-        print "%s version %s" % ( PROGRAM_NAME, PROGRAM_VERSION )
-        sys.exit(0)
-
-    if len( options.bam_file ) == 0:
-        print >>sys.stderr, 'You must provide at least one bam (-b) file.'
-        parser.print_help( sys.stderr )
-        sys.exit( 1 )
-    if options.index_file:
-        assert len( options.index_file ) == len( options.bam_file ), "If you provide a name for an index file, you must provide the index name for all bam files."
-        bam_files = zip( options.bam_file, options.index_file )
-    else:
-        bam_files = [ ( x, ) for x in options.bam_file ]
-    if not options.reference_genome_filename:
-        print >> sys.stderr, "Warning: Reference file has not been specified. Providing a reference genome is highly recommended."
-    if options.output_vcf_filename:
-        out = open( options.output_vcf_filename, 'wb' )
-    else:
-        out = sys.stdout
-
-    regions = []
-    if options.region:
-        for region in options.region:
-            region_split = region.split( ":" )
-            region = region_split.pop( 0 )
-            if region_split:
-                region_split = filter( bool, region_split[0].split( '-' ) )
-                if region_split:
-                    if len( region_split ) != 2:
-                        print >> sys.stderr, "You must specify both a start and an end, or only a chromosome when specifying regions."
-                        cleanup_before_exit( tmp_dir )
-                        sys.exit( 1 )
-                    region = tuple( [ region ] + map( int, region_split ) )
-            regions.append( region )
-
-    coverage = VCFReadGroupGenotyper( map( lambda x: Reader( *x ), bam_files ), options.reference_genome_filename, dtype=options.coverage_dtype,
-                                               min_support_depth=options.min_support_depth, min_base_quality=options.min_base_quality,
-                                               min_mapping_quality=options.min_mapping_quality, restrict_regions=regions, use_strand=options.use_strand,
-                                               allow_out_of_bounds_positions=options.allow_out_of_bounds_positions, safe=options.safe )
-    for line in coverage.iter_vcf( ploidy=options.ploidy, variants_only=options.variants_only ):
-        out.write( "%s\n" % line )
-    out.close()
-
-if __name__ == "__main__": main()

diff -r cfc86c3fc5c8 -r 5c852eca82e0 tools/naive_variant_caller.xml
--- a/tools/naive_variant_caller.xml Fri Feb 17 11:42:07 2017 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,226 +0,0 @@\n-<tool id="naive_variant_caller" name="Naive Variant Caller" version="0.0.2">\n- <description> - tabulate variable sites from BAM datasets</description>\n- <requirements>\n- <requirement type="package" version="1.7.1">numpy</requirement>\n- <requirement type="package" version="0.0.1">pyBamParser</requirement>\n- <requirement type="package" version="0.0.2">pyBamTools</requirement>\n- </requirements>\n- <stdio>\n- <exit_code range="1:" err_level="fatal" />\n- <exit_code range=":-1" err_level="fatal" />\n- </stdio>\n- <command interpreter="python">naive_variant_caller.py\n- -o "${output_vcf}"\n- \n- #for $input_bam in $reference_source.input_bams:\n- -b "${input_bam.input_bam}"\n- -i "${input_bam.input_bam.metadata.bam_index}"\n- #end for\n- \n- #if $reference_source.reference_source_selector != "history":\n- -r "${reference_source.ref_file.fields.path}"\n- #elif $reference_source.ref_file:\n- -r "${reference_source.ref_file}"\n- #end if\n- \n- #for $region in $regions:\n- --region "${region.chromosome}:${region.start}-${region.end}"\n- #end for\n- \n- ${variants_only}\n- \n- ${use_strand}\n- \n- --ploidy "${$ploidy}"\n- \n- --min_support_depth "${min_support_depth}"\n- \n- #if str($min_base_quality):\n- --min_base_quality "${min_base_quality}"\n- #end if\n- \n- #if str($min_mapping_quality):\n- --min_mapping_quality "${min_mapping_quality}"\n- #end if\n- \n- --allow_out_of_bounds_positions\n- \n- #if str( $advanced_options.advanced_options_selector ) == "advanced":\n- #if str( $advanced_options.coverage_dtype ) != "guess":\n- --coverage_dtype "${advanced_options.coverage_dtype}"\n- #end if\n- ${advanced_options.safe}\n- #end if \n- </command>\n- <version_command interpreter="python">naive_variant_caller.py --version</version_command>\n- <inputs>\n- <conditional name="reference_source">\n- <param name="reference_source_selector" type="select" label="Choose the source for the reference list">\n- <option value="cached">Locally cached</option>\n- 
<option value="history">History</option>\n- </param>\n- <when value="cached">\n- <repeat name="input_bams" title="BAM file" min="1" >\n- <param name="input_bam" type="data" format="bam" label="BAM file">\n- <validator type="unspecified_build" />\n- <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="value" message="Sequences are not currently available for the specified build." /> \n- </param>\n- </repeat>\n- <param name="ref_file" type="select" label="Using reference genome" >\n- <options from_data_table="sam_fa_indexes">\n-  \n- </options>\n- <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>\n- </param>\n- </when>\n- <when value="history"> \n- <repeat name="input_bams" title="BAM file" min="1" >\n- <param name="input_bam" type="data" format="bam" label="BAM file" >\n- </param>\n- </repeat>\n- <param name="ref_file" type="data" format="fasta" label="Using reference file" optional="True" />\n- </when>\n- </conditional>\n-\n- <repeat name="regions" title="Restrict to regions" min="0" >\n- <param name="chromosome" type="text" value="" optional="False" label="Chromosome" />\n- <param name="start" type="integer" value="" optional="True" label="Start" />\n- <param name="end" type="integer" value="" optional="True" label="End" />\n- </repeat>\n-\n- \n'..b'.\n- \n-\n-------\n-\n-**Inputs**\n-\n-Accepts one or more BAM input files and a reference genome from the built-in list or from a FASTA file in your history.\n-\n-\n-**Outputs**\n-\n-The output is in VCF format.\n-\n-Example VCF output line, without reporting by strand:\n- ``chrM\t16029\t.\tT\tG,A,C\t.\t.\tAC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095\tGT:AC:AF:NC\t0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:A=9,C=5,T=9629,G=15,``\n-\n-Example VCF output line, when reporting by strand:\n- 
``chrM\t16029\t.\tT\tG,A,C\t.\t.\tAC=15,9,5;AF=0.00155311658729,0.000931869952371,0.000517705529095\tGT:AC:AF:NC\t0/0:15,9,5:0.00155311658729,0.000931869952371,0.000517705529095:+T=3972,-A=9,-C=5,-T=5657,-G=15,``\n-\n-**Options**\n-\n-Reference Genome:\n-\n- Ensure that you have selected the correct reference genome, either from the list of built-in genomes or by selecting the corresponding FASTA file from your history.\n-\n-Restrict to regions:\n-\n- You can specify any number of regions on which you would like to receive results. You can specify just a chromosome name, or a chromosome name and start postion, or a chromosome name and start and end position for the set of desired regions. \n-\n-Minimum number of reads needed to consider a REF/ALT:\n-\n- This value declares the minimum number of reads containing a particular base at each position in order to list and use said allele in genotyping calls. Default is 0.\n-\n-Minimum base quality:\n-\n- The minimum base quality score needed for the position in a read to be used for nucleotide counts and genotyping. Default is no filter.\n-\n-Minimum mapping quality:\n-\n- The minimum mapping quality score needed to consider a read for nucleotide counts and genotyping. Default is no filter.\n-\n-Ploidy:\n-\n- The number of genotype calls to make at each reported position.\n-\n-Only write out positions with possible alternate alleles:\n-\n- When set, only positions which have at least one non-reference nucleotide which passes declare filters will be present in the output.\n-\n-Report counts by strand:\n-\n- When set, nucleotide counts (NC) will be reported in reference to the aligned read\'s source strand. Reported as: <strand><BASE>=<COUNT>.\n-\n-Choose the dtype to use for storing coverage information:\n-\n- This controls the maximum depth value for each nucleotide/position/strand (when specified). 
Smaller values require the least amount of memory, but have smaller maximal limits.\n-\n- +--------+----------------------------+\n- | name | maximum coverage value |\n- +========+============================+\n- | uint8 | 255 |\n- +--------+----------------------------+\n- | uint16 | 65,535 |\n- +--------+----------------------------+\n- | uint32 | 4,294,967,295 |\n- +--------+----------------------------+\n- | uint64 | 18,446,744,073,709,551,615 |\n- +--------+----------------------------+\n-\n-\n- </help>\n- <tests>\n- <test>\n- <param name="reference_source_selector" value="history" />\n- <param name="input_bam" value="fake_phiX174_reads_1.bam" ftype="bam" /> \n- <param name="ref_file" value="phiX174.fasta" ftype="fasta" />\n- <param name="regions" value="0" />\n- <param name="min_support_depth" value="0" />\n- <param name="min_base_quality" value="" />\n- <param name="min_mapping_quality" value="" />\n- <param name="ploidy" value="2" />\n- <param name="variants_only" value="False" />\n- <param name="use_strand" value="False" />\n- <param name="advanced_options_selector" value="advanced" />\n- <param name="coverage_dtype" value="uint8" />\n- <output name="output_vcf" file="fake_phiX174_reads_1_test_out_1.vcf" compare="contains" />\n- </test>\n- </tests>\n-\n- <citations>\n- <citation type="doi">10.1186/gb4161</citation>\n- </citations>\n-\n-</tool>\n'