Mercurial > repos > iuc > data_manager_hisat2_index_builder
changeset 4:d210e1f185bd draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_hisat2_index_builder commit 8652f36a3a3838dca989426961561e81432acf4f
author | iuc |
---|---|
date | Tue, 04 Apr 2017 18:09:40 -0400 |
parents | 98a60a4cfb9a |
children | 8eac26f44d29 |
files | data_manager/hisat2_index_builder.py data_manager/hisat2_index_builder.xml tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 4 files changed, 45 insertions(+), 53 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/hisat2_index_builder.py Mon Nov 23 09:41:52 2015 -0500 +++ b/data_manager/hisat2_index_builder.py Tue Apr 04 18:09:40 2017 -0400 @@ -1,14 +1,13 @@ #!/usr/bin/env python # Based heavily on the Bowtie 2 data manager wrapper script by Dan Blankenberg +from __future__ import print_function -import shlex -import sys -import os import argparse +import os +import shlex import subprocess - -from json import loads, dumps - +import sys +from json import dumps, loads DEFAULT_DATA_TABLE_NAME = "hisat2_indexes" @@ -41,7 +40,7 @@ proc = subprocess.Popen( args=args, shell=False, cwd=target_directory ) return_code = proc.wait() if return_code: - print >> sys.stderr, "Error building index." + print("Error building index.", file=sys.stderr) sys.exit( return_code ) data_table_entry = dict( value=sequence_id, dbkey=options.fasta_dbkey, name=sequence_name, path=sequence_id ) _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ) @@ -71,7 +70,7 @@ data_manager_dict = {} if options.fasta_dbkey in [ None, '', '?' ]: - raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' % ( dbkey ) ) + raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' % ( options.fasta_dbkey ) ) sequence_id, sequence_name = get_id_name( params, dbkey=options.fasta_dbkey, fasta_description=options.fasta_description ) @@ -79,7 +78,8 @@ build_hisat_index( data_manager_dict, options, params, sequence_id, sequence_name ) # save info to json file - open( filename, 'wb' ).write( dumps( data_manager_dict ) ) + open( filename, 'w' ).write( dumps( data_manager_dict ) ) + if __name__ == "__main__": main()
--- a/data_manager/hisat2_index_builder.xml Mon Nov 23 09:41:52 2015 -0500 +++ b/data_manager/hisat2_index_builder.xml Tue Apr 04 18:09:40 2017 -0400 @@ -1,46 +1,44 @@ -<tool id="hisat2_index_builder_data_manager" name="HISAT2 index" tool_type="manage_data" version="1.0.0"> +<tool id="hisat2_index_builder_data_manager" name="HISAT2 index" tool_type="manage_data" version="2.0.5"> <description>builder</description> <requirements> - <requirement type="package" version="2.0">hisat</requirement> + <requirement type="package" version="2.0.5">hisat2</requirement> </requirements> - <stdio> - <exit_code range=":-1" /> - <exit_code range="1:" /> - </stdio> - <command><![CDATA[ + <command detect_errors="exit_code"><![CDATA[ #if $advanced.adv_param_select == 'yes' and $advanced.gtf_input: - ln -s "${advanced.gtf_input}" gtf_file.gtf && - python \$HISAT2_ROOT_DIR/bin/extract_splice_sites.py gtf_file.gtf > splice_sites.txt && - python \$HISAT2_ROOT_DIR/bin/extract_exons.py gtf_file.gtf > exon.txt && - ls -lh && + ln -s '${advanced.gtf_input}' gtf_file.gtf && + hisat2_extract_splice_sites.py gtf_file.gtf > splice_sites.txt && + hisat2_extract_exons.py gtf_file.gtf > exon.txt && #end if #if $advanced.adv_param_select == 'yes' and $advanced.snps: - ln -s "${all_fasta_source.fields.path}" genome.fa && - ln -s "${advanced.snps}" snps.tabular && - python \$HISAT2_ROOT_DIR/bin/extract_snps.py --genome_file genome.fa --snp_file snps.tabular > snps.txt && + ln -s '${advanced.snps}' snps.tabular && + #if $advanced.snps.is_of_type('vcf') + hisat2_extract_snps_haplotypes_VCF.py '${all_fasta_source.fields.path}' snps.tabular extracted && + #else + hisat2_extract_snps_haplotypes_UCSC.py '${all_fasta_source.fields.path}' snps.tabular extracted && + #end if #end if - python $__tool_directory__/hisat2_index_builder.py --output "${out_file}" - --fasta_filename "${all_fasta_source.fields.path}" - --fasta_dbkey "${all_fasta_source.fields.dbkey}" - --fasta_description 
"${all_fasta_source.fields.name}" - --data_table_name "hisat2_indexes" + python '$__tool_directory__/hisat2_index_builder.py' --output '${out_file}' + --fasta_filename '${all_fasta_source.fields.path}' + --fasta_dbkey '${all_fasta_source.fields.dbkey}' + --fasta_description '${all_fasta_source.fields.name}' + --data_table_name hisat2_indexes + --indexer_options "-p \${GALAXY_SLOTS:-1} #if $advanced.adv_param_select == 'yes': - --indexer_options " --noauto - -p \${GALAXY_SLOTS:-1} - #if $snps: - --snps `pwd`/snps.txt + #if $advanced.snps: + --snps "`pwd`/extracted.snp" + --haplotype "`pwd`/extracted.haplotype" #end if #if $advanced.gtf_input: - --ss `pwd`/splice_sites.txt - --exon `pwd`/exon.txt + --ss "`pwd`/splice_sites.txt" + --exon "`pwd`/exon.txt" #end if --bmax $advanced.bmax --bmaxdivn $advanced.bmaxdivn --dcv $advanced.dcv --offrate $advanced.offrate - " #end if + " ]]> </command> <inputs> @@ -52,21 +50,21 @@ <option value="no">Use defaults</option> <option value="yes">Fine-tune indexing parameters</option> </param> + <when value="no" /> <when value="yes"> - <param type="integer" name="bmax" label="Maximum number of suffixes allowed in a block." help="--bmax" value="4" /> - <param type="integer" name="bmaxdivn" label="Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference." help="--bmaxdivn" value="4" /> - <param type="integer" name="dcv" label="Period for the difference-cover sample." help="--dcv: A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096. " value="1024" min="2" max="4096" /> - <param type="integer" name="offrate" label="Mark rows in the Burrows-Wheeler transform" help="--offrate: To map alignments back to positions on the reference sequences, it's necessary to annotate ("mark") some or all of the Burrows-Wheeler rows with their corresponding location on the genome. 
This parameter governs how many rows get marked: the indexer will mark every 2^&lt;int&gt; rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 4 (every 16th row is marked; for human genome, annotations occupy about 680 megabytes)." value="4" /> - <param type="data" format="tabular" name="snps" label="Provide a list of SNPs in the UCSC dbSNP format" optional="True" help="This should be a dataset in the Data Manager History (automatically created). If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction." /> - <param type="data" format="gtf" name="gtf_input" label="Provide a GTF file for HISAT2 to extract splice sites from" optional="True" help="This should be a dataset in the Data Manager History (automatically created). If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction." /> + <param argument="--bmax" type="integer" value="4" label="Maximum number of suffixes allowed in a block" /> + <param argument="--bmaxdivn" type="integer" value="4" label="Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference" /> + <param argument="--dcv" type="integer" min="2" max="4096" value="1024" label="Period for the difference-cover sample" help="A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096" /> + <param argument="--offrate" type="integer" value="4" label="Mark rows in the Burrows-Wheeler transform" help="To map alignments back to positions on the reference sequences, it's necessary to annotate (&quot;mark&quot;) some or all of the Burrows-Wheeler rows with their corresponding location on the genome. 
This parameter governs how many rows get marked: the indexer will mark every 2^&lt;int&gt; rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 4 (every 16th row is marked; for human genome, annotations occupy about 680 megabytes)" /> + <param name="snps" type="data" format="tabular,vcf" optional="true" label="Provide a list of SNPs in the UCSC dbSNP or VCF format" help="If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction" /> + <param name="gtf_input" type="data" format="gtf" optional="true" label="Provide a GTF file for HISAT2 to extract splice sites from" help="If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction" /> </when> - <when value="no" /> </conditional> - <param label="Name of sequence" name="sequence_name" type="text" value="" /> - <param label="ID for sequence" name="sequence_id" type="text" value="" /> + <param name="sequence_name" type="text" value="" label="Name of sequence" /> + <param name="sequence_id" type="text" value="" label="ID for sequence" /> </inputs> <outputs> - <data format="data_manager_json" name="out_file" /> + <data name="out_file" format="data_manager_json" /> </outputs> <help> <![CDATA[
--- a/tool_data_table_conf.xml.sample Mon Nov 23 09:41:52 2015 -0500 +++ b/tool_data_table_conf.xml.sample Tue Apr 04 18:09:40 2017 -0400 @@ -1,12 +1,12 @@ <!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> <tables> <!-- Locations of all fasta files under genome directory --> - <table name="all_fasta" comment_char="#"> + <table name="all_fasta" comment_char="#" allow_duplicate_entries="False"> <columns>value, dbkey, name, path</columns> <file path="tool-data/all_fasta.loc" /> </table> <!-- Locations of indexes in the hisat mapper format --> - <table name="hisat2_indexes" comment_char="#"> + <table name="hisat2_indexes" comment_char="#" allow_duplicate_entries="False"> <columns>value, dbkey, name, path</columns> <file path="tool-data/hisat2_indexes.loc" /> </table>
--- a/tool_dependencies.xml Mon Nov 23 09:41:52 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="hisat" version="2.0"> - <repository changeset_revision="c65f00072e57" name="package_hisat_2_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>