Mercurial > repos > iuc > data_manager_hisat2_index_builder

--- a/data_manager/hisat2_index_builder.py	Mon Nov 23 09:41:52 2015 -0500
+++ b/data_manager/hisat2_index_builder.py	Tue Apr 04 18:09:40 2017 -0400
@@ -1,14 +1,13 @@
 #!/usr/bin/env python
 # Based heavily on the Bowtie 2 data manager wrapper script by Dan Blankenberg
+from __future__ import print_function

-import shlex
-import sys
-import os
 import argparse
+import os
+import shlex
 import subprocess
-
-from json import loads, dumps
-
+import sys
+from json import dumps, loads

 DEFAULT_DATA_TABLE_NAME = "hisat2_indexes"

@@ -41,7 +40,7 @@
     proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
     return_code = proc.wait()
     if return_code:
-        print >> sys.stderr, "Error building index."
+        print("Error building index.", file=sys.stderr)
         sys.exit( return_code )
     data_table_entry = dict( value=sequence_id, dbkey=options.fasta_dbkey, name=sequence_name, path=sequence_id )
     _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry )
@@ -71,7 +70,7 @@
     data_manager_dict = {}

     if options.fasta_dbkey in [ None, '', '?' ]:
-        raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' % ( dbkey ) )
+        raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' % ( options.fasta_dbkey ) )

     sequence_id, sequence_name = get_id_name( params, dbkey=options.fasta_dbkey, fasta_description=options.fasta_description )

@@ -79,7 +78,8 @@
     build_hisat_index( data_manager_dict, options, params, sequence_id, sequence_name )

     # save info to json file
-    open( filename, 'wb' ).write( dumps( data_manager_dict ) )
+    open( filename, 'w' ).write( dumps( data_manager_dict ) )
+

 if __name__ == "__main__":
     main()
--- a/data_manager/hisat2_index_builder.xml	Mon Nov 23 09:41:52 2015 -0500
+++ b/data_manager/hisat2_index_builder.xml	Tue Apr 04 18:09:40 2017 -0400
@@ -1,46 +1,44 @@
-<tool id="hisat2_index_builder_data_manager" name="HISAT2 index" tool_type="manage_data" version="1.0.0">
+<tool id="hisat2_index_builder_data_manager" name="HISAT2 index" tool_type="manage_data" version="2.0.5">
     <description>builder</description>
     <requirements>
-        <requirement type="package" version="2.0">hisat</requirement>
+        <requirement type="package" version="2.0.5">hisat2</requirement>
     </requirements>
-    <stdio>
-        <exit_code range=":-1" />
-        <exit_code range="1:" />
-    </stdio>
-    <command><![CDATA[
+    <command detect_errors="exit_code"><![CDATA[
         #if $advanced.adv_param_select == 'yes' and $advanced.gtf_input:
-            ln -s "${advanced.gtf_input}" gtf_file.gtf &&
-            python \$HISAT2_ROOT_DIR/bin/extract_splice_sites.py gtf_file.gtf > splice_sites.txt &&
-            python \$HISAT2_ROOT_DIR/bin/extract_exons.py gtf_file.gtf > exon.txt &&
-            ls -lh &&
+            ln -s '${advanced.gtf_input}' gtf_file.gtf &&
+            hisat2_extract_splice_sites.py gtf_file.gtf > splice_sites.txt &&
+            hisat2_extract_exons.py gtf_file.gtf > exon.txt &&
         #end if
         #if $advanced.adv_param_select == 'yes' and $advanced.snps:
-            ln -s "${all_fasta_source.fields.path}" genome.fa &&
-            ln -s "${advanced.snps}" snps.tabular &&
-            python \$HISAT2_ROOT_DIR/bin/extract_snps.py --genome_file genome.fa --snp_file snps.tabular > snps.txt &&
+            ln -s '${advanced.snps}' snps.tabular &&
+            #if $advanced.snps.is_of_type('vcf')
+                hisat2_extract_snps_haplotypes_VCF.py '${all_fasta_source.fields.path}' snps.tabular extracted &&
+            #else
+                hisat2_extract_snps_haplotypes_UCSC.py '${all_fasta_source.fields.path}' snps.tabular extracted &&
+            #end if
         #end if
-        python $__tool_directory__/hisat2_index_builder.py --output "${out_file}"
-            --fasta_filename "${all_fasta_source.fields.path}"
-            --fasta_dbkey "${all_fasta_source.fields.dbkey}"
-            --fasta_description "${all_fasta_source.fields.name}"
-            --data_table_name "hisat2_indexes"
+        python '$__tool_directory__/hisat2_index_builder.py' --output '${out_file}'
+            --fasta_filename '${all_fasta_source.fields.path}'
+            --fasta_dbkey '${all_fasta_source.fields.dbkey}'
+            --fasta_description '${all_fasta_source.fields.name}'
+            --data_table_name hisat2_indexes
+            --indexer_options "-p \${GALAXY_SLOTS:-1}
             #if $advanced.adv_param_select == 'yes':
-                --indexer_options "
                 --noauto
-                -p \${GALAXY_SLOTS:-1}
-                #if $snps:
-                    --snps `pwd`/snps.txt
+                #if $advanced.snps:
+                    --snps "`pwd`/extracted.snp"
+                    --haplotype "`pwd`/extracted.haplotype"
                 #end if
                 #if $advanced.gtf_input:
-                    --ss `pwd`/splice_sites.txt
-                    --exon `pwd`/exon.txt
+                    --ss "`pwd`/splice_sites.txt"
+                    --exon "`pwd`/exon.txt"
                 #end if
                 --bmax $advanced.bmax
                 --bmaxdivn $advanced.bmaxdivn
                 --dcv $advanced.dcv
                 --offrate $advanced.offrate
-                "
             #end if
+            "
         ]]>
     </command>
     <inputs>
@@ -52,21 +50,21 @@
                 <option value="no">Use defaults</option>
                 <option value="yes">Fine-tune indexing parameters</option>
             </param>
+            <when value="no" />
             <when value="yes">
-                <param type="integer" name="bmax" label="Maximum number of suffixes allowed in a block." help="--bmax" value="4" />
-                <param type="integer" name="bmaxdivn" label="Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference." help="--bmaxdivn" value="4" />
-                <param type="integer" name="dcv" label="Period for the difference-cover sample." help="--dcv: A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096. " value="1024" min="2" max="4096" />
-                <param type="integer" name="offrate" label="Mark rows in the Burrows-Wheeler transform" help="--offrate: To map alignments back to positions on the reference sequences, it's necessary to annotate (&quot;mark&quot;) some or all of the Burrows-Wheeler rows with their corresponding location on the genome. This parameter governs how many rows get marked: the indexer will mark every 2^&lt;int&gt; rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 4 (every 16th row is marked; for human genome, annotations occupy about 680 megabytes)." value="4" />
-                <param type="data" format="tabular" name="snps" label="Provide a list of SNPs in the UCSC dbSNP format" optional="True" help="This should be a dataset in the Data Manager History (automatically created). If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction." />
-                <param type="data" format="gtf" name="gtf_input" label="Provide a GTF file for HISAT2 to extract splice sites from" optional="True" help="This should be a dataset in the Data Manager History (automatically created). If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction." />
+                <param argument="--bmax" type="integer" value="4" label="Maximum number of suffixes allowed in a block" />
+                <param argument="--bmaxdivn" type="integer" value="4" label="Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference" />
+                <param argument="--dcv" type="integer" min="2" max="4096" value="1024" label="Period for the difference-cover sample" help="A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096" />
+                <param argument="--offrate" type="integer" value="4" label="Mark rows in the Burrows-Wheeler transform" help="To map alignments back to positions on the reference sequences, it's necessary to annotate (&quot;mark&quot;) some or all of the Burrows-Wheeler rows with their corresponding location on the genome. This parameter governs how many rows get marked: the indexer will mark every 2^&lt;int&gt; rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 4 (every 16th row is marked; for human genome, annotations occupy about 680 megabytes)" />
+                <param name="snps" type="data" format="tabular,vcf" optional="true" label="Provide a list of SNPs in the UCSC dbSNP or VCF format" help="If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction" />
+                <param name="gtf_input" type="data" format="gtf" optional="true" label="Provide a GTF file for HISAT2 to extract splice sites from" help="If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction" />
             </when>
-            <when value="no" />
         </conditional>
-        <param label="Name of sequence" name="sequence_name" type="text" value="" />
-        <param label="ID for sequence" name="sequence_id" type="text" value="" />
+        <param name="sequence_name" type="text" value="" label="Name of sequence" />
+        <param name="sequence_id" type="text" value="" label="ID for sequence" />
     </inputs>
     <outputs>
-        <data format="data_manager_json" name="out_file" />
+        <data name="out_file" format="data_manager_json" />
     </outputs>
     <help>
 <![CDATA[
--- a/tool_data_table_conf.xml.sample	Mon Nov 23 09:41:52 2015 -0500
+++ b/tool_data_table_conf.xml.sample	Tue Apr 04 18:09:40 2017 -0400
@@ -1,12 +1,12 @@
 <!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
 <tables>
     <!-- Locations of all fasta files under genome directory -->
-    <table name="all_fasta" comment_char="#">
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
         <columns>value, dbkey, name, path</columns>
         <file path="tool-data/all_fasta.loc" />
     </table>
     <!-- Locations of indexes in the hisat mapper format -->
-    <table name="hisat2_indexes" comment_char="#">
+    <table name="hisat2_indexes" comment_char="#" allow_duplicate_entries="False">
         <columns>value, dbkey, name, path</columns>
         <file path="tool-data/hisat2_indexes.loc" />
     </table>
--- a/tool_dependencies.xml	Mon Nov 23 09:41:52 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="hisat" version="2.0">
-        <repository changeset_revision="c65f00072e57" name="package_hisat_2_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>