changeset 0:0ec408bcfc80 draft

Uploaded
author crs4
date Wed, 11 Sep 2013 12:51:21 -0400
parents
children 386166019772
files COPYING ssake.py ssake.xml tool_dependencies.xml
diffstat 4 files changed, 307 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COPYING	Wed Sep 11 12:51:21 2013 -0400
@@ -0,0 +1,24 @@
+Copyright © 2012-2013 CRS4 Srl. http://www.crs4.it/
+Created by:
+Massimiliano Orsini <massimiliano.orsini@crs4.it>
+Gianmauro Cuccuru <gianmauro.cuccuru@crs4.it>
+Nicola Soranzo <nicola.soranzo@crs4.it>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ssake.py	Wed Sep 11 12:51:21 2013 -0400
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+"""
+SSAKE wrapper
+"""
+
+import logging
+import optparse
+import os
+import shutil
+import subprocess
+import tempfile
+
+def execute(cmd):
+    """
+    Run a shell command, discarding its standard output.
+
+    cmd is handed to the shell as a single string (shell=True), so the
+    caller is responsible for quoting any path or argument it contains.
+
+    Raises subprocess.CalledProcessError if the command exits with a
+    non-zero status.
+    """
+    # Use a context manager so the os.devnull handle is closed even when
+    # the command fails.
+    with open(os.devnull, 'w') as devnull:
+        subprocess.check_call(args=cmd, stdout=devnull, shell=True)
+
+
+def which(name, flags=os.X_OK):
+    """
+    Search PATH for executable files with the given name.
+
+    For every directory listed in the PATH environment variable the bare
+    name is checked first, followed by the name with each extension listed
+    in the PATHEXT environment variable (if set) appended.
+
+    name: file name to look for (converted with str()).
+    flags: mode passed to os.access() (default: os.X_OK).
+
+    Returns the list of matching paths in PATH order; the list is empty
+    when PATH is unset or nothing matches.
+    """
+    path = os.environ.get('PATH', None)
+    if path is None:
+        return []
+    # Build a real list: the extensions are re-used for every PATH entry.
+    exts = [e for e in os.environ.get('PATHEXT', '').split(os.pathsep) if e]
+    result = []
+    for directory in path.split(os.pathsep):
+        candidate = os.path.join(directory, str(name))
+        if os.access(candidate, flags):
+            result.append(candidate)
+        # Extensions are tried even when the bare name is absent
+        # (e.g. "tool.exe" exists but "tool" does not).
+        for ext in exts:
+            candidate_ext = candidate + ext
+            if os.access(candidate_ext, flags):
+                result.append(candidate_ext)
+    return result
+
+
+class SSAKE:
+    """
+    Driver for one SSAKE run inside a private temporary directory.
+
+    The constructor copies the parsed command-line options onto the
+    instance and creates the working directory; run() builds and executes
+    the command lines and moves the result files to their destinations.
+    The working directory is removed when the instance is destroyed.
+    """
+
+    def __init__(self, logger, options):
+        """
+        logger: logging.Logger-like object used for progress messages.
+        options: parsed option values (see __main__() for the attributes
+        that are read).
+        """
+        # Defined first so that __del__ is safe even if anything below fails.
+        self.wd = None
+        self.logger = logger
+        self.executables = ('SSAKE', 'makePairedOutput2EQUALfiles.pl', 'makePairedOutput2UNEQUALfiles.pl')
+        for executable in self.executables:
+            self.logger.debug(which(executable))
+        self.logger.debug('Creating temp dir')
+        self.wd = tempfile.mkdtemp()
+
+        self.kind_of_reads = int(options.kind_of_reads)
+        if self.kind_of_reads == 0:
+            # Unpaired reads: a single input file, no scaffolding options.
+            self.infile = options.if_unpaired
+            self.paired = 0
+        else:
+            self.infile_r1 = options.if_paired_r1
+            self.infile_r2 = options.if_paired_r2
+            self.paired = 1
+            self.insert_size = options.insert_size
+            self.minnumlinks = options.minnumlinks
+            self.error = options.error
+            self.maxlinkratio = options.maxlinkratio
+            self.minoverlap = options.minoverlap
+        self.mindepthofcoverage = options.mindepthofcoverage
+        self.minoverlappingbases = options.minoverlappingbases
+        self.mincall = options.mincall
+        self.baseratio = options.baseratio
+        self.ignore_header = options.ignore_header
+        self.prefix = options.prefix
+        self.contigs = options.contigs
+        self.log = options.logfile
+        self.short = options.short
+        self.singlets = options.singlets
+        # seeds_file is only defined when a seeds file was given; run()
+        # tests for its presence with hasattr().
+        if options.seeds_file:
+            self.seeds_file = options.seeds_file
+
+    def run(self):
+        """
+        Execute SSAKE and move its output files to their destinations.
+
+        For paired reads (kind_of_reads 1 or 2) the matching
+        makePairedOutput2*files.pl script is run first and SSAKE is then
+        fed the paired.fa (and, for kind 2, unpaired.fa) file expected in
+        the working directory. The process working directory is changed
+        to the temporary directory.
+
+        Raises subprocess.CalledProcessError if any command fails.
+        """
+        os.chdir(self.wd)
+        seeds = ''
+        if hasattr(self, 'seeds_file'):
+            seeds = " -s %s" % self.seeds_file
+        if self.kind_of_reads in (1, 2):
+            # executables[1] handles equal files, executables[2] unequal
+            # ones, so the kind of reads doubles as the index.
+            cmd = "%s %s %s %d" % (
+                self.executables[self.kind_of_reads], self.infile_r1,
+                self.infile_r2, self.insert_size)
+            self.logger.info("Preparing data")
+            execute(cmd)
+            paired_file = "%s/paired.fa" % self.wd
+            command = "%s -f %s" % (self.executables[0], paired_file)
+            if self.kind_of_reads == 2:
+                unpaired_file = "%s/unpaired.fa" % self.wd
+                command += " -g %s" % unpaired_file
+            command += " -k %d -e %s -a %s -x %d" % (
+                self.minnumlinks, self.error, self.maxlinkratio,
+                self.minoverlap)
+        else:
+            command = "%s -f %s" % (self.executables[0], self.infile)
+        command += " %s -w %d -m %d -o %d -r %s -h %s -b %s -p %s" % (seeds, self.mindepthofcoverage, self.minoverlappingbases, self.mincall, self.baseratio, self.ignore_header, self.prefix, self.paired)
+        self.logger.debug(command)
+        self.logger.info("Executing SSAKE")
+        execute(command)
+
+        result_base = os.path.join(self.wd, self.prefix)
+        # Text mode: the content is joined with a str below, which would
+        # fail with bytes on Python 3.
+        with open("%s.log" % result_base, 'r') as ssake_log_file:
+            self.logger.info("\n".join(["Log from SSAKE", ssake_log_file.read()]))
+        self.logger.info("Moving result files")
+        for suffix, destination in (('contigs', self.contigs),
+                                    ('short', self.short),
+                                    ('singlets', self.singlets)):
+            shutil.move("%s.%s" % (result_base, suffix), destination)
+
+    def __del__(self):
+        """Remove the temporary working directory, if one was created."""
+        wd = getattr(self, 'wd', None)
+        if wd is not None:
+            # Errors raised inside __del__ cannot be handled by anyone,
+            # so the clean-up is best-effort.
+            shutil.rmtree(wd, ignore_errors=True)
+
+
+# Record format and timestamp format handed to logging.basicConfig() in
+# __main__().
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+# Values accepted by the --loglevel command-line option; each one is looked
+# up as an attribute of the logging module.
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def __main__():
+    """
+    Command-line entry point.
+
+    Parses the options, configures logging (to --logfile when given,
+    otherwise to the logging default stream, stderr) and runs SSAKE.
+    Exits through parser.error() on positional arguments or when a
+    required option (--kind_of_reads, -w) is missing.
+    """
+    parser = optparse.OptionParser()
+    parser.add_option('--if_unpaired', dest='if_unpaired', help='Unpaired FASTA input file name')
+    parser.add_option('--if_paired_r1', dest='if_paired_r1', help='Paired FASTA reads 1 input file name')
+    parser.add_option('--if_paired_r2', dest='if_paired_r2', help='Paired FASTA reads 2 input file name')
+    parser.add_option('-s', dest='seeds_file', help='FASTA as seeds, input file name')
+    parser.add_option('-w', dest='mindepthofcoverage', type='int', help='minimum depth of coverage allowed for contigs')
+    parser.add_option('-m', dest='minoverlappingbases', type='int', default=20, help='Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 20)')
+    parser.add_option('-o', dest='mincall', type='int', default=2, help='mincall -o ')
+    parser.add_option('-r', dest='baseratio', type='float', default=0.7, help='baseratio -r')
+    parser.add_option('-k', dest='minnumlinks', type='int', default=4, help='Minimum number of links (read pairs) to compute scaffold -k')
+    parser.add_option('-e', dest='error', type='float', default=0.75, help='Error (%) allowed on mean distance -e')
+    parser.add_option('-a', dest='maxlinkratio', type='float', default=0.5, help='Maximum link ratio between two best contig pairs -a')
+    parser.add_option('-x', dest='minoverlap', type='int', default=20, help='Minimum overlap required between contigs to merge adjacent contigs in a scaffold -x')
+    parser.add_option('--ignore_header', dest='ignore_header', choices=['0', '1'], default='1', help='Ignore read name/header *will use less RAM if set to 1* -h')
+    parser.add_option('--kind_of_reads', dest='kind_of_reads', choices=['0', '1', '2'], help='Kind of reads (-p)')
+    parser.add_option('--iz', dest='insert_size', type='int', help='Library insert size')
+    parser.add_option('--prefix', dest='prefix', default='ssake_pre', help='prefix')
+    parser.add_option('--out1', dest='contigs', help='contig file')
+    parser.add_option('--out2', dest='short', help='short file')
+    parser.add_option('--out3', dest='singlets', help='singlets file')
+    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', help='logging level (default: INFO)')
+    parser.add_option('--logfile', help='log file (default=stderr)')
+    (options, args) = parser.parse_args()
+    if args:
+        parser.error('Wrong number of arguments')
+    # These two options have no default and are formatted unconditionally
+    # later on; fail early with a usage message instead of a TypeError.
+    if options.kind_of_reads is None:
+        parser.error('--kind_of_reads is required')
+    if options.mindepthofcoverage is None:
+        parser.error('-w is required')
+
+    log_level = getattr(logging, options.loglevel)
+    kwargs = {'format': LOG_FORMAT,
+              'datefmt': LOG_DATEFMT,
+              'level': log_level}
+    if options.logfile:
+        kwargs['filename'] = options.logfile
+    # Logging must be configured and the logger created whether or not a
+    # log file was requested; without 'filename' basicConfig logs to stderr.
+    logging.basicConfig(**kwargs)
+    logger = logging.getLogger('SSAKE scaffold assembly')
+
+    S = SSAKE(logger, options)
+    S.run()
+
+# Run the wrapper only when this file is executed as a script.
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ssake.xml	Wed Sep 11 12:51:21 2013 -0400
@@ -0,0 +1,99 @@
+<tool id="ssake" name="SSAKE" version="0.0.10">
+  <description>short DNA sequences assembler</description>
+  <requirements>
+    <requirement type="package" version="3.8">ssake</requirement>
+  </requirements>
+  <command interpreter="python">
+  ssake.py
+  ## Parameters declared inside the "kind_of_reads" conditional must be
+  ## referenced through it.
+  #if $kind_of_reads.kind_of_reads_select == '0'
+    --if_unpaired ${kind_of_reads.infile}
+  #else
+    --if_paired_r1 ${kind_of_reads.infile_r1}
+    --if_paired_r2 ${kind_of_reads.infile_r2}
+    --iz ${kind_of_reads.insert_size}
+    -k ${kind_of_reads.minnumlinks}
+    -e ${kind_of_reads.error}
+    -a ${kind_of_reads.maxlinkratio}
+    -x ${kind_of_reads.minoverlap}
+  #end if
+  #if $seeds
+    -s $seeds
+  #end if
+  -w $mindepthofcoverage
+  -m $minoverlap
+  -o $mincall
+  -r $baseratio
+  --ignore_header 1
+  --kind_of_reads ${kind_of_reads.kind_of_reads_select}
+  --out1 $contig
+  --out2 $short
+  --out3 $singlets
+  --logfile $log
+  </command>
+  <inputs>
+    <conditional name="kind_of_reads">
+      <param name="kind_of_reads_select" type="select" label="Kind of reads (-p)">
+        <option value="0">Unpaired </option>
+        <option value="1">Paired and equal (both files must have the same number of sequences, arranged in the same order)</option>
+        <option value="2">Paired and unequal (files can have different number of sequences in any order)</option>
+      </param>
+      <when value="0">
+        <param name="infile" type="data" format="fasta" label="Input FASTA file" />
+      </when>
+      <when value="1">
+        <param name="infile_r1" type="data" format="fasta" label="Input FASTA file (read 1)" />
+        <param name="infile_r2" type="data" format="fasta" label="Input FASTA file (read 2)" />
+        <param name="insert_size" type="integer" value="200" label="Library insert size" />
+        <param name="minnumlinks" type="integer" value="4" label="Minimum number of links (read pairs) to compute scaffold (-k)" />
+        <param name="error" type="float" value="0.75" min="0" max="1" label="Error (%) allowed on mean distance (-e)" />
+        <param name="maxlinkratio" type="float" value="0.5" label="Maximum link ratio between two best contig pairs (-a)" />
+        <param name="minoverlap" type="integer" value="20" label="Minimum overlap required between contigs to merge adjacent contigs in a scaffold (-x)" />
+      </when>
+      <when value="2">
+        <param name="infile_r1" type="data" format="fasta" label="Input FASTA file (read 1)" />
+        <param name="infile_r2" type="data" format="fasta" label="Input FASTA file (read 2)" />
+        <param name="insert_size" type="integer" value="200" label="Library insert size" />
+        <param name="minnumlinks" type="integer" value="4" label="Minimum number of links (read pairs) to compute scaffold (-k)" />
+        <param name="error" type="float" value="0.75" min="0" max="1" label="Error (%) allowed on mean distance (-e)" />
+        <param name="maxlinkratio" type="float" value="0.5" label="Maximum link ratio between two best contig pairs (-a)" />
+        <param name="minoverlap" type="integer" value="20" label="Minimum overlap required between contigs to merge adjacent contigs in a scaffold (-x)" />
+      </when>
+    </conditional>
+    <param name="seeds" type="data" format="fasta" optional="true" label="FASTA file containing sequences to use as seeds exclusively (-s)" help="Optional, specify only if different from read set" />
+    <param name="mindepthofcoverage" type="integer" value="1" label="Minimum depth of coverage allowed for contigs (-w)" />
+    <param name="minoverlap" type="integer" value="20" label="Minimum number of overlapping bases with the seed/contig during overhang consensus build up (-m)" />
+    <param name="mincall" type="integer" value="2" label="Minimum number of reads needed to call a base during an extension (-o)" />
+    <param name="baseratio" type="float" value="0.7" label="Minimum base ratio used to accept a overhang consensus base (-r)" />
+  </inputs>
+
+  <outputs>
+    <data name="contig" format="fasta" label="${tool.name} on ${on_string}: contigs" />
+    <data name="log" format="txt" label="${tool.name} on ${on_string}: log" />
+    <data name="short" format="txt" label="${tool.name} on ${on_string}: unacceptable reads" />
+    <data name="singlets" format="fasta" label="${tool.name} on ${on_string}: unassembled reads" />
+  </outputs>
+  <help>
+**What it does**
+
+SSAKE is a genomics application for de novo assembly of millions of very short DNA sequences.
+It is an easy-to-use, robust, reliable and tractable clustering algorithm for very short sequence reads, such as those generated by Illumina Ltd.
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2012-2013 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+If you use this tool in Galaxy, please cite |Cuccuru2013|_.
+
+.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted*
+.. _Cuccuru2013: http://orione.crs4.it/
+
+This tool uses `SSAKE`_, which is licensed separately. Please cite |Warren2007|_.
+
+.. _SSAKE: http://www.bcgsc.ca/platform/bioinfo/software/ssake/
+.. |Warren2007| replace:: Warren RL, Sutton GG, Jones SJM, Holt RA. 2007. Assembling millions of short DNA sequences using SSAKE. Bioinformatics. 23(4):500-501
+.. _Warren2007: http://bioinformatics.oxfordjournals.org/content/23/4/500
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Sep 11 12:51:21 2013 -0400
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="ssake" version="3.8">
+    <install version="1.0">
+      <actions>
+        <action type="download_by_url" target_filename="ssake_v3-8.tar.gz">http://www.bcgsc.ca/platform/bioinfo/software/ssake/releases/3.8/ssake_v3-8-tar.gz</action>
+        <!-- fix for Perl >= 5.16.0-->
+        <action type="shell_command">sed -i -e 's/require "getopts.pl"/use Getopt::Std/' -e 's/&amp;Getopts/getopts/' SSAKE tools/TQSfastq.pl</action>
+        <action type="move_directory_files">
+          <source_directory>.</source_directory>
+          <destination_directory>$INSTALL_DIR</destination_directory>
+        </action>
+        <action type="set_environment">
+          <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
+        </action>
+        <action type="set_environment">
+          <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/tools</environment_variable>
+        </action>
+      </actions>
+    </install>
+  </package>
+</tool_dependency>