changeset 0:988d5a82291a draft

Uploaded
author crs4
date Thu, 24 Oct 2013 14:02:10 -0400
parents
children d180348fe9db
files COPYING sopra_wpc.py sopra_wpc.xml tool_dependencies.xml
diffstat 4 files changed, 229 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COPYING	Thu Oct 24 14:02:10 2013 -0400
@@ -0,0 +1,23 @@
+Copyright © 2013 CRS4 Srl. http://www.crs4.it/
+Created by:
+Gianmauro Cuccuru <gianmauro.cuccuru@crs4.it>
+Nicola Soranzo <nicola.soranzo@crs4.it>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sopra_wpc.py	Thu Oct 24 14:02:10 2013 -0400
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+"""
+SOPRA with prebuilt contigs workflow runner
+"""
+
+import optparse
+import os
+import tempfile
+import shutil
+import subprocess
+import sys
+
+
+# Copyright (c) Twisted Matrix Laboratories.
+def which(name, flags=os.X_OK):
+    """
+    Search PATH for executable files with the given name.
+
+    Every directory listed in the PATH environment variable is probed for
+    ``name`` itself and then for ``name`` followed by each non-empty
+    extension listed in the PATHEXT environment variable.
+
+    :param name: file name of the executable to look for
+    :param flags: mode passed to os.access(), os.X_OK by default
+    :return: list of the matching file paths in PATH order; empty if PATH
+             is not set or nothing matches
+    """
+    path = os.environ.get('PATH', None)
+    if path is None:
+        return []
+    # Build a real list: a lazy filter() object would be exhausted after the
+    # first PATH entry on Python 3
+    exts = [ext for ext in os.environ.get('PATHEXT', '').split(os.pathsep) if ext]
+    result = []
+    for directory in path.split(os.pathsep):
+        candidate = os.path.join(directory, name)
+        # os.access(..., os.X_OK) is also true for searchable directories,
+        # so require a regular file
+        if os.path.isfile(candidate) and os.access(candidate, flags):
+            result.append(candidate)
+        for ext in exts:
+            candidate_ext = candidate + ext
+            if os.path.isfile(candidate_ext) and os.access(candidate_ext, flags):
+                result.append(candidate_ext)
+    return result
+
+
+def _run_step(description, cmd, log, **kwargs):
+    """
+    Announce and run one external command of the workflow.
+
+    :param description: short text inserted in the progress message
+    :param cmd: command as a list of arguments; it is executed without a
+                shell, so file paths need no quoting
+    :param log: file object receiving the standard output of the command
+    :param kwargs: extra keyword arguments forwarded to subprocess.check_call()
+    :raise subprocess.CalledProcessError: if the command exits with a non-zero status
+    """
+    print("SOPRA with prebuilt contigs (%s) command to be executed:\n %s" % (description, " ".join(cmd)))
+    # Flush so that our message is not written after the output of the child
+    # process when both go to stdout
+    sys.stdout.flush()
+    subprocess.check_call(cmd, stdout=log, **kwargs)
+
+
+def __main__():
+    """
+    Parse the command line and run the SOPRA with prebuilt contigs workflow.
+
+    All the steps work in a temporary directory which is removed at the end,
+    whether the workflow succeeds or not. On success the scaffolds FASTA file
+    is moved to the path given with --scaffolds. The standard output of the
+    external commands goes to the file given with --logfile, or to stdout.
+
+    :raise subprocess.CalledProcessError: if one of the external commands fails
+    """
+    parser = optparse.OptionParser(description='SOPRA with prebuilt contigs')
+    parser.add_option('--contigs', action='append', dest='contigs', help='Contigs FASTA files, at least 1')
+    parser.add_option('--mate', action='append', dest='mates', help='Paired-end Illumina libraries, at least 1 FASTA file')
+    parser.add_option('-d', action='append', dest='insert_sizes', type='int', help='List of insert sizes for the corresponding mate pair libraries')
+    parser.add_option('-v', dest='max_mismatches', type='int', help='Maximum number of mismatches when aligning reads on contigs with Bowtie')
+    parser.add_option('-c', dest='c_option', type='int', help='If the number of times a read and its reverse complement appear in the library is equal to or more than this value, the pairing information from that read will be disregarded')
+    parser.add_option('-w', dest='w_option', type='int', help='Minimum number of links between two contigs')
+    parser.add_option('-L', dest='L_option', type='int', help='Minimum length of contigs to be used in scaffold assembly')
+    parser.add_option('--h_option', dest='h_option', type='float', help='High coverage contigs (above mean coverage + h x std coverage) are not considered in the scaffold assembly mainly to exclude reads from repetitive regions')
+    parser.add_option('--scaffolds', dest='scaffolds', help='scaffolds fasta file mandatory')
+    parser.add_option('-l', '--logfile', dest='logfile', help='log file (default=stdout)')
+    (options, args) = parser.parse_args()
+    if args:
+        parser.error('Wrong number of arguments')
+
+    # Fail early with a usage error instead of a TypeError traceback (or, for
+    # --scaffolds, a failure after all the computation has been done)
+    required = (
+        ('--contigs', options.contigs),
+        ('--mate', options.mates),
+        ('-d', options.insert_sizes),
+        ('-v', options.max_mismatches),
+        ('-c', options.c_option),
+        ('-w', options.w_option),
+        ('-L', options.L_option),
+        ('--h_option', options.h_option),
+        ('--scaffolds', options.scaffolds),
+    )
+    missing = [flag for flag, value in required if value is None]
+    if missing:
+        parser.error('Missing mandatory option(s): %s' % ', '.join(missing))
+    if len(options.insert_sizes) < len(options.mates):
+        parser.error('An insert size (-d) is needed for each paired-end library (--mate)')
+
+    contigs = options.contigs # a list of file paths
+    mates = options.mates # a list of file paths
+    insert_sizes = options.insert_sizes # a list of integers
+    max_mismatches = options.max_mismatches
+    c_option = options.c_option
+    w_option = options.w_option
+    L_option = options.L_option
+    h_option = options.h_option
+    scaffolds = options.scaffolds
+    logfile = options.logfile
+
+    s_scaf_candidates = which('s_scaf_v1.4.6.pl')
+    if not s_scaf_candidates:
+        sys.exit('s_scaf_v1.4.6.pl not found in PATH')
+    s_scaf_path = s_scaf_candidates[0] # which() returns matches in PATH order, so the first one has the highest precedence
+    print('Creating temp dir')
+    wd = tempfile.mkdtemp()
+    try:
+        fake_mates = [os.path.join(wd, os.path.basename(mate) + '.fasta') for mate in mates] # s_prep_contigAseq_v1.4.6.pl wants a mate file with extension [Ff][Aa][Ss][Tt][Aa] or [Ff][Aa]
+        contigs_sopra = os.path.join(wd, 'contigs_sopra.fasta') # s_prep_contigAseq_v1.4.6.pl always writes all the prepared contigs to this file
+        bowtie_build = os.path.join(wd, 'bowtie_build') # arbitrary basename for bowtie-build output files
+        mate_sopras = [os.path.splitext(fake_mate)[0] + '_sopra.fasta' for fake_mate in fake_mates] # s_prep_contigAseq_v1.4.6.pl writes the prepared paired reads to these files
+        mysam_mates = [mate_sopra + '.sam' for mate_sopra in mate_sopras] # arbitrary filenames for bowtie output in SAM format
+        mysam_mates_parsed = [mysam_mate + '_parsed' for mysam_mate in mysam_mates] # s_parse_sam_v1.4.6.pl writes its output to these files
+        orientdistinfo = os.path.join(wd, 'orientdistinfo_c%d' % c_option) # s_read_parsed_sam_v1.4.6.pl writes its output to this file
+        scaffolds_file = os.path.join(wd, "scaffolds_h%s_L%d_w%d.fasta" % (h_option, L_option, w_option)) # s_scaf_v1.4.6.pl writes its output to this file
+
+        for mate, fake_mate in zip(mates, fake_mates):
+            print("Copying mate %s to %s" % (mate, fake_mate))
+            shutil.copy2(mate, fake_mate)
+
+        log = open(logfile, 'w') if logfile else sys.stdout
+        try:
+            _run_step('preparation', ['s_prep_contigAseq_v1.4.6.pl', '-contig'] + contigs + ['-mate'] + fake_mates + ['-a', wd], log)
+
+            _run_step('Bowtie building index', ['bowtie-build', contigs_sopra, bowtie_build], log)
+
+            for i, (mate_sopra, mysam_mate) in enumerate(zip(mate_sopras, mysam_mates)):
+                cmd_step3 = ['bowtie', '-v', str(max_mismatches), '-m', '1', '-f', '--sam', bowtie_build, mate_sopra, mysam_mate]
+                _run_step('Bowtie alignment of library %d' % (i + 1), cmd_step3, log, stderr=subprocess.STDOUT) # need to redirect stderr because bowtie writes some logging info there
+
+            _run_step('removing reads not mapped in a proper pair', ['s_parse_sam_v1.4.6.pl', '-sam'] + mysam_mates + ['-a', wd], log)
+
+            cmd_step5 = ['s_read_parsed_sam_v1.4.6.pl', '-c', str(c_option), '-a', wd]
+            for mysam_mate_parsed, insert_size in zip(mysam_mates_parsed, insert_sizes):
+                cmd_step5 += ['-parsed', mysam_mate_parsed, '-d', str(insert_size)]
+            _run_step('read parsed SAM', cmd_step5, log)
+
+            cmd_step6 = ['perl', '-X', s_scaf_path, '-w', str(w_option), '-L', str(L_option), '-h', str(h_option), '-o', orientdistinfo, '-a', wd] # need to call with perl -X because: 1) otherwise some Perl warnings are written on stderr; 2) simply redirecting stderr would hide real errors since it always returns exit status 0
+            _run_step('scaffold assembly', cmd_step6, log)
+        finally:
+            if log is not sys.stdout:
+                log.close()
+
+        print('Moving result file %s to %s' % (scaffolds_file, scaffolds))
+        shutil.move(scaffolds_file, scaffolds)
+    finally:
+        shutil.rmtree(wd)
+
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sopra_wpc.xml	Thu Oct 24 14:02:10 2013 -0400
@@ -0,0 +1,69 @@
+<tool id="sopra_wpc" name="SOPRA with prebuilt contigs" version="0.1">
+  <description>for Illumina workflow</description>
+  <requirements>
+    <requirement type="package" version="1.4.6">sopra</requirement>
+    <requirement type="package" version="1.0.0">bowtie</requirement>
+  </requirements>
+  <command interpreter="python">
+    sopra_wpc.py
+    #for $cr in $contigs_repeat
+      --contigs ${cr.contigs_file}
+    #end for
+    #for $mr in $mate_repeat
+      --mate ${mr.mate_file}
+      -d ${mr.insert_size}
+    #end for
+    -v $max_mismatches -c $c_option -w $w_option -L $L_option --h_option $h_option --scaffolds $scaffolds_file --logfile $logfile
+  </command>
+  <inputs>
+    <repeat name="contigs_repeat" title="Contigs file" min="1">
+      <param name="contigs_file" type="data" format="fasta" label="Contigs" help="FASTA format" />
+    </repeat>
+    <repeat name="mate_repeat" title="Paired-end Illumina library" min="1">
+      <param name="mate_file" type="data" format="fasta" label="Paired-end Illumina library" help="FASTA format" />
+      <param name="insert_size" type="integer" value="" label="Insert size" help="Insert size for the library (-d)" />
+    </repeat>
+    <param name="max_mismatches" type="integer" min="0" max="3" value="0" label="Maximum number of mismatches when aligning reads on contigs with Bowtie (-v)" help="May be 0, 1, 2, or 3" />
+    <param name="c_option" type="integer" value="5" label="If the number of times a read and its reverse complement appear in the library is equal to or more than this value, the pairing information from that read will be disregarded (-c)" />
+    <param name="w_option" type="integer" value="4" label="Minimum number of links between two contigs (-w)" />
+    <param name="L_option" type="integer" value="150" label="Minimum length of contigs to be used in scaffold assembly (-L)" />
+    <param name="h_option" type="float" value="2.2" label="h value (-h)" help="High coverage contigs (above mean_coverage + h x std_coverage) are not considered in the scaffold assembly mainly to exclude reads from repetitive regions" />
+  </inputs>
+  <outputs>
+    <data format="fasta" name="scaffolds_file" label="${tool.name} on ${on_string}: scaffolds_sopra.fasta"/>
+    <data format="txt" name="logfile" label="${tool.name} on ${on_string}: log"/>
+  </outputs>
+  <help>
+**What it does**
+
+SOPRA is an assembly tool for mate pair/paired-end data generated by high-throughput sequencing technologies, e.g. Illumina and SOLiD platforms.
+
+The input paired-end FASTA file can be obtained with:
+
+- FR reads -> *FASTQ interlacer on paired end reads* followed by *FASTQ to FASTA* converter
+- RF reads -> *Reverse-Complement*, *FASTQ interlacer on paired end reads* followed by *FASTQ to FASTA* converter
+
+.. class:: infomark
+
+**TIP:** Try trimming the ends of short reads before feeding them to the assembler to remove the error-prone bases (e.g. last 10 to 20 bps) and check if it improves the assembly.
+
+-----
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2013 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+If you use this tool in Galaxy, please cite |Cuccuru2013|_.
+
+.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted*
+.. _Cuccuru2013: http://orione.crs4.it/
+
+This tool uses `SOPRA`_, which is licensed separately. Please cite |Dayarian2010|_.
+
+.. _SOPRA: http://www.physics.rutgers.edu/~anirvans/SOPRA/
+.. |Dayarian2010| replace:: Dayarian, A., Michael, T. P., Sengupta, A. M. (2010) SOPRA: Scaffolding algorithm for paired reads via statistical optimization. *BMC Bioinformatics* 11, 345
+.. _Dayarian2010: http://www.biomedcentral.com/1471-2105/11/345/
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Oct 24 14:02:10 2013 -0400
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <!--<package name="bowtie" version="1.0.0">
+    <repository name="package_bowtie_1_0_0" owner="iuc" />
+  </package>-->
+  <package name="sopra" version="1.4.6">
+    <install version="1.0">
+      <actions>
+        <action type="download_by_url">http://www.physics.rutgers.edu/~anirvans/SOPRA/SOPRA_v1.4.6.zip</action>
+        <action type="move_directory_files">
+          <source_directory>source_codes_v1.4.6/SOPRA_with_prebuilt_contigs</source_directory>
+          <destination_directory>$INSTALL_DIR/SOPRA_with_prebuilt_contigs</destination_directory>
+        </action>
+        <action type="shell_command">chmod 755 $INSTALL_DIR/SOPRA_with_prebuilt_contigs/*.pl</action>
+        <action type="set_environment">
+          <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/SOPRA_with_prebuilt_contigs</environment_variable>
+        </action>
+      </actions>
+    </install>
+  </package>
+</tool_dependency>