diff commons/tools/PrepareBatches.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/PrepareBatches.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+import os
+import sys
+from ConfigParser import NoSectionError, NoOptionError
+from commons.core.checker.CheckerUtils import CheckerUtils
+from commons.core.checker.CheckerException import CheckerException
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.seq.FastaUtils import FastaUtils
+
+class PrepareBatches(object):
+    
+    def __init__(self, pipelineName, projectDir, projectName, iConfig, verbose):
+        self._pipelineName = pipelineName
+        self._projectDir = projectDir
+        self._projectName = projectName
+        self._iConfig = iConfig
+        self._verbose = verbose
+        
+    def run(self):
+        if self._verbose > 0:
+            print "beginning of step 1"
+            sys.stdout.flush()
+        if FileUtils.isRessourceExists("%s_db" % self._projectName):
+            print "ERROR: directory '%s_db' already exists" % self._projectName
+            sys.exit(1)
+        
+        os.mkdir("%s_db" % self._projectName)
+        os.chdir("%s_db" % self._projectName)
+        genomeFastaFileName = "%s.fa" % self._projectName
+        os.symlink("../%s" % genomeFastaFileName, genomeFastaFileName)
+        sectionName = "prepare_batches"
+        self._checkConfig(sectionName)  
+        
+        separator = "\n"
+        inGenomeFileHandler = open(genomeFastaFileName, "r")
+        try:
+            CheckerUtils.checkHeaders(inGenomeFileHandler)
+        except CheckerException, e:
+            print "Error in file %s. Wrong headers are :" % genomeFastaFileName
+            print separator.join(e.messages)
+            print "Authorized characters are : a-z A-Z 0-9 - . : _\n"
+            inGenomeFileHandler.close()
+            sys.exit(1)
+        inGenomeFileHandler.close()
+
+        doClean = False
+        if self._iConfig.get(sectionName, "clean") == "yes":
+            doClean = True
+        chunkFilePrefix = "%s_chunks" % self._projectName
+        chunkLength = int(self._iConfig.get(sectionName, "chunk_length"))
+        chunkOverlap = int(self._iConfig.get(sectionName, "chunk_overlap"))
+        FastaUtils.dbChunks(genomeFastaFileName, chunkLength, chunkOverlap, 0, chunkFilePrefix, doClean, self._verbose)
+        
+        nbSeq = int(self._iConfig.get(sectionName, "nb_seq_per_batch"))
+        FastaUtils.splitFastaFileInBatches("%s.fa" % chunkFilePrefix, nbSeq * chunkLength)
+
+        if self._iConfig.get(sectionName, "clean") == "yes":
+            FileUtils.removeFilesByPattern("%s.fa*" % self._projectName)
+            
+        os.chdir( ".." )
+        if self._verbose > 0:
+            print "step 1 finished successfully"
+            sys.stdout.flush()
+
+    def _checkConfig(self, sectionName):
+        try:
+            CheckerUtils.checkSectionInConfigFile(self._iConfig, sectionName)
+        except NoSectionError:
+            print "ERROR: the section %s must be in your configuration file" % sectionName
+            sys.exit(1)
+        try:
+            CheckerUtils.checkOptionInSectionInConfigFile(self._iConfig, sectionName, "chunk_length")
+        except NoOptionError:
+            print "ERROR: the option 'chunk_length' must be defined in %s in your configuration file" % sectionName
+            sys.exit(1)
+        try:
+            CheckerUtils.checkOptionInSectionInConfigFile(self._iConfig, sectionName, "chunk_overlap")
+        except NoOptionError:
+            print "ERROR: the option 'chunk_overlap' must be defined in %s in your configuration file" % sectionName
+            sys.exit(1)
+        try:
+            CheckerUtils.checkOptionInSectionInConfigFile(self._iConfig, sectionName, "nb_seq_per_batch")
+        except NoOptionError:
+            print "ERROR: the option 'nb_seq_per_batch' must be defined in %s in your configuration file" % sectionName
+            sys.exit(1)
+        try:
+            CheckerUtils.checkOptionInSectionInConfigFile(self._iConfig, sectionName, "resources")
+        except NoOptionError:
+            print "ERROR: the option 'resources' must be defined in %s in your configuration file" % sectionName
+            sys.exit(1)
+        try:
+            CheckerUtils.checkOptionInSectionInConfigFile(self._iConfig, sectionName, "tmpDir")
+        except NoOptionError:
+            print "ERROR: the option 'tmpDir' must be defined in %s in your configuration file" % sectionName
+            sys.exit(1)