# HG changeset patch # User urgi-team # Date 1469017131 14400 # Node ID 22b0494ec88322d6988e273cbfc812ce043eb481 # Parent 782306d67e3992b36117d9b20fcc632f9da492f9 Uploaded diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/LICENSE Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,506 @@ + +CeCILL FREE SOFTWARE LICENSE AGREEMENT + + + Notice + +This Agreement is a Free Software license agreement that is the result +of discussions between its authors in order to ensure compliance with +the two main principles guiding its drafting: + + * firstly, compliance with the principles governing the distribution + of Free Software: access to source code, broad rights granted to + users, + * secondly, the election of a governing law, French law, with which + it is conformant, both as regards the law of torts and + intellectual property law, and the protection that it offers to + both authors and holders of the economic rights over software. + +The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) +license are: + +Commissariat à l'Energie Atomique - CEA, a public scientific, technical +and industrial research establishment, having its principal place of +business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. + +Centre National de la Recherche Scientifique - CNRS, a public scientific +and technological establishment, having its principal place of business +at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, a public scientific and technological establishment, having its +principal place of business at Domaine de Voluceau, Rocquencourt, BP +105, 78153 Le Chesnay cedex, France. 
+ + + Preamble + +The purpose of this Free Software license agreement is to grant users +the right to modify and redistribute the software governed by this +license within the framework of an open source distribution model. + +The exercising of these rights is conditional upon certain obligations +for users so as to preserve this status for all subsequent redistributions. + +In consideration of access to the source code and the rights to copy, +modify and redistribute granted by the license, users are provided only +with a limited warranty and the software's author, the holder of the +economic rights, and the successive licensors only have limited liability. + +In this respect, the risks associated with loading, using, modifying +and/or developing or reproducing the software by the user are brought to +the user's attention, given its Free Software status, which may make it +complicated to use, with the result that its use is reserved for +developers and experienced professionals having in-depth computer +knowledge. Users are therefore encouraged to load and test the +suitability of the software as regards their requirements in conditions +enabling the security of their systems and/or data to be ensured and, +more generally, to use and operate it in the same conditions of +security. This Agreement may be freely reproduced and published, +provided it is not altered, and that no provisions are either added or +removed herefrom. + +This Agreement may apply to any or all software for which the holder of +the economic rights decides to submit the use thereof to its provisions. + + + Article 1 - DEFINITIONS + +For the purpose of this Agreement, when the following expressions +commence with a capital letter, they shall have the following meaning: + +Agreement: means this license agreement, and its possible subsequent +versions and annexes. 
+ +Software: means the software in its Object Code and/or Source Code form +and, where applicable, its documentation, "as is" when the Licensee +accepts the Agreement. + +Initial Software: means the Software in its Source Code and possibly its +Object Code form and, where applicable, its documentation, "as is" when +it is first distributed under the terms and conditions of the Agreement. + +Modified Software: means the Software modified by at least one +Contribution. + +Source Code: means all the Software's instructions and program lines to +which access is required so as to modify the Software. + +Object Code: means the binary files originating from the compilation of +the Source Code. + +Holder: means the holder(s) of the economic rights over the Initial +Software. + +Licensee: means the Software user(s) having accepted the Agreement. + +Contributor: means a Licensee having made at least one Contribution. + +Licensor: means the Holder, or any other individual or legal entity, who +distributes the Software under the Agreement. + +Contribution: means any or all modifications, corrections, translations, +adaptations and/or new functions integrated into the Software by any or +all Contributors, as well as any or all Internal Modules. + +Module: means a set of sources files including their documentation that +enables supplementary functions or services in addition to those offered +by the Software. + +External Module: means any or all Modules, not derived from the +Software, so that this Module and the Software run in separate address +spaces, with one calling the other when they are run. + +Internal Module: means any or all Module, connected to the Software so +that they both execute in the same address space. + +GNU GPL: means the GNU General Public License version 2 or any +subsequent version, as published by the Free Software Foundation Inc. + +Parties: mean both the Licensee and the Licensor. + +These expressions may be used both in singular and plural form. 
+ + + Article 2 - PURPOSE + +The purpose of the Agreement is the grant by the Licensor to the +Licensee of a non-exclusive, transferable and worldwide license for the +Software as set forth in Article 5 hereinafter for the whole term of the +protection granted by the rights over said Software. + + + Article 3 - ACCEPTANCE + +3.1 The Licensee shall be deemed as having accepted the terms and +conditions of this Agreement upon the occurrence of the first of the +following events: + + * (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; + * (ii) the first time the Licensee exercises any of the rights + granted hereunder. + +3.2 One copy of the Agreement, containing a notice relating to the +characteristics of the Software, to the limited warranty, and to the +fact that its use is restricted to experienced users has been provided +to the Licensee prior to its acceptance as set forth in Article 3.1 +hereinabove, and the Licensee hereby acknowledges that it has read and +understood it. + + + Article 4 - EFFECTIVE DATE AND TERM + + + 4.1 EFFECTIVE DATE + +The Agreement shall become effective on the date when it is accepted by +the Licensee as set forth in Article 3.1. + + + 4.2 TERM + +The Agreement shall remain in force for the entire legal term of +protection of the economic rights over the Software. + + + Article 5 - SCOPE OF RIGHTS GRANTED + +The Licensor hereby grants to the Licensee, who accepts, the following +rights over the Software for any or all use, and for the term of the +Agreement, on the basis of the terms and conditions set forth hereinafter. + +Besides, if the Licensor owns or comes to own one or more patents +protecting all or part of the functions of the Software or of its +components, the Licensor undertakes not to enforce the rights granted by +these patents against successive Licensees using, exploiting or +modifying the Software. 
If these patents are transferred, the Licensor +undertakes to have the transferees subscribe to the obligations set +forth in this paragraph. + + + 5.1 RIGHT OF USE + +The Licensee is authorized to use the Software, without any limitation +as to its fields of application, with it being hereinafter specified +that this comprises: + + 1. permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + + 2. loading, displaying, running, or storing the Software on any or + all medium. + + 3. entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. + + + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + +The right to make Contributions includes the right to translate, adapt, +arrange, or make any or all modifications to the Software, and the right +to reproduce the resulting software. + +The Licensee is authorized to make any or all Contributions to the +Software provided that it includes an explicit notice that it is the +author of said Contribution and indicates the date of the creation thereof. + + + 5.3 RIGHT OF DISTRIBUTION + +In particular, the right of distribution includes the right to publish, +transmit and communicate the Software to the general public on any or +all medium, and by any or all means, and the right to market, either in +consideration of a fee, or free of charge, one or more copies of the +Software by any means. + +The Licensee is further authorized to distribute copies of the modified +or unmodified Software to third parties according to the terms and +conditions set forth hereinafter. 
+ + + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + +The Licensee is authorized to distribute true copies of the Software in +Source Code or Object Code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the Object Code of the Software is +redistributed, the Licensee allows future Licensees unhindered access to +the full Source Code of the Software by indicating how to access it, it +being understood that the additional cost of acquiring the Source Code +shall not exceed the cost of transferring the data. + + + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + +When the Licensee makes a Contribution to the Software, the terms and +conditions for the distribution of the resulting Modified Software +become subject to all the provisions of this Agreement. + +The Licensee is authorized to distribute the Modified Software, in +source code or object code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the object code of the Modified +Software is redistributed, the Licensee allows future Licensees +unhindered access to the full source code of the Modified Software by +indicating how to access it, it being understood that the additional +cost of acquiring the source code shall not exceed the cost of +transferring the data. + + + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + +When the Licensee has developed an External Module, the terms and +conditions of this Agreement do not apply to said External Module, that +may be distributed under a separate license agreement. 
+ + + 5.3.4 COMPATIBILITY WITH THE GNU GPL + +The Licensee can include a code that is subject to the provisions of one +of the versions of the GNU GPL in the Modified or unmodified Software, +and distribute that entire code under the terms of the same version of +the GNU GPL. + +The Licensee can include the Modified or unmodified Software in a code +that is subject to the provisions of one of the versions of the GNU GPL, +and distribute that entire code under the terms of the same version of +the GNU GPL. + + + Article 6 - INTELLECTUAL PROPERTY + + + 6.1 OVER THE INITIAL SOFTWARE + +The Holder owns the economic rights over the Initial Software. Any or +all use of the Initial Software is subject to compliance with the terms +and conditions under which the Holder has elected to distribute its work +and no one shall be entitled to modify the terms and conditions for the +distribution of said Initial Software. + +The Holder undertakes that the Initial Software will remain ruled at +least by this Agreement, for the duration set forth in Article 4.2. + + + 6.2 OVER THE CONTRIBUTIONS + +The Licensee who develops a Contribution is the owner of the +intellectual property rights over this Contribution as defined by +applicable law. + + + 6.3 OVER THE EXTERNAL MODULES + +The Licensee who develops an External Module is the owner of the +intellectual property rights over this External Module as defined by +applicable law and is free to choose the type of agreement that shall +govern its distribution. + + + 6.4 JOINT PROVISIONS + +The Licensee expressly undertakes: + + 1. not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + + 2. to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. 
+ +The Licensee undertakes not to directly or indirectly infringe the +intellectual property rights of the Holder and/or Contributors on the +Software and to take, where applicable, vis-à-vis its staff, any and all +measures required to ensure respect of said intellectual property rights +of the Holder and/or Contributors. + + + Article 7 - RELATED SERVICES + +7.1 Under no circumstances shall the Agreement oblige the Licensor to +provide technical assistance or maintenance services for the Software. + +However, the Licensor is entitled to offer this type of services. The +terms and conditions of such technical assistance, and/or such +maintenance, shall be set forth in a separate instrument. Only the +Licensor offering said maintenance and/or technical assistance services +shall incur liability therefor. + +7.2 Similarly, any Licensor is entitled to offer to its licensees, under +its sole responsibility, a warranty, that shall only be binding upon +itself, for the redistribution of the Software and/or the Modified +Software, under terms and conditions that it is free to decide. Said +warranty, and the financial terms and conditions of its application, +shall be subject of a separate instrument executed between the Licensor +and the Licensee. + + + Article 8 - LIABILITY + +8.1 Subject to the provisions of Article 8.2, the Licensee shall be +entitled to claim compensation for any direct loss it may have suffered +from the Software as a result of a fault on the part of the relevant +Licensor, subject to providing evidence thereof. + +8.2 The Licensor's liability is limited to the commitments made under +this Agreement and shall not be incurred as a result of in particular: +(i) loss due the Licensee's total or partial failure to fulfill its +obligations, (ii) direct or consequential loss that is suffered by the +Licensee due to the use or performance of the Software, and (iii) more +generally, any consequential loss. 
In particular the Parties expressly +agree that any or all pecuniary or business loss (i.e. loss of data, +loss of profits, operating loss, loss of customers or orders, +opportunity cost, any disturbance to business activities) or any or all +legal proceedings instituted against the Licensee by a third party, +shall constitute consequential loss and shall not provide entitlement to +any or all compensation from the Licensor. + + + Article 9 - WARRANTY + +9.1 The Licensee acknowledges that the scientific and technical +state-of-the-art when the Software was distributed did not enable all +possible uses to be tested and verified, nor for the presence of +possible defects to be detected. In this respect, the Licensee's +attention has been drawn to the risks associated with loading, using, +modifying and/or developing and reproducing the Software which are +reserved for experienced users. + +The Licensee shall be responsible for verifying, by any or all means, +the suitability of the product for its requirements, its good working +order, and for ensuring that it shall not cause damage to either persons +or properties. + +9.2 The Licensor hereby represents, in good faith, that it is entitled +to grant all the rights over the Software (including in particular the +rights set forth in Article 5). + +9.3 The Licensee acknowledges that the Software is supplied "as is" by +the Licensor without any other express or tacit warranty, other than +that provided for in Article 9.2 and, in particular, without any warranty +as to its commercial value, its secured, safe, innovative or relevant +nature. + +Specifically, the Licensor does not warrant that the Software is free +from any error, that it will operate without interruption, that it will +be compatible with the Licensee's own equipment and software +configuration, nor that it will meet the Licensee's requirements. 
+ +9.4 The Licensor does not either expressly or tacitly warrant that the +Software does not infringe any third party intellectual property right +relating to a patent, software or any other property right. Therefore, +the Licensor disclaims any and all liability towards the Licensee +arising out of any or all proceedings for infringement that may be +instituted in respect of the use, modification and redistribution of the +Software. Nevertheless, should such proceedings be instituted against +the Licensee, the Licensor shall provide it with technical and legal +assistance for its defense. Such technical and legal assistance shall be +decided on a case-by-case basis between the relevant Licensor and the +Licensee pursuant to a memorandum of understanding. The Licensor +disclaims any and all liability as regards the Licensee's use of the +name of the Software. No warranty is given as regards the existence of +prior rights over the name of the Software or as regards the existence +of a trademark. + + + Article 10 - TERMINATION + +10.1 In the event of a breach by the Licensee of its obligations +hereunder, the Licensor may automatically terminate this Agreement +thirty (30) days after notice has been sent to the Licensee and has +remained ineffective. + +10.2 A Licensee whose Agreement is terminated shall no longer be +authorized to use, modify or distribute the Software. However, any +licenses that it may have granted prior to termination of the Agreement +shall remain valid subject to their having been granted in compliance +with the terms and conditions hereof. 
+ + + Article 11 - MISCELLANEOUS + + + 11.1 EXCUSABLE EVENTS + +Neither Party shall be liable for any or all delay, or failure to +perform the Agreement, that may be attributable to an event of force +majeure, an act of God or an outside cause, such as defective +functioning or interruptions of the electricity or telecommunications +networks, network paralysis following a virus attack, intervention by +government authorities, natural disasters, water damage, earthquakes, +fire, explosions, strikes and labor unrest, war, etc. + +11.2 Any failure by either Party, on one or more occasions, to invoke +one or more of the provisions hereof, shall under no circumstances be +interpreted as being a waiver by the interested Party of its right to +invoke said provision(s) subsequently. + +11.3 The Agreement cancels and replaces any or all previous agreements, +whether written or oral, between the Parties and having the same +purpose, and constitutes the entirety of the agreement between said +Parties concerning said purpose. No supplement or modification to the +terms and conditions hereof shall be effective as between the Parties +unless it is made in writing and signed by their duly authorized +representatives. + +11.4 In the event that one or more of the provisions hereof were to +conflict with a current or future applicable act or legislative text, +said act or legislative text shall prevail, and the Parties shall make +the necessary amendments so as to comply with said act or legislative +text. All other provisions shall remain effective. Similarly, invalidity +of a provision of the Agreement, for any reason whatsoever, shall not +cause the Agreement as a whole to be invalid. + + + 11.5 LANGUAGE + +The Agreement is drafted in both French and English and both versions +are deemed authentic. + + + Article 12 - NEW VERSIONS OF THE AGREEMENT + +12.1 Any person is authorized to duplicate and distribute copies of this +Agreement. 
+ +12.2 So as to ensure coherence, the wording of this Agreement is +protected and may only be modified by the authors of the License, who +reserve the right to periodically publish updates or new versions of the +Agreement, each with a separate number. These subsequent versions may +address new issues encountered by Free Software. + +12.3 Any Software distributed under a given version of the Agreement may +only be subsequently distributed under the same version of the Agreement +or a subsequent version, subject to the provisions of Article 5.3.4. + + + Article 13 - GOVERNING LAW AND JURISDICTION + +13.1 The Agreement is governed by French law. The Parties agree to +endeavor to seek an amicable solution to any disagreements or disputes +that may arise during the performance of the Agreement. + +13.2 Failing an amicable solution within two (2) months as from their +occurrence, and unless emergency proceedings are necessary, the +disagreements or disputes shall be referred to the Paris Courts having +jurisdiction, by the more diligent Party. + + +Version 2.0 dated 2006-09-05. diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/PKG-INFO --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/PKG-INFO Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,10 @@ +Metadata-Version: 1.0 +Name: TEisotools +Version: 1.0 +Summary: Set of tools to analyse RNA_seq for the France Genomics projects. 
+Home-page: https://urgi.versailles.inra.fr/Projects/TEiso +Author: URGI team +Author-email: urgi-support@versailles.inra.fr +License: UNKNOWN +Description: UNKNOWN +Platform: UNKNOWN diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/TEiso/Bedtools_closest.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/TEiso/Bedtools_closest.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,125 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
class Bedtools_closest(object):
    """Run ``bedtools closest`` on two feature files after sorting them.

    Each input is first sorted with ``bedtools sort`` into ``<input>.sorted``.
    The sorted files are then given to ``bedtools closest -d -D a`` and the
    result is redirected to the output file.
    """

    # Log file, created in the current working directory, that receives the
    # messages emitted while running ``bedtools closest``.
    _LOG_FILE = "bedtools_closest.log"

    def __init__(self, input_file_A = "", input_file_B = "", output_file = "", verbosity = 3):
        """Store the two input paths, the output path and the verbosity level."""
        self._input_file_A = input_file_A
        self._input_file_B = input_file_B
        self._output_file = output_file
        self._verbosity = verbosity

    def setAttributesFromCmdLine(self):
        """Parse the -a/-b/-o command-line options and store them on the instance."""
        description = "For each feature in A, finds the closest feature (upstream or downstream) in B.\n"
        usage = " Bedtools_closest [OPTIONS] -a -b -o \n"
        parser = RepetOptionParser(description = description, usage = usage)
        parser.add_option( '-a', '--input_file_A', dest='input_file_A', help='bed/gff/vcf' )
        parser.add_option( '-b', '--input_file_B', dest='input_file_B', help='bed/gff/vcf' )
        parser.add_option( '-o', '--output_file', dest='output_file', help='write all output in this file/bed/gff/vcf', default = "")
        options = parser.parse_args()[0]
        self.setAttributesFromOptions(options)

    def setAttributesFromOptions(self, options):
        """Copy ``input_file_A``, ``input_file_B`` and ``output_file`` from *options*."""
        self._input_file_A = options.input_file_A
        self._input_file_B = options.input_file_B
        self._output_file = options.output_file

    def checkExecutables(self):
        """Raise an Exception when ``bedtools`` is not found in the user's PATH."""
        if not CheckerUtils.isExecutableInUserPath("bedtools"):
            raise Exception("ERROR: bedtools must be in your path")

    def checkOptions(self):
        """Validate both input files.

        Raises an Exception when an input is not specified or does not exist.
        A value of None (option absent from the command line) is treated the
        same way as an empty string instead of being handed on as a path.
        """
        for flag, path in (("-a", self._input_file_A), ("-b", self._input_file_B)):
            if not path:
                raise Exception("ERROR: No specified %s option!" % flag)
            if not FileUtils.isRessourceExists(path):
                raise Exception("ERROR: reference file %s does not exist!" % path)

    def getbedtoolsclosestCmd(self, file_A, file_B, output_file):
        """Return the shell command running ``bedtools closest`` on *file_A* and *file_B*.

        The command redirects its standard output to *output_file*.
        """
        cmd = 'bedtools closest -a %s -b %s -d -D a > %s' % (file_A, file_B, output_file)
        return cmd

    def _sortInputFile(self, input_file):
        """Sort *input_file* with ``bedtools sort`` and return the path of the sorted copy."""
        sorted_file = "%s.sorted" % input_file
        with open(sorted_file, 'w') as sorted_handle:
            returncode = subprocess.call(["bedtools", "sort", "-i", input_file], stdout = sorted_handle)
        if returncode != 0:
            raise Exception("ERROR: bedtools sort failed on %s" % input_file)
        return sorted_file

    def _readLog(self):
        """Return the whole content of the log file."""
        with open(self._LOG_FILE, 'r') as log_handle:
            return log_handle.read()

    def run(self):
        """Check the environment, sort both inputs and run ``bedtools closest``.

        Raises an Exception when an executable or an option is missing, when
        the output file is not specified or already exists, or when one of
        the bedtools commands exits with a non-zero status. In the last case
        the message also carries the content of the log file.
        """
        self.checkExecutables()
        self.checkOptions()
        # Fail before doing any work: an empty name would yield a broken
        # shell redirection, and an existing file must not be overwritten.
        if not self._output_file:
            raise Exception("ERROR: No specified -o option!")
        if os.path.exists(self._output_file):
            raise Exception("ERROR: %s already exists." % self._output_file)

        sortfileA = self._sortInputFile(self._input_file_A)
        sortfileB = self._sortInputFile(self._input_file_B)

        cmd_bedtoolsclosest = self.getbedtoolsclosestCmd(sortfileA, sortfileB, self._output_file)
        # The command string contains a shell redirection, hence shell = True.
        # Everything the command prints besides its result goes to the log.
        with open(self._LOG_FILE, 'w') as log_handle:
            returncode = subprocess.call(cmd_bedtoolsclosest, shell = True, stdout = log_handle, stderr = subprocess.STDOUT)
        if returncode != 0:
            raise Exception("ERROR in %s \n%s" % (cmd_bedtoolsclosest, self._readLog()))


if __name__ == "__main__":
    iLaunch = Bedtools_closest()
    iLaunch.setAttributesFromCmdLine()
    iLaunch.run()
01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/TEiso/Bowtie_build.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,123 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class Bowtie_build(object):
    """Build an index for a reference file with ``bowtie2-build``.

    The index is built inside a freshly created result directory, which
    becomes the current working directory of the process.
    """

    # Log file, created in the result directory, that receives the output
    # of ``bowtie2-build``.
    _LOG_FILE = "bowtie2_build.log"

    def __init__(self, input_reference = "", output_prefix= "", workingDir = "", verbosity = 3):
        """Store the reference path, the index prefix, the result directory and the verbosity."""
        self._inputRef = input_reference
        self._outPrefix = output_prefix
        self._outputDir = workingDir
        self._verbosity = verbosity

    def setAttributesFromCmdLine(self):
        """Parse the -i/-p/-o command-line options and store them on the instance."""
        description = "Create index for refrence file with bowtie-build.\n"
        # The usage text lists every option declared below, including -p.
        usage = "Bowtie_build -i <input_reference> -p <output_prefix> -o <outputDir>\n"
        parser = RepetOptionParser(description = description, usage = usage)
        parser.add_option( '-i', '--input_reference', dest='input_reference', help='comma-separated list of files with ref sequences' )
        parser.add_option( '-p', '--output_prefix', dest='output_prefix', help=' write Ebwt data to files with this dir/basename' )
        parser.add_option( '-o', '--outputDir', dest='outputDir', help='result directory name', default = "")
        options = parser.parse_args()[0]
        self.setAttributesFromOptions(options)

    def setAttributesFromOptions(self, options):
        """Copy ``input_reference``, ``output_prefix`` and ``outputDir`` from *options*."""
        self._inputRef = options.input_reference
        self._outPrefix = options.output_prefix
        self._outputDir = options.outputDir

    def checkExecutables(self):
        """Raise an Exception when ``bowtie2-build`` is not found in the user's PATH."""
        if not CheckerUtils.isExecutableInUserPath("bowtie2-build"):
            raise Exception("ERROR: bowtie2-build must be in your path")

    def checkOptions(self):
        """Validate the reference file and the index prefix.

        Raises an Exception when the reference is not specified or does not
        exist, or when the prefix is not specified. None (option absent from
        the command line) is treated like an empty string.
        """
        if not self._inputRef:
            raise Exception("ERROR: No specified -i option!")
        if not FileUtils.isRessourceExists(self._inputRef):
            raise Exception("ERROR: reference file %s does not exist!" % self._inputRef)
        if not self._outPrefix:
            # The prefix is given with -p; -o is the result directory.
            raise Exception("ERROR: No specified -p option!")

    def getBowtie2buildCmd(self, inputRef, outPrefix ):
        """Return the shell command indexing *inputRef* under the prefix *outPrefix*."""
        cmd = 'bowtie2-build %s %s' % (inputRef, outPrefix)
        return cmd

    def _readLog(self):
        """Return the whole content of the log file."""
        with open(self._LOG_FILE, 'r') as log_handle:
            return log_handle.read()

    def run(self):
        """Create the result directory, move into it and run ``bowtie2-build``.

        Raises an Exception when an executable or an option is missing, when
        the result directory is not specified or already exists, or when
        ``bowtie2-build`` exits with a non-zero status. In the last case the
        message also carries the content of the log file.
        """
        self.checkExecutables()
        self.checkOptions()
        workingDir = self._outputDir
        if not workingDir:
            raise Exception("ERROR: No specified -o option!")
        if os.path.exists(workingDir):
            raise Exception("ERROR: %s already exists." % workingDir)
        # Resolve the reference before changing directory: a relative path
        # would no longer point to the file once inside the result directory.
        inputRef = os.path.abspath(self._inputRef)
        os.mkdir(workingDir)
        os.chdir(workingDir)
        os.symlink(inputRef, "%s.fa" % self._outPrefix)
        cmd_bowtie = self.getBowtie2buildCmd(inputRef, self._outPrefix)
        # Keep the tool's output out of the console by sending it to the log.
        with open(self._LOG_FILE, 'w') as log_handle:
            returncode = subprocess.call(cmd_bowtie, shell = True, stdout = log_handle, stderr = subprocess.STDOUT)
        if returncode != 0:
            raise Exception("ERROR in %s \n%s" % (cmd_bowtie, self._readLog()))


if __name__ == "__main__":
    iLaunch = Bowtie_build()
    iLaunch.setAttributesFromCmdLine()
    iLaunch.run()
free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class ClosestToStartSite(object):
    """Label each transcript/TE pair of a tab-separated "closest" report.

    Every input record is expected to hold 14 tab-separated columns. The
    code reads columns 2, 3 and 6 as transcript start, end and strand,
    columns 9 and 10 as TE start and end, and column 14 as the distance
    between both features (presumably the output of ``bedtools closest``;
    confirm against the caller). Each record is tagged ``TE_closest_TSS``,
    ``TE_overlap_TSS``, ``TE_in_gene`` or ``gene_in_TE``; optionally the
    Cuffcompare class code of the transcript is appended as well.
    """

    # Same pattern as before, except that the two nested quantifiers
    # "([^\t]+)+" were reduced to "([^\t]+)": both forms match and capture
    # exactly the same text, but the nested form backtracks exponentially on
    # lines that fail to match after such a column.
    _RECORD_PATTERN = re.compile(
        r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)"
        r"\t+([^\t]+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+([^\t]+)")

    def __init__(self, inputFile = "", cuffcom_tmap = "", outputFile = "", verbosity = 3):
        """Store the file names and create the logger of the tool."""
        self._inputFile = inputFile
        self._cuffcom_tmap = cuffcom_tmap
        self._outputFile = outputFile
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity)

    def setAttributesFromCmdLine(self):
        """Parse the command line (-i, -c, -o, -v) and store the values."""
        self._toolVersion = "1.0"
        description = "ClosestToStartSite version %s" % self._toolVersion
        epilog = "\nParser a bed file and create a bed file to create a report about positions of features A to features B. \n"
        epilog += "it can also add the class code of features A.\n"
        epilog += "example: ClosestToStartSite.py -i -c -o \n"
        parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion)
        parser.add_option("-i", "--inputFile", dest = "inputFile", action = "store", type = "string", help = "input bed file name.", default = "")
        parser.add_option("-c", "--cuffcom_tmap", dest = "cuffcom_tmap", action = "store", type = "string", help = "input gtf file of cuffcompare (.tmap)", default = "")
        parser.add_option("-o", "--outputFile", dest = "outputFile", action = "store", type = "string", help = "output file name", default = "")
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 3]", default = 3)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy the parsed option values onto the instance."""
        self._inputFile = options.inputFile
        self._cuffcom_tmap = options.cuffcom_tmap
        self._outputFile = options.outputFile
        self._verbosity = options.verbosity

    def _logAndRaise(self, errorMsg):
        """Log *errorMsg* as an error and raise it as a RepetException."""
        self._log.error(errorMsg)
        raise RepetException(errorMsg)

    def checkoption(self):
        """Report missing files and derive a default output file name.

        Problems are only logged (nothing is raised). When no output file was
        given, its name is built from the input file name without extension
        plus ``_Close2TSS_with_classcode.bed`` (a .tmap file was given) or
        ``_Close2TSS.bed`` (no .tmap file).
        """
        if self._inputFile == "":
            self._log.info("Missing input file")
        elif not FileUtils.isRessourceExists(self._inputFile):
            self._log.info("'%s' doesn't exist!" % self._inputFile)

        if self._cuffcom_tmap != "":
            if not FileUtils.isRessourceExists(self._cuffcom_tmap):
                self._log.info("'%s' doesn't exist!" % self._cuffcom_tmap)
            suffix = "_Close2TSS_with_classcode.bed"
        else:
            suffix = "_Close2TSS.bed"

        if self._outputFile == "":
            self._outputFile = "%s%s" % (os.path.splitext(self._inputFile)[0], suffix)
        elif FileUtils.isRessourceExists(self._outputFile):
            self._log.info("Output file '%s' already exists!" % self._outputFile)

    def getClassCodeCuffcompare(self, tmap_file, listPossitions):
        """Append the class code and reference id to every position row.

        The first line of *tmap_file* is skipped as a header. For the other
        lines, column 5 is used as key and columns 3 and 2 as the values to
        append (class code, then reference id). Rows whose 4th item has no
        entry in the file get ``"NA"`` twice. Lines with fewer than five
        columns (e.g. a trailing blank line) are ignored instead of raising
        an IndexError. Returns a new list of lists; the input rows are not
        modified.
        """
        classCodes = {}
        with open(tmap_file) as tmap:
            next(tmap, None)  # header line
            for tmapLine in tmap:
                columns = tmapLine.split("\t")
                if len(columns) < 5:
                    continue
                classCodes[columns[4].strip()] = (columns[2].strip(), columns[1].strip())

        annotatedRows = []
        for position in listPossitions:
            classCode, refId = classCodes.get(position[3], ("NA", "NA"))
            annotatedRows.append(list(position) + [classCode, refId])
        return annotatedRows

    @staticmethod
    def _classifyPair(startTR, endTR, strandTR, startTE, endTE, dist):
        """Return the ``(value, label)`` pairs that apply to one record.

        *value* is ``None`` when the distance column of the record has to be
        reported, otherwise the integer to report. The rules are independent
        tests evaluated in their historical order, so a record can yield
        several pairs.
        """
        plus = strandTR == "+"
        minus = strandTR == "-"
        hits = []

        #  TE entirely before the start of a "+" transcript
        #          F[1]=========================>F[2]
        #  F[8]----F[9]
        if (startTE < startTR) and (endTE < startTR) and plus and (endTR > endTE) and (endTR > startTE) and (dist != 0):
            hits.append((None, "TE_closest_TSS"))

        #  TE entirely after the end of a "-" transcript
        #  F[1]<======================F[2]
        #                                 F[8]-------F[9]
        if (startTE > endTR) and (endTE > endTR) and minus and (startTR < startTE) and (startTR < endTE) and (dist != 0):
            hits.append((None, "TE_closest_TSS"))

        #  TE covering the start of a "+" transcript: report the overlap
        if (startTE <= startTR) and (startTR < endTE) and plus and (endTR > endTE) and (endTR > startTE):
            hits.append(((endTE - startTR) + 1, "TE_overlap_TSS"))

        #  TE ending exactly on the start of a "+" transcript
        if (startTE < startTR) and (startTR == endTE) and plus and (endTR > endTE) and (endTR > startTE):
            hits.append((0, "TE_overlap_TSS"))

        #  TE covering the end of a "-" transcript: report the overlap
        if (startTE < endTR) and (endTR <= endTE) and minus and (startTR < startTE) and (startTR < endTE):
            hits.append(((endTR - startTE) + 1, "TE_overlap_TSS"))

        #  TE starting exactly on the end of a "-" transcript
        if (startTE == endTR) and (endTR < endTE) and minus and (startTR < startTE) and (startTR < endTE):
            hits.append((0, "TE_overlap_TSS"))

        #  TE strictly inside the transcript
        if (startTR < startTE) and (startTR < endTE) and (startTE < endTR) and (endTE < endTR) and (dist == 0):
            hits.append((0, "TE_in_gene"))

        #  transcript strictly inside the TE
        if (startTE < startTR) and (startTR < endTE) and (startTE < endTR) and (endTR < endTE):
            hits.append((0, "gene_in_TE"))

        #  transcript inside the TE, both ending at the same position
        #  (formerly one branch per strand; the strand is always "+" or "-")
        if (plus or minus) and (startTR > startTE) and (startTR < endTE) and (startTE < endTR) and (endTE == endTR):
            hits.append((0, "gene_in_TE"))

        #  transcript inside the TE, both starting at the same position
        if (plus or minus) and (startTR == startTE) and (startTR < endTE) and (startTE < endTR) and (endTE > endTR):
            hits.append((0, "gene_in_TE"))

        return hits

    def getClosestToStartSite(self, inputFile):
        """Classify every well-formed record of *inputFile*.

        Lines that do not match the 14-column pattern are skipped. For each
        label that applies, one row is returned: all columns of the line but
        the last one, then the reported value (stripped 14th column for
        ``TE_closest_TSS``, an int otherwise), then the label.
        """
        rows = []
        with open(inputFile, "r") as bedFile:
            for line in bedFile:
                match = self._RECORD_PATTERN.search(line)
                if match is None:
                    continue
                hits = self._classifyPair(int(match.group(2)), int(match.group(3)), match.group(6),
                                          int(match.group(9)), int(match.group(10)), int(match.group(14)))
                if not hits:
                    continue
                fields = line.split("\t")  # split once instead of once per field
                keptFields = fields[:-1]
                for value, label in hits:
                    if value is None:
                        value = fields[13].strip()
                    rows.append(keptFields + [value, label])
        return rows

    def writeOutputFromList(self, listPossitions, outputFile):
        """Write each row of *listPossitions* as one tab-separated line."""
        # "with" closes the file even when a write fails.
        with open(outputFile, 'w') as outputHandle:
            for row in listPossitions:
                outputHandle.write("%s\n" % "\t".join(str(item) for item in row))

    def run(self):
        """Check the options, classify the records and write the report."""
        self.checkoption()
        positions = self.getClosestToStartSite(self._inputFile)
        if self._cuffcom_tmap != "":
            positions = self.getClassCodeCuffcompare(self._cuffcom_tmap, positions)
        self.writeOutputFromList(positions, self._outputFile)
class Cuffcompare(object):
    """Run the external ``cuffcompare`` program on a set of transcripts.

    The tool output (stdout and stderr) is captured in ``cuffcompare.log`` in
    the current directory; when a working directory was given to the
    constructor, the ``cuffcompare.*`` files are moved into it afterwards.
    """

    def __init__(self, reference = "", transcripts = "", outprefix ="" , workingDir = "", verbosity = 3):
        """Store the file names, the output prefix and the working directory."""
        self._reference = reference
        self._transcripts = transcripts
        self._outprefix = outprefix
        self._output_Dir = workingDir
        self._verbosity = verbosity

    def setAttributesFromCmdLine(self):
        """Parse the command line (-r, -i, -o) and store the values."""
        description = "Cuffcompare provides classification, reference annotation mapping and various statistics for Cufflinks transfrags.\n"
        usage = " Cuffcompare -r -i -o \n"
        parser = RepetOptionParser(description = description, usage = usage)
        parser.add_option( '-r', '--reference', dest='reference', help='a set of known mRNAs to use as a reference for assessing the accuracy of mRNAs or gene models given in ' )
        parser.add_option( '-i', '--transcripts', dest='transcripts', help='input transfrags' )
        parser.add_option( '-o', '--outprefix', dest='outprefix', help='write all output files with out prefix', default = "cuffcompare")
        options, args = parser.parse_args()
        self.setAttributesFromOptions(options)

    def setAttributesFromOptions(self, options):
        """Copy the parsed option values onto the instance."""
        self._reference = options.reference
        self._transcripts = options.transcripts
        self._outprefix = options.outprefix

    def checkExecutables(self):
        """Raise an Exception when ``cuffcompare`` is not in the user path."""
        if not CheckerUtils.isExecutableInUserPath("cuffcompare"):
            raise Exception("ERROR: cuffcompare must be in your path")

    def checkOptions(self):
        """Raise an Exception when -i or -r is missing or does not exist.

        Options without a default are ``None`` when absent from the command
        line, so ``None`` is handled like the empty string.
        """
        if not self._transcripts:
            raise Exception("ERROR: No specified -i option!")
        if not FileUtils.isRessourceExists(self._transcripts):
            raise Exception("ERROR: input file %s does not exist!" % self._transcripts)

        if not self._reference:
            raise Exception("ERROR: No specified -r option!")
        if not FileUtils.isRessourceExists(self._reference):
            raise Exception("ERROR: reference file %s does not exist!" % self._reference)

    def getCuffcompareCmd(self, reference, transcripts, outprefix):
        """Return the cuffcompare command line as a single string."""
        cmd = 'cuffcompare -R -C -r %s %s -o %s' % (reference, transcripts, outprefix)
        return cmd

    @staticmethod
    def _readLogFile(logPath, buffsize = 1048576):
        """Return the content of *logPath*, read in chunks of *buffsize*."""
        chunks = []
        with open(logPath, 'r') as logFile:
            while True:
                chunk = logFile.read(buffsize)
                # Stop on the first empty read. The former test on the total
                # length never ended when the log size was an exact non-zero
                # multiple of the buffer size.
                if not chunk:
                    break
                chunks.append(chunk)
        return "".join(chunks)

    def run(self):
        """Run cuffcompare and collect its files in the working directory.

        Raises an Exception when the executable or an option is missing, when
        the working directory already exists, or when the command fails; in
        the last case the message holds the command and the captured output.
        """
        self.checkExecutables()
        self.checkOptions()
        workingDir = self._output_Dir
        # Checked before the command is built: this error used to be raised
        # inside the try block, whose handler then failed on the still
        # unassigned command variable.
        if workingDir and os.path.exists(workingDir):
            raise Exception("ERROR: %s already exists." % workingDir)
        cmd_cuffcompare = self.getCuffcompareCmd(self._reference, self._transcripts, self._outprefix)
        try:
            ## hide output of subprocess: stdout and stderr go to the log file
            with open("cuffcompare.log", 'w') as fstdout:
                process = subprocess.Popen(cmd_cuffcompare, shell = True, stdout = fstdout, stderr = subprocess.STDOUT)
                returncode = process.wait()
            if returncode != 0:
                raise Exception(self._readLogFile("cuffcompare.log"))
            # Without a working directory the files stay where they are;
            # os.mkdir("") used to fail here after every successful run.
            if workingDir:
                if not os.path.exists(workingDir):
                    os.mkdir(workingDir)
                # NOTE(review): only files named cuffcompare.* are moved, i.e.
                # the log and, with the default prefix, the tool output.
                os.system("mv cuffcompare.* %s" % workingDir)
        except Exception as error:
            raise Exception("ERROR in %s : %s" % (cmd_cuffcompare, error))
TEisotools-1.0/TEisotools-1.0/TEiso/Cufflinks.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/TEiso/Cufflinks.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,124 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class Cufflinks(object):
    """Run the external ``cufflinks`` program on aligned RNA-Seq reads.

    The tool output (stdout and stderr) is captured in ``cufflinks.log``,
    which is moved into the output directory after a successful run.
    """

    def __init__(self, input_mapped = "", input_transcripts = "", workingDir = "", verbosity = 3):
        """Store the input files and the output directory."""
        self._input_mapped = input_mapped
        self._transcripts = input_transcripts
        self._output_dir = workingDir
        self._verbosity = verbosity

    def setAttributesFromCmdLine(self):
        """Parse the command line (-i, -g, -o) and store the values."""
        description = "It accepts aligned RNA-Seq reads and assembles the alignments into a parsimonious set of transcripts."
        usage = "Cufflinks.py -i -g -o \n"
        parser = RepetOptionParser(description = description, usage = usage)
        parser.add_option( '-i', '--input_mapped', dest='input_mapped', help='aligned RNA-Seq reads' )
        parser.add_option( '-g' , '--input_transcripts', dest='input_transcripts', help='GTF/GFF with known transcripts' , default="" )
        parser.add_option( '-o', '--output_dir', dest='output_dir', help='write all output files to this directory', default = "")
        options, args = parser.parse_args()
        self.setAttributesFromOptions(options)

    def setAttributesFromOptions(self, options):
        """Copy the parsed option values onto the instance."""
        self._input_mapped = options.input_mapped
        self._transcripts = options.input_transcripts
        self._output_dir = options.output_dir

    def checkExecutables(self):
        """Raise an Exception when ``cufflinks`` is not in the user path."""
        if not CheckerUtils.isExecutableInUserPath("cufflinks"):
            raise Exception("ERROR: cufflinks must be in your path")

    def checkOptions(self):
        """Raise an Exception when -i is missing or a given file is absent.

        The -i option has no default and is ``None`` when absent from the
        command line, so ``None`` is handled like the empty string.
        """
        if not self._input_mapped:
            raise Exception("ERROR: No specified -i option!")
        if not FileUtils.isRessourceExists(self._input_mapped):
            raise Exception("ERROR: input file %s does not exist!" % self._input_mapped)

        # The optional transcripts file itself is checked here; the mapped
        # reads used to be checked a second time instead.
        if self._transcripts:
            if not FileUtils.isRessourceExists(self._transcripts):
                raise Exception("ERROR: reference file %s does not exist!" % self._transcripts)

    def getCufflinksCmd(self, mapped, transcripts, output_dir ):
        """Return the cufflinks command line as a single string.

        ``-g`` is added only when *transcripts* is not empty; the decision is
        taken on the argument, not on the instance attribute.
        """
        if transcripts:
            cmd = 'cufflinks %s -g %s -o %s' % (mapped, transcripts , output_dir)
        else:
            cmd = 'cufflinks %s -o %s' % (mapped , output_dir)
        return cmd

    @staticmethod
    def _readLogFile(logPath, buffsize = 1048576):
        """Return the content of *logPath*, read in chunks of *buffsize*."""
        chunks = []
        with open(logPath, 'r') as logFile:
            while True:
                chunk = logFile.read(buffsize)
                # Stop on the first empty read. The former test on the total
                # length never ended when the log size was an exact non-zero
                # multiple of the buffer size.
                if not chunk:
                    break
                chunks.append(chunk)
        return "".join(chunks)

    def run(self):
        """Run cufflinks and move its log into the output directory.

        Exits with status 1 when the output directory already exists. Raises
        an Exception when the executable or an option is missing or when the
        command fails; in the last case the message holds the command and the
        captured output.
        """
        self.checkExecutables()
        self.checkOptions()
        workingDir = self._output_dir
        if os.path.exists(workingDir):
            # Single-argument form prints the same text under Python 2 and 3.
            print("ERROR: %s already exists." % workingDir)
            sys.exit(1)
        cmd_cufflinks = self.getCufflinksCmd(self._input_mapped, self._transcripts, self._output_dir)
        try:
            ## hide output of subprocess: stdout and stderr go to the log file
            with open("cufflinks.log", 'w') as fstdout:
                process = subprocess.Popen(cmd_cufflinks, shell = True, stdout = fstdout, stderr = subprocess.STDOUT)
                returncode = process.wait()
            if returncode != 0:
                raise Exception(self._readLogFile("cufflinks.log"))
            # An empty directory name would send the log to "/cufflinks.log".
            if workingDir:
                os.system("mv cufflinks.log %s/cufflinks.log " % workingDir)
        except Exception as error:
            raise Exception("ERROR in %s : %s" % (cmd_cufflinks, error))
software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class CufflinksGTFToBed(object):
    """Convert the transcripts of a Cufflinks GTF file into BED-like lines.

    One line is written per transcript: chromosome, start, end, ``ID`` tag,
    ``gene_id`` tag, strand and the ``FPKM`` tag with three decimals.
    """

    def __init__(self, inputFile = "", outputFile = "", verbosity = 3):
        """Store the file names and create the logger of the tool."""
        self._inputFile = inputFile
        self._outputFile = outputFile
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity)

    def setAttributesFromCmdLine(self):
        """Parse the command line (-i, -o, -v) and store the values."""
        self._toolVersion = "1.0"
        description = "CufflinksGTFToBed version %s" % self._toolVersion
        epilog = "\n parses a GTF file of Cufflinks and create a bed file. \n"
        epilog += "example: CufflinksGTFToBed.py -i -o \n"
        parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion)
        parser.add_option("-i", "--inputFile", dest = "inputFile", action = "store", type = "string", help = "Input GTF File name (transcript.gtf of Cufflinks).", default = "")
        parser.add_option("-o", "--outputFile", dest = "outputFile", action = "store", type = "string", help = "output Bed File name", default = "")
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "Verbosity [optional] [default: 3]", default = 3)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy the parsed option values onto the instance."""
        self._inputFile = options.inputFile
        self._outputFile = options.outputFile
        self._verbosity = options.verbosity

    def _logAndRaise(self, errorMsg):
        """Log *errorMsg* as an error and raise it as a RepetException."""
        self._log.error(errorMsg)
        raise RepetException(errorMsg)

    def checkoption(self):
        """Derive a default output name and log (not raise) option problems.

        Without an output file, the input file name with its extension
        replaced by ``.bed`` is used.
        """
        if self._outputFile == "":
            self._outputFile = "%s.bed" % os.path.splitext(self._inputFile)[0]
        elif FileUtils.isRessourceExists(self._outputFile):
            self._log.info("Output file '%s' already exists!" % self._outputFile)

        if self._inputFile == "":
            self._log.info("Missing input file")

    def getTranscriptToBed (self, inputFile ,outputFile):
        """Write one BED-like line per transcript of *inputFile*.

        Raises an Exception with the historical "Couldn't open ... for
        writing" message only when *outputFile* cannot be opened. Errors
        raised while parsing or writing are no longer hidden behind that
        message and propagate unchanged.
        """
        try:
            filewrite = open(outputFile, "w")
        except (IOError, OSError):
            raise Exception("Couldn't open %s for writing" % outputFile)
        try:
            # The inputFile argument is parsed (the instance attribute used
            # to be read instead, ignoring the argument).
            gtfParser = GtfParser(inputFile, assemblyTools=True)
            for transcript in gtfParser.getIterator():
                if transcript.getDirection() == 1:
                    strand = "+"
                else:
                    strand = "-"
                filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\t%.3f\n" % (transcript.getChromosome(), transcript.getStart(),
                    transcript.getEnd(), transcript.getTagValue("ID"), transcript.getTagValue("gene_id"),
                    strand, float(transcript.getTagValue("FPKM"))))
        finally:
            # The handle is released even when parsing or writing fails.
            filewrite.close()

    def run(self):
        """Check the options and convert the input file."""
        self.checkoption()
        self.getTranscriptToBed(self._inputFile, self._outputFile)
class GFFToBed(object):
    """Convert the features of a GFF3 file into BED-like lines.

    One line is written per feature: chromosome, start, end, ``ID`` tag,
    ``Target`` tag and strand.
    """

    def __init__(self, inputFile = "", outputFile = "", verbosity = 3):
        """Store the file names and create the logger of the tool."""
        self._inputFile = inputFile
        self._outputFile = outputFile
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity)

    def setAttributesFromCmdLine(self):
        """Parse the command line (-i, -o, -v) and store the values."""
        self._toolVersion = "1.0"
        description = "GFFToBed version %s" % self._toolVersion
        epilog = "\n parses a GFF3 file and create a bed file. \n"
        epilog += "example: GFFToBed.py -i -o \n"
        parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion)
        parser.add_option("-i", "--inputFile", dest = "inputFile", action = "store", type = "string", help = "Input GFF3 File name.", default = "")
        parser.add_option("-o", "--outputFile", dest = "outputFile", action = "store", type = "string", help = "output Bed File name", default = "")
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "Verbosity [optional] [default: 3]", default = 3)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy the parsed option values onto the instance."""
        self._inputFile = options.inputFile
        self._outputFile = options.outputFile
        self._verbosity = options.verbosity

    def _logAndRaise(self, errorMsg):
        """Log *errorMsg* as an error and raise it as a RepetException."""
        self._log.error(errorMsg)
        raise RepetException(errorMsg)

    def checkoption(self):
        """Derive a default output name and log (not raise) option problems.

        Without an output file, the input file name with its extension
        replaced by ``.bed`` is used.
        """
        if self._outputFile == "":
            self._outputFile = "%s.bed" % os.path.splitext(self._inputFile)[0]
        elif FileUtils.isRessourceExists(self._outputFile):
            self._log.info("Output file '%s' already exists!" % self._outputFile)

        if self._inputFile == "":
            self._log.info("Missing input file")

    def getGFFToBed (self, inputFile ,outputFile):
        """Write one BED-like line per feature of *inputFile*.

        Raises an Exception with the historical "Couldn't open ... for
        writing" message only when *outputFile* cannot be opened. Errors
        raised while parsing or writing are no longer hidden behind that
        message and propagate unchanged.
        """
        try:
            filewrite = open(outputFile, "w")
        except (IOError, OSError):
            raise Exception("Couldn't open %s for writing" % outputFile)
        try:
            gffParser = GffParser(inputFile)
            for transcript in gffParser.getIterator():
                if transcript.getDirection() == 1:
                    strand = "+"
                else:
                    strand = "-"
                filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript.getChromosome(), transcript.getStart(),
                    transcript.getEnd(), transcript.getTagValue("ID"), transcript.getTagValue("Target"), strand))
        finally:
            # The handle is released even when parsing or writing fails.
            filewrite.close()

    def run(self):
        """Check the options and convert the input file."""
        self.checkoption()
        self.getGFFToBed(self._inputFile, self._outputFile)
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.parsing.GtfParser import GtfParser +from commons.core.parsing.GffParser import GffParser +from TEiso.Bowtie_build import Bowtie_build +from TEiso.Tophat import Tophat +from TEiso.Cufflinks import Cufflinks +from TEiso.Cuffcompare import Cuffcompare +from TEiso.Bedtools_closest import Bedtools_closest +from commons.core.LoggerFactory import LoggerFactory +from commons.core.utils.RepetOptionParser import RepetOptionParser +from commons.core.utils.FileUtils import FileUtils +import os +import time +import re +import sys +LOG_NAME = "repet.TEiso" + +class LaunchTEiso(object): + + def __init__(self, reference="", input_transcripts="", single_paired="", single_reads="", left_reads="", right_reads="", transposable_element = "", assembly_tool="", verbosity=3): + self._reference = reference + self._transcripts = input_transcripts + self._type = single_paired + self._single_reads = single_reads.split(",") + self._left_reads = left_reads.split(",") + self._right_reads = right_reads.split(",") + self._TE = transposable_element + self._assembly_tool = assembly_tool + self._verbosity = verbosity + self._log = 
LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity) + + def _setAttributesFromCmdLine(self): + self._toolVersion = "0.1" + description = "TEiso version %s" % self._toolVersion + epilog = "\n if reads are single:\n LaunchTEiso.py -f -g -e single -s -t -a cufflinks \n" + epilog += " if reads are paired:\n LaunchTEiso.py -f -g -e paired -l -r -t -a cufflinks \n" + parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion) + parser.add_option('-f' , '--input_reference' , dest='input_reference' , help='file with ref sequences') + parser.add_option('-g' , '--input_transcripts', dest='input_transcripts', help='GTF/GFF with known transcripts' , default="") + parser.add_option('-e' , '--single_paired' , dest='single_paired' , help='type of input reads, single or paired end', default="paired") + parser.add_option('-s' , '--single_read' , dest='single_read' , help='a single input read' , default="") + parser.add_option('-l', '--left_read' , dest='left_read' , help='left reads' , default="") + parser.add_option('-r', '--right_read' , dest='right_read' , help='right reads' , default="") + parser.add_option('-t' , '--input_transposable_element', dest='input_transposable_element', help='GFF with known transposable_element' , default="") + parser.add_option('-a' , '--assembly_tool' , dest='assembly_tool' , help='type of RNA-Seq assembly tool' , default="cufflinks") + options = parser.parse_args()[0] + self.setAttributesFromOptions(options) + + def setAttributesFromOptions(self, options): + self._reference = options.input_reference + self._transcripts = options.input_transcripts + self._type = options.single_paired + self._single_reads = options.single_read.split(",") + self._left_reads = options.left_read.split(",") + self._right_reads = options.right_read.split(",") + self._TE = options.input_transposable_element + self._assembly_tool = options.assembly_tool + + def _logAndRaise(self, errorMsg): + 
self._log.error(errorMsg) + #raise Exception(errorMsg) + sys.exit(1) + + def checkOptions(self): + if self._type == "paired": + if self._single_reads != ['']: + self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") + if self._left_reads == ['']: + self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") + if self._right_reads == ['']: + self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") + if self._right_reads == self._left_reads: + self._logAndRaise("ERROR: -l and -r options are same!") + + if self._type == "single": + if self._left_reads != ['']: + self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") + if self._right_reads != ['']: + self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") + if self._single_reads == ['']: + self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") + + if self._reference != "": + if not FileUtils.isRessourceExists(self._reference): + self._logAndRaise("ERROR: reference file %s does not exist!" % self._reference) + else: + self._logAndRaise("ERROR: No specified -f option!") + + if self._transcripts != "": + if not FileUtils.isRessourceExists(self._transcripts): + self._logAndRaise("ERROR: transcripts file %s does not exist!" % self._transcripts) + else: + self._logAndRaise("ERROR: No specified -g option!") + + if self._TE != "": + if not FileUtils.isRessourceExists(self._TE): + self._logAndRaise("ERROR: transposable elements %s does not exist!" 
% self._TE) + else: + self._logAndRaise("ERROR: No specified -t option!") + + + + def getTranscriptToBed(self, inputFile,outputFile): + try: + filewrite=open(outputFile, "w") + gtfParser = GtfParser(inputFile, assemblyTools=True) + for transcript in gtfParser.getIterator(): + if(transcript.getDirection()==1): + strand="+" + else: + strand="-" + filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\t%.3f\n" % (transcript.getChromosome(),transcript.getStart(), + transcript.getEnd(), transcript.getTagValue("ID"), transcript.getTagValue("gene_id"), + strand,float(transcript.getTagValue("FPKM")) )) + + filewrite.close() + except: + self._logAndRaise("Couldn't open %s for writing" % outputFile) + + + def getTEGFFToBed(self, inputFile, outputFile): + """TODO Dont write bed line when the strand is '.' + See Gtf parser option assemblyTools + """ + try: + filewrite=open(outputFile, "w") + gffParser = GffParser(inputFile) + for transcript in gffParser.getIterator(): + if(transcript.getDirection()==1): + strand="+" + else: + strand="-" + filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript.getChromosome(),transcript.getStart(), + transcript.getEnd(), transcript.getTagValue("ID").split("_")[0]+"_", transcript.getTagValue("Target").split("_")[0], strand) ) + + filewrite.close() + except: + self._logAndRaise("Couldn't open %s for writing" % outputFile) + + + def getTEnearPromoter (self, bedtoolsfile): + #### BEdParser.py in commons is not used because the format of this bed file is different. 
+# #Chrom starttr endtr transcript_id gene_ID strand fpkm chromte startte endte idte targetTE strandTE distance + #scaffold_1 37570 37785 GSSPFG00034586001-RA GSSPFG00034586001 + 0.0000000000 scaffold_1 33914 40164 ms162_ PotentialHostGene - 0 + + linelist = [] + tmplist = [] + with open(bedtoolsfile, "r") as bedFile: + for line in bedFile.readlines(): + m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line) + if(m != None): + start_TR = int(m.group(2))##F[1] + end_TR = int(m.group(3))##F[2] + strand_TR= m.group(6) ##[5] + start_TE = int(m.group(9))##[8] + end_TE = int(m.group(10))##[9] + dist = int(m.group(14))##[13] + if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0): + tmplist.append(line.strip()) + tmplist.append("TE_closest_TSS") + linelist.append(tmplist) + tmplist = [] + # F[1] gene F[2] + # =========================> + # ------------ + # F[8] F[9] + + if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0): + tmplist.append(line.strip()) + tmplist.append("TE_closest_TSS") + linelist.append(tmplist) + tmplist = [] + + # F[1] F[2] + # <====================== + # --------------- + + if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + overlap = (end_TE-start_TR)+1 + tmplist.append(overlap) + tmplist.append("TE_overlap_TSS") + linelist.append(tmplist) + tmplist = [] + + # F[1] gene F[2] + # =========================> + # ------------- + # F[8] F[9] + + # gene + # F[1]=========================>F[2] + + # F[8]---------------F[9] + + + if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): 
+ for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + tmplist.append(0) + tmplist.append("TE_overlap_TSS") + linelist.append(tmplist) + tmplist = [] + + ## F[1]=============================>F[2] + ## F[8]---------------F[9] + + + if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + overlap = (end_TR-start_TE)+1 + tmplist.append(overlap) + tmplist.append("TE_overlap_TSS") + linelist.append(tmplist) + tmplist = [] + + + # F[1]<======================F[2] + # --------------- + # F[8] F[9] + # + # + # F[1]<=============================F[2] + # F[8]---------------F[9] + + if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + tmplist.append(0) + tmplist.append("TE_overlap_TSS") + linelist.append(tmplist) + tmplist = [] + + # F[1]<=============================F[2] + # F[8]---------------F[9] + + if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0): + tmplist.append(line.strip()) + tmplist.append("TE_in_gene") + linelist.append(tmplist) + tmplist = [] + + # F[1] gene F[2] + # ============================== + # ----------- + # F[8] F[9] + + + if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + lenTE = (end_TE-start_TE)+1 + tmplist.append(lenTE) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]======================F[2] + # F[8]----------------------------------------------------F[9] + + + if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): + 
tmplist.append(line.strip()) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]==================================>F[2] + # F[8]----------------------------------------------------------F[9] + + if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): + tmplist.append(line.strip()) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]<==================================F[2] + # F[8]----------------------------------------------------------F[9] + + if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): + tmplist.append(line.strip()) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]==================================>F[2] + # F[8]----------------------------------------------------------F[9] + + if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): + tmplist.append(line.strip()) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]<==================================F[2] + # F[8]----------------------------------------------------------F[9] + + favorablecases = "%s_TSSoverlaps_and_TE_closest_TSS_and_inclus_ALL" % bedtoolsfile + w = open(favorablecases,'w') + for s in linelist: + line= "\t".join(str(item) for item in s) + w.write("%s\n" % line) + w.close() + + + def getClassCodeCuffcompare(self, tmap_file, bedtools_file): + class_code_dic = {} + lcode_ref = [] + tmp = [] + linetowrite =[] + with open(tmap_file) as tmap: + tmapline = tmap.readlines() + for i in range(1,len(tmapline)): + cuff_id = tmapline[i].split("\t")[4].strip() + class_code = tmapline[i].split("\t")[2].strip() + ref_id = tmapline[i].split("\t")[1].strip() + lcode_ref.append(class_code) + lcode_ref.append(ref_id) + class_code_dic[cuff_id] = lcode_ref + lcode_ref = [] + + with 
open(bedtools_file) as bedtools: + bedtoolsline = bedtools.readlines() + for i in xrange(0,len(bedtoolsline)): + tmp = bedtoolsline[i].strip().split("\t") + transcript_bedtools = bedtoolsline[i].split("\t")[3].strip() + if transcript_bedtools in class_code_dic.keys(): + tmp.append(class_code_dic[transcript_bedtools][0]) + tmp.append(class_code_dic[transcript_bedtools][1]) + else: + tmp.append("NA") + tmp.append("NA") + + linetowrite.append(tmp) + tmp=[] + + + output = "%s_with_Ref" % bedtools_file + w = open(output,'w') + line = "" + for i in xrange(0,len(linetowrite)): + for j in range(0,17): + line = line + linetowrite[i][j] + "\t" + w.write(line) + w.write("\n") + line = "" + w.close() + + def run(self): + self.checkOptions() + try: + LoggerFactory.setLevel(self._log, self._verbosity) + exeDir = os.getcwd() + workingDir = "out_TEiso_%s" % time.strftime("%Y%m%d%H%M%S") + if os.path.exists(workingDir): + self._logAndRaise("ERROR: %s already exists." % workingDir) + os.mkdir(workingDir) + referencefile = os.path.abspath(self._reference) + transcriptsfile = os.path.abspath(self._transcripts) + TEfile = os.path.abspath(self._TE) + print "workingDir >>>>> ",workingDir + os.symlink("%s" % os.path.abspath(self._reference), "%s/%s" % (workingDir, os.path.basename(self._reference))) + os.symlink("%s" % os.path.abspath(self._transcripts), "%s/%s" % (workingDir, os.path.basename(self._transcripts))) + os.chdir(workingDir) + bowtie_build_Dir = "bowtie_build" + prefixbowtie = os.path.basename(self._reference).split(".")[0] + iLaunchBowtie = Bowtie_build(referencefile, prefixbowtie, bowtie_build_Dir) + iLaunchBowtie.run() + os.chdir(exeDir) + self._log.info("Indexing genome is finished!!!!") + tophat_Dir = "tophat" + if self._type == "single": + l_single_reads = [] + for reads in range(0, len(self._single_reads)): + os.symlink("%s" % os.path.abspath(self._single_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._single_reads[reads]))) + filename = 
os.path.splitext(self._single_reads[reads])[0] + filetype = os.path.splitext(self._single_reads[reads])[1] + if filetype == ".gz": + os.system("gunzip -c %s > %s/%s" % (self._single_reads[reads], workingDir, os.path.basename(filename))) + if filetype == ".bz2": + os.system("bunzip2 -c %s > %s/%s" % (os.path.abspath(self._single_reads[reads]), workingDir, os.path.basename(filename))) + if filetype ==".fq": + filename = self._single_reads[reads] + l_single_reads.append("%s" % os.path.basename(filename)) + bowtiePrefix = "%s/%s" % (bowtie_build_Dir, prefixbowtie) + path = ("%s/%s") % (exeDir,workingDir) + os.chdir(path) + iLaunchTophat = Tophat(tophat_Dir, bowtiePrefix, self._type, l_single_reads, self._left_reads, self._right_reads) + iLaunchTophat.run() + if self._type == "paired": + l_left_reads = [] + l_right_reads = [] + for reads in range(0, len(self._left_reads)): + os.symlink("%s" % os.path.abspath(self._left_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._left_reads[reads]))) + filename = os.path.splitext(self._left_reads[reads])[0] + filetype = os.path.splitext(self._left_reads[reads])[1] + ##TODO : check type input file: mimetypes.guess_type(self._single_reads[reads]) + if filetype == ".gz": + os.system("gunzip -c %s > %s/%s" % (self._left_reads[reads],workingDir, os.path.basename(filename))) + if filetype == ".bz2": + os.system("bunzip2 -c %s > %s/%s" % (self._left_reads[reads],workingDir, os.path.basename(filename))) + if filetype ==".fq": + filename = self._left_reads[reads] + l_left_reads.append("%s" % os.path.basename(filename)) + + for reads in range(0, len(self._right_reads)): + os.symlink("%s" % os.path.abspath(self._right_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._right_reads[reads]))) + filename = os.path.splitext(self._right_reads[reads])[0] + filetype = os.path.splitext(self._right_reads[reads])[1] + if filetype == ".gz": + os.system("gunzip -c %s > %s/%s" % (self._right_reads[reads],workingDir, 
os.path.basename(filename))) + if filetype == ".bz2": + os.system("bunzip2 -c %s > %s/%s" % (self._right_reads[reads],workingDir, os.path.basename(filename))) + if filetype ==".fq": + filename = self._right_reads[reads] + l_right_reads.append("%s" % os.path.basename(filename)) + bowtiePrefix = "%s/%s" % (bowtie_build_Dir, prefixbowtie) + path= ("%s/%s") % (exeDir,workingDir) + os.chdir(path) + iLaunchTophat = Tophat(tophat_Dir, bowtiePrefix, self._type, self._single_reads, l_left_reads, l_right_reads) + iLaunchTophat.run() + self._log.info("Mapping reads is finished!!!!") + + if self._assembly_tool == "cufflinks": + cufflinks_Dir = "cufflinks" + accepted_hits = "%s/accepted_hits.bam" % tophat_Dir + iLaunchCufflinks = Cufflinks(accepted_hits, transcriptsfile , cufflinks_Dir) + iLaunchCufflinks.run() + self._log.info("%s is finished!!!!" % self._assembly_tool) + os.symlink("cufflinks/transcripts.gtf", "transcripts.gtf") + cuffcompare_Dir = "cuffcompare" + transcripts = "transcripts.gtf" + iLaunchCuffcompare = Cuffcompare(transcriptsfile, transcripts, outprefix = "cuffcompare", workingDir = cuffcompare_Dir) + iLaunchCuffcompare.run() + self._log.info("Cuffcompare is finished!!!!") + + + bedtools_closest_Dir = "bedtools_closest" + os.mkdir(bedtools_closest_Dir) + os.chdir(bedtools_closest_Dir) + + transcriptsgtf = "%s_transcripts.gtf" % self._assembly_tool + os.symlink("../%s/transcripts.gtf" % self._assembly_tool,transcriptsgtf) + transcriptsbed = "%s_transcripts.bed" % self._assembly_tool + self.getTranscriptToBed(transcriptsgtf,transcriptsbed) + TEgff = os.path.basename(os.path.splitext(TEfile)[0]) + ".gff3" + TEbed = os.path.basename(os.path.splitext(TEfile)[0]) + ".bed" + os.symlink("%s" % TEfile,TEgff) + self.getTEGFFToBed(TEgff,TEbed) + iLauncherBdc= Bedtools_closest(transcriptsbed, TEbed, "bedtools_closest_%s" % transcriptsgtf.split(".")[0]) + iLauncherBdc.run() + self._log.info("Bedtools closest is finished!!!!") + bedtoolsfile = "bedtools_closest_%s" % 
transcriptsgtf.split(".")[0] + self.getTEnearPromoter(bedtoolsfile) + tmap_file = "../cuffcompare/cuffcompare.transcripts.gtf.tmap" + bedtools_file = "%s_TSSoverlaps_and_TE_closest_TSS_and_inclus_ALL" % bedtoolsfile + + self.getClassCodeCuffcompare(tmap_file,bedtools_file) + os.chdir("..") + self._log.info("Done!!!!") + + except Exception: + self._logAndRaise("ERROR in TEiso") + + +if __name__ == "__main__": + iLaunch = LaunchTEiso() + iLaunch._setAttributesFromCmdLine() + iLaunch.run() diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/TEiso/Tophat.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/TEiso/Tophat.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,161 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import os, glob +import subprocess +import time +from commons.core.checker.CheckerUtils import CheckerUtils +from commons.core.utils.RepetOptionParser import RepetOptionParser +from commons.core.LoggerFactory import LoggerFactory +from commons.core.utils.FileUtils import FileUtils + +LOG_DEPTH = "repet.RNAseq_pipe" + + +class Tophat(object): + + def __init__(self, workingDir = "", index_genome = "", single_paired = "", single_read = "", left_read ="", right_read = "", verbosity = 3): + #self._transcripts = input_transcripts + self._outputDir = workingDir + self._bowtie_index = index_genome + self._type = single_paired + self._single_read = single_read + self._left_read = left_read + self._right_read = right_read + self._verbosity = verbosity + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) + def setAttributesFromCmdLine(self): + description = "TopHat maps short sequences from spliced transcripts to whole genomes..\n" + usage = "if reads are single:/n Tophat.py -G -b -t single -r \n" + usage +="if reads are paired:/n Tophat.py -G -b -t paired -r1 -r2 \n" + parser = RepetOptionParser(description = description, usage = usage) + # parser.add_option( '-G', '--input_transcripts', dest='input_transcripts', help='GTF/GFF with known transcripts', default = "") + parser.add_option( '-o', '--outputDir', dest='outputDir', help='write all output files to this directory', default = "") + parser.add_option( '-b', '--index_genome', dest='index_genome', help='Indexing reference genome', default = "") + 
parser.add_option( '-e', '--single_paired', dest='single_paired', help='types of input reads', default = "paired") + parser.add_option( '-s', '--single_read', dest = 'single_read', help='a single input read', default = "" ) + parser.add_option( '-l', '--left_read', dest='left_read', help='left reads', default = "" ) + parser.add_option( '-r', '--right_read', dest='right_read', help='right reads', default = "" ) + options, args = parser.parse_args() + self.setAttributesFromOptions(options) + + def setAttributesFromOptions(self, options): +## self._transcripts = options.input_transcripts + self._outputDir = options.outputDir + self._bowtie_index = options.index_genome + self._type = options.single_paired + self._single_read = options.single_read + self._left_read = options.left_read + self._right_read = options.right_read + + + def checkExecutables(self): + + if not CheckerUtils.isExecutableInUserPath("tophat2"): + raise Exception("ERROR: tophat must be in your path") + + def checkOptions(self): + if self._bowtie_index == "": + raise Exception("ERROR: No specified -b option!") + + ## if self._transcripts != "": + ## if not FileUtils.isRessourceExists(self._transcripts): + ## raise Exception("ERROR: %s does not exist!" % self._transcripts) + + if self._type == "paired": + for f in self._left_read: + if not FileUtils.isRessourceExists(f): + raise Exception("ERROR: %s does not exist!" % f) + for f in self._right_read: + if not FileUtils.isRessourceExists(f): + raise Exception("ERROR: %s does not exist!" % f) + elif self._type == "single": + for f in self._single_read: + if not FileUtils.isRessourceExists(f): + raise Exception("ERROR: %s does not exist!" 
% f) + else: + raise Exception("ERROR: No specified -t option!") + + def getTophatCmd_single(self, out_tophat, BowtiePrefix, single_read): + cmd = "tophat2 -p 8 -o %s %s %s" % (out_tophat, BowtiePrefix, ",".join(single_read)) + return cmd + + def getTophatCmd_paired(self, out_tophat, BowtiePrefix, left_read, right_read): + ####sur SGE comme saruman + #cmd = "echo " + "'tophat -p 8 -o %s ../%s %s %s'" % (out_tophat, prefix, ",".join(left_Read), ",".join(right_Read))+ "|qsub -V -cwd -pe multithread 8" + cmd = "tophat2 -p 8 -o %s %s %s %s" % (out_tophat, BowtiePrefix, ",".join(left_read), ",".join(right_read)) + #print cmd + return cmd + + def run(self): + self.checkExecutables() + self.checkOptions() + try: + if os.path.exists(self._outputDir): + raise Exception("ERROR: %s already exists." % self._outputDir) + if self._type == "single": + cmd_tophat = self.getTophatCmd_single(self._outputDir, self._bowtie_index, self._single_read) + if self._type == "paired": + cmd_tophat = self.getTophatCmd_paired(self._outputDir, self._bowtie_index, self._left_read, self._right_read) + #print cmd_tophat + ## hide output of subprocess: stdout = index_dir_stderr + fstdout = open( "tophat.log" , 'w' ) + process = subprocess.Popen(cmd_tophat, shell = True, stdout = fstdout, stderr=subprocess.STDOUT) + returncode = process.wait() + fstdout.close() + # get stderr, allowing for case where it's very large + fstdout = open("tophat.log", 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += fstdout.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + fstdout.close() + if returncode != 0: + raise Exception, stderr + + os.system("mv tophat.log %s/tophat.log " % self._outputDir) + except Exception: + raise Exception("ERROR in %s " % cmd_tophat) + + + + +if __name__ == "__main__": + iLaunch = Tophat() + iLaunch.setAttributesFromCmdLine() + iLaunch.run() + diff -r 782306d67e39 -r 22b0494ec883 
TEisotools-1.0/TEisotools-1.0/TEiso/doc/README_TEiso.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/TEiso/doc/README_TEiso.txt Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,68 @@ +============================== +README for TEiso +============================== +------------------------------------------------------------ +======================= dependencies ======================= +------------------------------------------------------------ + +Bowtie, Tophat, Cufflinks, Cuffcompare, Bedtools_closest. + +------------------------------------------------------------ +===================== INSTALLATION ========================= +------------------------------------------------------------ + +$tar -xzf TEiso-1.2.tar.gz +$cd TEiso-1.2/ +$python setup_TEiso.py install +$export PYTHONPATH=$PWD +$export PATH=$PATH:$PWD/bin + + + +------------------------------------------------------------ +======================== description ======================= +------------------------------------------------------------ + +TEiso is a Python script that finds the distance between transposable elements and the TSS of isoforms. + + +------------------------------------------------------------ +===================== command examples ===================== +------------------------------------------------------------ +if reads are single: + LaunchTEiso.py -f -g -e single -s -t -a cufflinks +if reads are paired: + LaunchTEiso.py -f -g -e paired -l -r -t -a cufflinks + +------------------------------------------------------------ +========================== options ========================= +------------------------------------------------------------ + + --version show program's version number and exit + -h, --help show this help message and exit + + -f INPUT_REFERENCE, --input_reference=INPUT_REFERENCE + file with ref sequences + + -g INPUT_TRANSCRIPTS, --input_transcripts=INPUT_TRANSCRIPTS + GTF/GFF with known transcripts + + -e SINGLE_PAIRED,
--single_paired=SINGLE_PAIRED + type of input reads, single or paired end + + -s SINGLE_READ, --single_read=SINGLE_READ + a single input read + + -l LEFT_READ, --left_read=LEFT_READ + left reads + + -r RIGHT_READ, --right_read=RIGHT_READ + right reads + + -t INPUT_TRANSPOSABLE_ELEMENT, --input_transposable_element=INPUT_TRANSPOSABLE_ELEMENT + GFF with known transposable_element + + -a ASSEMBLY_TOOL, --assembly_tool=ASSEMBLY_TOOL + type of RNA-Seq assembly tool + +- diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/__init__.py diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/LoggerFactory.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/LoggerFactory.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,139 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge.
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +## @mainpage Documentation of the REPET API +# +# Welcome to the API documentation! +# This API is a set of packages and classes for pipeline(s) development. +# +# @par The "logger" package +# +# Logging is managed via LoggerFactory. This class creates instances of the Python logging.Logger class. It's strongly encouraged to use this factory each time you need to log something. +# +# @par The "checker" package +# +# This package is a set of classes designed to facilitate development of different kinds of checks: filesystem checks, environment checks, configuration file checks ... +# +# Classes should subclass checker::IChecker or if a logger is needed: checker::AbstractChecker. +# +# Methods should raise checker::CheckerException. +# +# Use checker::ConfigChecker and checker::ConfigException for configuration file checks. +# +# checker::CheckerUtils is a set of small static methods shared by other classes of the checker package. +# +# @par The "coord" package +# +# This package is a set of classes dedicated to coordinate manipulation. +# +# A coord::Range instance records a region on a given sequence (start, end and sequence name). +# +# A coord::Map instance is a coord::Range instance and records a named region on a given sequence (start, end, sequence name and name). +# +# A coord::Set instance is a coord::Map instance and records a named region on a given sequence with an identifier (start, end, sequence name, name and id).
+# +# A coord::Align instance handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity). +# +# A coord::Path instance is a coord::Align instance and handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity) with an identifier. +# +# A coord::Match instance is a coord::Path instance and handle a chain of match(es) between two sequences, query and subject, with an identifier and the length of the input sequences. +# +# coord::Align, coord::Map, coord::Path and coord::Set come with utils classes: coord::AlignUtils, coord::MapUtils, coord::PathUtils and coord::SetUtils. +# +# @par The "seq" package +# +# This package a set of classes dedicated to sequences manipulations. +# +# A seq::Bioseq instance records a sequence with its header. seq::Bioseq comes with an utils class: seq::BioseqUtils. +# +# A seq::BioseqDB instance handle a collection of a Bioseq (header-sequence). +# +# A seq::AlignedBioseqDB instance is a multiple sequence alignment representation. +# +# A seq::FastaUtils is a set of static methods for fasta file manipulation. +# +# @par The "sql" package +# +# This package is dedicated to persistance of coord package objects. +# All classes come with dedicated interfaces. Use these interfaces for class manipulation. +# Class names patterns are ITable*Adaptator and Table*Adaptator. +# +# sql::ITablePathAdaptator, sql::TablePathAdaptator / +# sql::ITableSetAdaptator, sql::TableSetAdaptator / +# sql::ITableSeqAdaptator, sql::TableSeqAdaptator / +# sql::ITableMapAdaptator, sql::TableMapAdaptator / +# sql::ITableMatchAdaptator, sql::TableMatchAdaptator. +# + +import logging +import sys + +DEFAULT_LEVEL = 1 +DEFAULT_FORMAT = "%(asctime)s - %(module)s - %(levelname)s - %(message)s" +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" + +## Use this class to create a instance of logging class. 
+# +class LoggerFactory(object): + + def createLogger(name, verbosity = DEFAULT_LEVEL, format = DEFAULT_FORMAT, out = sys.stdout): + log = logging.getLogger(name) + + hasStreamHandler = False + for handler in log.handlers: + if handler.__class__ == logging.StreamHandler: + hasStreamHandler = True + break + if not hasStreamHandler: + formatter = logging.Formatter(format, DATE_FORMAT) + handler = logging.StreamHandler(out) + handler.setFormatter(formatter) + log.addHandler(handler) + + LoggerFactory.setLevel(log, verbosity) + return log + + createLogger = staticmethod(createLogger) + + def setLevel(log, verbosity): + log.disabled = False + if verbosity >= 4: + log.setLevel(logging.DEBUG) + elif verbosity == 3: + log.setLevel(logging.INFO) + elif verbosity == 2: + log.setLevel(logging.WARNING) + elif verbosity == 1: + log.setLevel(logging.ERROR) + elif verbosity == 0: + log.disabled = True + + setLevel = staticmethod(setLevel) diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/__init__.py diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/checker/AbstractChecker.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/checker/AbstractChecker.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,61 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.checker.IChecker import IChecker +from commons.core.LoggerFactory import LoggerFactory + + +## Enable a Logger in your Checker. +# +# Subclasses of AbstractChecker have a already a logger enabled (referenced by self._log attribute). Subclasses also already implements IChecker. +# All you have to do is to call __init__() method in your own constructor. 
+class AbstractChecker( IChecker ): + + ## Constructor + # + # @param logFileName name of log file where logger outputs + # + def __init__(self, logFileName): + self._log = LoggerFactory.createLogger(logFileName) + + + ## Set (change) default logger + # + # @param logger a new logger + # + def setLogger(self, logger): + self._log = logger + + + ## Return the logger instance + # + def getLogger(self): + return self._log diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/checker/CheckerException.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/checker/CheckerException.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,52 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +## Exception raised during check +# +# This class wraps Exception class +# +class CheckerException( Exception ): + + ## Constructor + # + # @param msg message embedded in Exception class + def __init__(self,msg=""): + self.messages = [] + self.msg = msg + Exception.__init__(self, msg) + + + def setMessages(self,lMessages): + self.messages = lMessages + + + def getMessages(self): + return self.messages diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/checker/CheckerUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/checker/CheckerUtils.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,315 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import os +import sys +import re +import glob +from ConfigParser import NoOptionError +from ConfigParser import NoSectionError +from commons.core.checker.CheckerException import CheckerException + + +## A set of static methods used to perform checks. 
+# +# +class CheckerUtils( object ): + + ## Check if blastName param is in ["blastn", "blastp", "blastx", "tblastn", "tblastx"] + # + # @param blastName name to check + # @return True if name is in list False otherwise + # + def isBlastNameNotInBlastValues( blastName ): + blastValuesSet = set( ["blastn", "blastp", "blastx", "tblastn", "tblastx"] ) + blastNameSet = set( [ blastName ] ) + return not blastNameSet.issubset( blastValuesSet ) + + isBlastNameNotInBlastValues = staticmethod( isBlastNameNotInBlastValues ) + + + ## Check if param is NOT "TRUE" and NOT false "FALSE" + # + # @param param str to check + # @return True if param is not eq to "TRUE" AND not eq to "FALSE", false otherwise + # + def isNotTRUEisNotFALSE( param ): + return param != "TRUE" and param != "FALSE" + + isNotTRUEisNotFALSE = staticmethod( isNotTRUEisNotFALSE ) + + + ## Check if resource (file or dir) do NOT exists + # + # @param resource file or dir to check + # @return True if resource exists False otherwise + # + def isRessourceNotExits( resource ): + return not os.path.exists( resource ) + + isRessourceNotExits = staticmethod( isRessourceNotExits ) + + + ## Check a specific E-value format: de-dd + # + # @param param E-value to check + # @return True if format is de-dd False otherwise + # + def isNotAeValueWithOneDigit2DecimalsAtLeast( param ): + # \d\d stands for 2 digits and more ??? 
+ return not re.match( "\de\-\d\d", param ) + + isNotAeValueWithOneDigit2DecimalsAtLeast = staticmethod( isNotAeValueWithOneDigit2DecimalsAtLeast ) + + + ## Check a number format + # + # @param param value to check + # @return True if param is a number (d+) False otherwise + # + def isNotANumber( param ): + return not re.match( "\d+", param ) + + isNotANumber = staticmethod( isNotANumber ) + + + ## Check if an executable is in the user's PATH + # + # @param exeName name of the executable + # @return True if executable in user's PATH, False otherwise + # + def isExecutableInUserPath( exeName ): + dirPathList = os.environ["PATH"].split(":") + for dirPath in dirPathList: + if os.path.isdir( dirPath ): + try: + binPathList = glob.glob( dirPath + "/*" ) + except OSError, e: + continue + for binPath in binPathList: + bin = os.path.basename( binPath ) + if bin == exeName: + return True + return False + + isExecutableInUserPath = staticmethod( isExecutableInUserPath ) + + + ## Return the full path of a given executable + # + def getFullPathFromExecutable( exeName ): + lDirFromUserPath = os.environ["PATH"].split(":") + for dir in lDirFromUserPath: + if os.path.isdir( dir ): + try: + lExecutables = glob.glob( "%s/*" % ( dir ) ) + except OSError, e: + continue + for exe in lExecutables: + path, exe = os.path.split( exe ) + if exe == exeName: + return path + return "" + + getFullPathFromExecutable = staticmethod( getFullPathFromExecutable ) + + + #TODO: to remove ? + ## Check if a queue Name is valid. 
Warning: Only with the queue manager SGE + # + # @param fullQueueName name of the queue to test (with or without parameters) + # @return True if queue name is valid, False otherwise + # + def isQueueNameValid( fullQueueName ): + queueName = fullQueueName.split()[0] + if queueName == "none": + return True + queueFile = "queueName.txt" + if not CheckerUtils.isExecutableInUserPath( "qconf" ): + msg = "executable 'qconf' can't be found" + sys.stderr.write( "%s\n" % ( msg ) ) + return False + cmd = "qconf -sql > " + queueFile + os.system( cmd ) + queueFileHandler = open( queueFile, "r" ) + lQueueNames = queueFileHandler.readlines() + queueFileHandler.close() + os.remove( queueFile ) + queueNameValid = False + for qName in lQueueNames: + qName = qName.strip() + if qName == queueName: + queueNameValid = True + break + return queueNameValid + + isQueueNameValid = staticmethod( isQueueNameValid ) + + + ## Check if a string length is lower or equal than 15 + # + # @param strName any string + # @return True if string length is <= 15, False otherwise + # + def isMax15Char( strName ): + return (len(strName) <= 15 ) + + isMax15Char = staticmethod( isMax15Char ) + + + ## Check if a string is made with only alphanumeric or underscore character + # + # @param strName any string + # @return True if string is with alphanumeric or underscore, False otherwise + # + def isCharAlphanumOrUnderscore( strName ): + # authorized ALPHABET [a-z,A-Z,0-9,_] + p = re.compile('\W') + errList=p.findall(strName) + if len( errList ) > 0 : + return False + else: + return True + + isCharAlphanumOrUnderscore = staticmethod( isCharAlphanumOrUnderscore ) + + + ## Check if sectionName is in the configuration file + # + # @param config filehandle of configuration file + # @param sectionName string of section name to check + # @exception NoSectionError: if section not found raise a NoSectionError + # + def checkSectionInConfigFile( config, sectionName ): + if not (config.has_section(sectionName)): + raise 
NoSectionError(sectionName) + + checkSectionInConfigFile = staticmethod( checkSectionInConfigFile ) + + + ## Check if an option is in a specified section in the configuration file + # + # @param config filehandle of configuration file + # @param sectionName string of section name + # @param optionName string of option name to check + # @exception NoOptionError: if option not found raise a NoOptionError + # + def checkOptionInSectionInConfigFile( config, sectionName, optionName ): + config.get( sectionName, optionName ) + + checkOptionInSectionInConfigFile = staticmethod( checkOptionInSectionInConfigFile ) + + + ## Check version number coherency between configFile and CHANGELOG + # + # @param config ConfigParser Instance of configuration file + # @param changeLogFileHandle CHANGELOG file handle + # @exception NoOptionError: if option not found raise a NoOptionError + # + def checkConfigVersion( changeLogFileHandle, config ): + line = changeLogFileHandle.readline() + while not line.startswith("REPET release "): + line = changeLogFileHandle.readline() + numVersionChangeLog = line.split()[2] + + numVersionConfig = config.get("repet_env", "repet_version") + + if not numVersionChangeLog == numVersionConfig: + message = "*** Error: wrong config file version. 
Expected version num is " + numVersionChangeLog + " but actual in config file is " + numVersionConfig + raise CheckerException(message) + + checkConfigVersion = staticmethod( checkConfigVersion ) + + + ## Get version number from CHANGELOG + # + # @param changeLogFile CHANGELOG file name + # + def getVersionFromChangelogFile(changeLogFileName): + with open(changeLogFileName) as changeLogFileHandle: + line = changeLogFileHandle.readline() + while not line.startswith("REPET release "): + line = changeLogFileHandle.readline() + numVersionChangeLog = line.split()[2] + return numVersionChangeLog + + + getVersionFromChangelogFile = staticmethod( getVersionFromChangelogFile ) + + + ## Check if headers of an input file contain only alpha numeric characters and "_ : . -" + # + # @param fileHandler file handle + # @exception CheckerException if bad header raise a CheckerException + # + def checkHeaders( fileHandler ): + lHeaders = CheckerUtils._getHeaderFromFastaFile(fileHandler) + p = re.compile('[^a-zA-Z0-9_:\.\-]', re.IGNORECASE) + lWrongHeaders = [] + for header in lHeaders: + errList=p.findall(header) + if len( errList ) > 0 : + lWrongHeaders.append(header) + if lWrongHeaders != []: + exception = CheckerException() + exception.setMessages(lWrongHeaders) + raise exception + + checkHeaders = staticmethod( checkHeaders ) + + + def _getHeaderFromFastaFile( inFile ): + lHeaders = [] + while True: + line = inFile.readline() + if line == "": + break + if line[0] == ">": + lHeaders.append( line[1:-1] ) + return lHeaders + + _getHeaderFromFastaFile = staticmethod( _getHeaderFromFastaFile ) + + + ## Return True if an option is in a specified section in the configuration file, False otherwise + # + # @param config handler of configuration file + # @param sectionName string of section name + # @param optionName string of option name to check + # + def isOptionInSectionInConfig( configHandler, section, option ): + try: + CheckerUtils.checkOptionInSectionInConfigFile( configHandler, 
section, option ) + except NoOptionError: + return False + return True + + isOptionInSectionInConfig = staticmethod( isOptionInSectionInConfig ) diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/checker/ConfigChecker.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/checker/ConfigChecker.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,225 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +import re +import sys +from commons.core.utils.RepetConfigParser import RepetConfigParser +from commons.core.checker.ConfigValue import ConfigValue +from commons.core.checker.IChecker import IChecker +from commons.core.checker.RepetException import RepetException +from commons.core.utils.FileUtils import FileUtils + +#TODO: add more tests! + +class Rule(object): + + def __init__(self, mandatory= False, isPattern=False, type="", set=(), help =""): + self.mandatory = mandatory + self.isPattern = isPattern + self.type = type + self.set = set + self.help = help + +class ConfigRules(object): + + def __init__(self, configName = "", configDescription = ""): + self.configName = configName + self.configDescription = configDescription + self.dRules4Sections={} + + def _addRule(self, section, option="DEFAULT", mandatory=False, isPattern=False, type="", set=(), help =""): + if not self.dRules4Sections.has_key(section): + self.dRules4Sections[section] = {} + self.dRules4Sections[section][option]=Rule(mandatory, isPattern, type.lower(), set) + + def addRuleSection(self, section, mandatory=False, isPattern=False, help = ""): + self._addRule(section = section, option = "DEFAULT", mandatory = mandatory, isPattern = isPattern, help = "") + + def addRuleOption(self, section, option, mandatory=False, isPattern=False, type="", set=(), help = ""): + self._addRule(section = section, option = option, mandatory = mandatory, isPattern = isPattern, type = type, set=set , help = "") + + def isSectionMandatory(self, section): + if self.dRules4Sections.has_key(section): + if self.dRules4Sections[section].has_key("DEFAULT"): + return self.dRules4Sections[section]["DEFAULT"].mandatory + return False + + def isOptionMandatory(self, section, option): + if self.dRules4Sections.has_key(section): + if self.dRules4Sections[section].has_key(option): + return self.dRules4Sections[section][option].mandatory + return False + + def getRule(self, section, option): + if 
self.dRules4Sections.has_key(section): + if self.dRules4Sections[section].has_key(option): + return self.dRules4Sections[section][option] + return None + +class ConfigChecker(IChecker): + + def __init__ (self, cfgFileName, iCfgRules): + self._configFileName = cfgFileName + self._iConfigRules = iCfgRules + self._iRawConfig = ConfigValue() + self._iExtendedConfigRules = ConfigRules() + + def readConfigFile(self): + iConfig = RepetConfigParser() + try: + iConfig.readfp(open(self._configFileName)) + return iConfig +# TODO USE OF CONFIG ERROR +# if DuplicateSectionError: +# raise Exception ("Duplicate section exist in config file %s" %(self._configFileName )) + except : + raise RepetException ("Unexpected error: %s" % sys.exc_info()[0]) + + def setRawConfig(self, iConfig ): + for sectionName in iConfig.sections(): + for optionName in iConfig.options(sectionName): + optionValue = iConfig.get(sectionName, optionName) + self._iRawConfig.set(sectionName, optionName, optionValue) + + def getOptionValueAccordingRule(self, iConfig, sectionName, optionName): + optionRule = self._iExtendedConfigRules.getRule(sectionName, optionName) + if optionRule == None : + return iConfig.get(sectionName, optionName) + + if optionRule.type == "int": + optionValue = iConfig.getint(sectionName, optionName) + elif optionRule.type == "float": + optionValue = iConfig.getfloat(sectionName, optionName) + elif optionRule.type == "bool" or optionRule.type == "boolean": + optionValue = iConfig.getboolean(sectionName, optionName) + else: + optionValue = iConfig.get(sectionName, optionName) + if optionRule.set!=() and not(optionValue in optionRule.set): + raise RepetException ("value must be in '%s'" % str(optionRule.set)) + + return optionValue + + def setConfig(self, iConfig ): + config = ConfigValue() + valueErr = "" + for sectionName in iConfig.sections(): + for optionName in iConfig.options(sectionName): + try: + optionValue = self.getOptionValueAccordingRule(iConfig, sectionName, optionName ) + 
config.set(sectionName, optionName, optionValue) + except RepetException, re : + valueErr += "\n\t- %s" % re.getMessage() + if valueErr == "": + self._iRawConfig = config + else: + raise RepetException ("Following errors occurred: %s\n" % valueErr) + + def checkIfExistsConfigFile (self): + if not (FileUtils.isRessourceExists(self._configFileName)): + raise RepetException("CONFIG FILE not found - '%s'" % self._configFileName) + + def checkMandatorySections (self): + missingSection = "" + for sectionName in self._iExtendedConfigRules.dRules4Sections.keys(): + if self._iExtendedConfigRules.isSectionMandatory(sectionName) and not self._iRawConfig.has_section(sectionName): + missingSection += "\n - %s" %(sectionName) + if missingSection != "": + raise RepetException ("Error in configuration file %s, following sections are missing:%s\n" % (self._configFileName, missingSection)) + + def checkMandatoryOptions (self): + missingOption = "" + for sectionName in self._iExtendedConfigRules.dRules4Sections.keys(): + if self._iExtendedConfigRules.isSectionMandatory(sectionName) or self._iRawConfig.has_section(sectionName) : + dRules4OptionsOfThisSection = self._iExtendedConfigRules.dRules4Sections[sectionName] + for optionName in dRules4OptionsOfThisSection.keys(): + if optionName != "DEFAULT" and self._iExtendedConfigRules.isOptionMandatory(sectionName, optionName) and not self._iRawConfig.has_option(sectionName, optionName): + missingOption += "\n - [%s]: %s" % (sectionName, optionName) + if missingOption != "": + raise RepetException ("Error in configuration file %s, following options are missing: %s\n" % (self._configFileName, missingOption)) + + def getSectionNamesAccordingPatternRules (self, sectionWordOrPattern, isPattern): + lSectionsFoundAccordingPatternRules=[] + if isPattern == False: + if self._iRawConfig.has_section(sectionWordOrPattern): + lSectionsFoundAccordingPatternRules.append(sectionWordOrPattern) + else: + for sectionName in self._iRawConfig.sections(): + if 
re.search(sectionWordOrPattern, sectionName, re.IGNORECASE): + lSectionsFoundAccordingPatternRules.append(sectionName) + return lSectionsFoundAccordingPatternRules + + def getOptionsNamesAccordingPatternRules(self, sectionName, optionWordOrPattern, isPattern): + lOptionsFoundAccordingPatternRules=[] + if isPattern == False: + if self._iRawConfig.has_option(sectionName, optionWordOrPattern): + lOptionsFoundAccordingPatternRules.append(optionWordOrPattern) + else : + for optionName in self._iRawConfig.options(sectionName): + if re.search(optionWordOrPattern, optionName, re.IGNORECASE)!= None: + lOptionsFoundAccordingPatternRules.append(optionName) + return lOptionsFoundAccordingPatternRules + + def extendConfigRulesWithPatternRules(self): + for sectionName in self._iConfigRules.dRules4Sections.keys(): + dRules4OptionsOfThisSection = self._iConfigRules.dRules4Sections[sectionName] + lRawSections=[] + if dRules4OptionsOfThisSection.has_key("DEFAULT"): + mandatorySection = dRules4OptionsOfThisSection["DEFAULT"].mandatory + isPatternSection = dRules4OptionsOfThisSection["DEFAULT"].isPattern + lRawSections=self.getSectionNamesAccordingPatternRules(sectionName, isPatternSection) + for rawSectionName in lRawSections: + self._iExtendedConfigRules.addRuleSection(rawSectionName, "DEFAULT", mandatorySection ) + if mandatorySection and (len(lRawSections)==0): + self._iExtendedConfigRules.addRuleSection(sectionName, "DEFAULT", mandatorySection ) + else: + lRawSections.append(sectionName) + for optionName in dRules4OptionsOfThisSection.keys(): + setOption = dRules4OptionsOfThisSection[optionName].set + isPatternOption = dRules4OptionsOfThisSection[optionName].isPattern + mandatoryOption = dRules4OptionsOfThisSection[optionName].mandatory + typeOption = dRules4OptionsOfThisSection[optionName].type + if optionName != "DEFAULT": + for rawSectionName in lRawSections: + lRawOptions=self.getOptionsNamesAccordingPatternRules(rawSectionName, optionName, isPatternOption) + for 
rawOptionName in lRawOptions: + self._iExtendedConfigRules.addRuleOption(rawSectionName, rawOptionName, mandatoryOption, False, typeOption, setOption) + if mandatoryOption and (len(lRawOptions)==0): + self._iExtendedConfigRules.addRuleOption(rawSectionName, optionName, mandatoryOption, False, typeOption, setOption) + + def getConfig(self): + self.checkIfExistsConfigFile() + iConfig = self.readConfigFile() + self.setRawConfig(iConfig) + self.extendConfigRulesWithPatternRules() + self.checkMandatorySections() + self.checkMandatoryOptions() + self.setConfig(iConfig) + return self._iRawConfig \ No newline at end of file diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/checker/ConfigException.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/checker/ConfigException.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,53 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +from commons.core.checker.RepetException import RepetException + +## A exception raised by check() method of class ConfigChecker +# +# This class allow storage of multiple messages (see messages attribute). +# Example: use one instance of ConfigException class for one section in configuration file. +# All messages relatives to this section are stored in messages attribute. 
+class ConfigException( RepetException ): + + ## Constructor + # + # @param msg message embedded in Exception class + # + def __init__(self, msg, messages = []): + RepetException.__init__(self, msg) + self.messages = messages + + def getMessages(self): + return self.messages + + def setMessages(self, messages): + self.messages = messages + diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/checker/ConfigValue.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/checker/ConfigValue.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,76 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
## In-memory store of option values grouped by section
#
# Values are kept in the dictionary 'dOptionsValues4Sections', whose keys are
# section names and whose values are dictionaries {optionName: optionValue}.
#
class ConfigValue(object):

    ## Constructor: start with no section
    #
    def __init__(self):
        self.dOptionsValues4Sections = {}

    ## Return True if the section exists, False otherwise
    #
    # @param sectionName name of the section
    #
    def has_section(self, sectionName):
        # 'in' replaces dict.has_key(), which no longer exists in Python 3
        return sectionName in self.dOptionsValues4Sections

    ## Return True if the option exists in the given section, False otherwise
    #
    # @param sectionName name of the section
    # @param optionName name of the option
    #
    def has_option(self, sectionName, optionName):
        if not self.has_section(sectionName):
            return False
        return optionName in self.dOptionsValues4Sections[sectionName]

    ## Return the list of section names
    #
    def sections(self):
        # list() keeps the return type a list with Python 2 and Python 3
        return list(self.dOptionsValues4Sections)

    ## Return the list of option names of a section (empty list if the section is unknown)
    #
    # @param sectionName name of the section
    #
    def options(self, sectionName):
        if not self.has_section(sectionName):
            return []
        return list(self.dOptionsValues4Sections[sectionName])

    ## Return the value of an option, or None if the section or the option is unknown
    #
    # @param sectionName name of the section
    # @param optionName name of the option
    #
    def get(self, sectionName, optionName):
        if self.has_option(sectionName, optionName):
            return self.dOptionsValues4Sections[sectionName][optionName]
        return None

    ## Set the value of an option, creating the section if needed
    #
    # @param sectionName name of the section
    # @param optionName name of the option
    # @param optionValue value to store
    #
    def set(self, sectionName, optionName, optionValue):
        if not self.has_section(sectionName):
            self.dOptionsValues4Sections[sectionName] = {}
        self.dOptionsValues4Sections[sectionName][optionName] = optionValue

    ## Replace the whole dictionary of sections
    #
    # @param dOptionsValues4Sections dictionary {sectionName: {optionName: optionValue}}
    #
    def setdOptionsValues4Sections(self, dOptionsValues4Sections):
        self.dOptionsValues4Sections = dOptionsValues4Sections

    ## Equal operator: same exact type and same stored dictionary
    #
    def __eq__(self, o):
        if type(o) is not type(self):
            return False
        return self.dOptionsValues4Sections == o.dOptionsValues4Sections

    ## Unequal operator
    #
    def __ne__(self, o):
        return not self.__eq__(o)
b/TEisotools-1.0/TEisotools-1.0/commons/core/checker/IChecker.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,45 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +## Interface for a checker +# +# This class emulates an interface for a checker. +# +# All checkers are subclasses of IChecker. 
class IChecker(object):
    """Emulate an interface for checkers; concrete checkers override check()."""

    def check(self, arg=""):
        """Perform the check; this base implementation does nothing.

        Implementations raise a CheckerException if an error occurred.

        @param arg a collecting parameter: put here all you need to perform check
        """
        return None
## A checker for a configuration file
#
#
# A configuration file is formatted as follow:
#
# [section1]
#
# option_name1: option_value1
#
# option_name2: option_value2
#
# option_name3: option_value3
#
# [section2]
#
# ...
#
#
# This class performs 3 checks on a configuration file:
#
# (i) check if file exists
#
# (ii) check if section exists
#
# (iii) check if option exists
#
class ConfigChecker( IChecker ):

    ## Constructor A checker for configuration file.
    #
    # @param sectionName name of section to check in configuration file
    # @param optionsDict dictionary with option(s) to check as keys and empty strings ("") as values
    def __init__ (self, sectionName, optionsDict):
        self._sectionName = sectionName
        self._optionsDict = optionsDict


    ## Perform 3 checks : file exists, sections exists, option exists
    #
    # On success, each value of the options dictionary given to the
    # constructor is replaced by the value read from the configuration file.
    #
    # @param configFile configuration file to check
    # @exception ConfigException with a list of messages
    def check (self, configFile):
        config = ConfigParser.ConfigParser()
        msg = []
        try:
            # 'with' closes the handle even when parsing fails
            with open(configFile) as configFileHandler:
                config.readfp( configFileHandler )
        except IOError as e:
            # str(e) carries errno, reason and file name; e.message is
            # normally an empty string for an IOError raised by open()
            msg.append("CONFIG FILE not found - " + str(e))
            raise ConfigException("", msg)

        if not (config.has_section(self._sectionName)):
            msg.append("[" + self._sectionName + "]" + " section not found - ")
            raise ConfigException("", msg)

        # collect every missing option before raising, so that the caller
        # gets the complete list in a single exception
        isExceptionOccured = False
        for key in list(self._optionsDict):
            try:
                self._optionsDict[key] = config.get(self._sectionName, key)
            except NoOptionError as e:
                msg.append("[" + self._sectionName + "]" + " - " + str(e))
                isExceptionOccured = True

        if (isExceptionOccured):
            raise ConfigException("", msg)
diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/checker/RepetException.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/checker/RepetException.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,51 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Base exception, holding a message that can be read and replaced
#
class RepetException(Exception):

    ## Constructor
    #
    # @param msg message embedded in the exception
    #
    # @note the message is also given to Exception.__init__() so that 'args',
    #       repr() and pickling keep it (they were empty before)
    #
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self._message = msg

    ## Convert the exception into a string
    #
    def __str__(self):
        # __str__ must return a string even if the message is not one
        return str(self._message)

    ## Return the message
    #
    def getMessage(self):
        return self._message

    ## Replace the message
    #
    # @param msg the new message
    #
    def setMessage(self, msg):
        self._message = msg
        # keep 'args' consistent with the message
        self.args = (msg,)


## Exception for data-related errors; same behaviour as RepetException
#
class RepetDataException(RepetException):

    ## Constructor
    #
    # @param msg message embedded in the exception
    #
    def __init__(self, msg):
        RepetException.__init__(self, msg)
## Handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity)
#
class Align( object ):

    __slots__ = ("range_query", "range_subject", "e_value", "score", "identity", '__dict__')

    ## Constructor
    #
    # @param range_q: a Range instance for the query (a new Range() if omitted)
    # @param range_s: a Range instance for the subject (a new Range() if omitted)
    # @param e_value: E-value of the match
    # @param identity: identity percentage of the match
    # @param score: score of the match
    #
    # @note the range defaults are None rather than Range(): a default Range()
    #       is built once, when the function is defined, and would be shared
    #       (and mutated, e.g. by reset()) by all instances created without
    #       explicit ranges.
    #
    def __init__(self, range_q=None, range_s=None, e_value=0, score=0, identity=0):
        if range_q is None:
            range_q = Range()
        if range_s is None:
            range_s = Range()
        self.range_query = range_q
        self.range_subject = range_s
        self.e_value = float(e_value)
        self.score = float(score)
        self.identity = float(identity)

    ## Return True if the instance is empty, False otherwise
    #
    def isEmpty(self):
        return self.range_query.isEmpty() or self.range_subject.isEmpty()

    ## Equal operator
    #
    # @param o an object; only instances of the exact same type can be equal
    #
    def __eq__(self, o):
        if type(o) is not type(self):
            return False
        return self.range_query == o.range_query \
            and self.range_subject == o.range_subject \
            and self.e_value == o.e_value \
            and self.score == o.score \
            and self.identity == o.identity

    ## Unequal operator
    #
    # @param o an object to compare with
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## Convert the object into a string
    #
    # @note used in 'print myObject'
    #
    def __str__( self ):
        return self.toString()

    ## Read attributes from an Align file
    #
    # @param fileHandler: file handler of the file being read
    # @return: 1 on success, 0 at the end of the file or if the line has less than 5 fields
    #
    # @note setFromTuple() reads 9 fields, hence a line with 5 to 8 fields raises an IndexError
    #
    def read(self, fileHandler):
        self.reset()
        line = fileHandler.readline()
        if line == "":
            return 0
        tokens = line.split("\t")
        if len(tokens) < 5:
            return 0
        self.setFromTuple(tokens)
        return 1

    ## Set attributes from tuple
    #
    # @param tuple a tuple with (queryName,queryStart,queryEnd,subjectName,subjectStart,subjectEnd,E-value,score,identity)
    # @note data are loaded such that the query is always on the direct strand
    #
    def setFromTuple( self, tuple ):
        # always work on fresh Range instances so that ranges possibly shared
        # with another object are never modified
        self.range_query = Range()
        self.range_subject = Range()
        if int(tuple[1]) < int(tuple[2]):
            self.range_query.setFromTuple( ( tuple[0], tuple[1], tuple[2] ) )
            self.range_subject.setFromTuple( ( tuple[3], tuple[4], tuple[5] ) )
        else:
            # query given on the reverse strand: swap start and end of both ranges
            self.range_query.setFromTuple( ( tuple[0], tuple[2], tuple[1] ) )
            self.range_subject.setFromTuple( ( tuple[3], tuple[5], tuple[4] ) )
        self.e_value = float(tuple[6])
        self.score = float(tuple[7])
        self.identity = float(tuple[8])

    ## Reset
    #
    def reset( self ):
        self.range_query.reset()
        self.range_subject.reset()
        self.e_value = 0
        self.score = 0
        self.identity = 0

    ## Return the attributes as a formatted string
    #
    # @note fields are tab-separated; the score is written as an integer ('%i')
    #
    def toString(self):
        string = "%s" % ( self.range_query.toString() )
        string += "\t%s" % ( self.range_subject.toString() )
        string += "\t%g\t%i\t%f" % ( self.e_value, self.score, self.identity )
        return string


    ## Return the attributes as a GFF-formatted string
    #
    # @param source string for the 'source' column
    # @param type string for the 'type' column
    # @param phase string for the 'phase' column
    # @param ID string identifier; if empty, one is built from the current time
    # @param Parent string identifier of the parent; omitted if empty
    #
    # @note the instance is reversed in place if the subject is not on the direct strand
    #
    def toStringAsGff( self, source="REPET", type="match", phase=".", ID="", Parent="" ):
        if not self.isSubjectOnDirectStrand():
            self.reverse()
        string = "%s" % ( self.getQueryName() )
        string += "\t%s" % ( source )
        string += "\t%s" % ( type )
        string += "\t%s" % ( self.getQueryMin() )
        string += "\t%s" % ( self.getQueryMax() )
        string += "\t%g" % ( self.e_value )
        string += "\t%s" % ( self.getQueryStrand() )
        string += "\t%s" % ( phase )
        attributes = ""
        if ID != "":
            attributes += "ID=%s" % ( ID )
        else:
            # the generated identifier is a string of digits: it has to be
            # formatted with '%s' ('%i' raises a TypeError on a string)
            attributes += "ID=%s" % ( str(time.time())[-8:-1].replace(".","") )
        if Parent != "":
            attributes += ";Parent=%s" % ( Parent )
        attributes += ";Target=%s %i %i" % ( self.getSubjectName(), self.getSubjectStart(), self.getSubjectEnd() )
        string += "\t%s" % ( attributes )
        return string


    ## Reverse query and subject
    #
    def reverse(self):
        self.range_query.reverse()
        self.range_subject.reverse()

    ## Show the attributes
    #
    def show(self):
        # with a single argument, this form works with Python 2 and Python 3
        print(self.toString())

    ## Write attributes into an Align file
    #
    # @param fileHandler: file handler of the file being filled
    #
    def write(self, fileHandler):
        fileHandler.write("%s\n" % (self.toString()))

    ## Save attributes into an Align file
    #
    # @param file: name of the file being filled (opened in append mode)
    #
    def save(self, file):
        # 'with' closes the file even if the writing fails
        with open( file, "a" ) as fileHandler:
            self.write( fileHandler )

    ## Return the score
    #
    def getScore(self):
        return self.score

    ## Return the identity
    #
    def getIdentity(self):
        return self.identity

    ## Return the E-value
    #
    def getEvalue(self):
        return self.e_value

    ## Return the length on the query
    #
    def getLengthOnQuery(self):
        return self.range_query.getLength()

    ## Return the name of the query
    #
    def getQueryName( self ):
        return self.range_query.seqname

    ## Return the start of the query
    #
    def getQueryStart( self ):
        return self.range_query.start

    ## Return the end of the query
    #
    def getQueryEnd( self ):
        return self.range_query.end

    ## Return the min of the query
    #
    def getQueryMin( self ):
        return self.range_query.getMin()

    ## Return the max of the query
    #
    def getQueryMax( self ):
        return self.range_query.getMax()

    ## Return the strand of the query
    #
    def getQueryStrand( self ):
        return self.range_query.getStrand()

    ## Return the length on the subject
    #
    def getLengthOnSubject(self):
        return self.range_subject.getLength()

    ## Return the name of the subject
    #
    def getSubjectName( self ):
        return self.range_subject.seqname

    ## Return the start of the subject
    #
    def getSubjectStart( self ):
        return self.range_subject.start

    ## Return the end of the subject
    #
    def getSubjectEnd( self ):
        return self.range_subject.end

    ## Return the min of the subject
    #
    def getSubjectMin( self ):
        return self.range_subject.getMin()

    ## Return the max of the subject
    #
    def getSubjectMax( self ):
        return self.range_subject.getMax()

    ## Return the strand of the subject
    #
    def getSubjectStrand( self ):
        return self.range_subject.getStrand()

    ## Return the query as a Range instance
    #
    def getQueryAsRange( self ):
        return self.range_query

    ## Return the subject as a Range instance
    #
    def getSubjectAsRange( self ):
        return self.range_subject

    ## Set the name of the query
    #
    def setQueryName( self, name ):
        self.range_query.seqname = name

    ## Set the start of the query
    #
    def setQueryStart( self, start ):
        self.range_query.start = start

    ## Set the end of the query
    #
    def setQueryEnd( self, end ):
        self.range_query.end = end

    ## Set the name of the subject
    #
    def setSubjectName( self, name ):
        self.range_subject.seqname = name

    ## Set the start of the subject
    #
    def setSubjectStart( self, start ):
        self.range_subject.start = start

    ## Set the end of the subject
    #
    def setSubjectEnd( self, end ):
        self.range_subject.end = end

    ## Merge the instance with another Align instance
    #
    # @param o an Align instance
    #
    # @note nothing is done if the query names or the subject names differ;
    #       otherwise both ranges are merged and the instance keeps the highest
    #       score, the lowest E-value and the highest identity
    #
    def merge(self, o):
        if self.range_query.seqname != o.range_query.seqname \
               or self.range_subject.seqname != o.range_subject.seqname:
            return
        self.range_query.merge(o.range_query)
        self.range_subject.merge(o.range_subject)
        self.score = max(self.score,o.score)
        self.e_value = min(self.e_value,o.e_value)
        self.identity = max(self.identity,o.identity)

    ## Return a Map instance with the subject mapped on the query
    #
    # @note the Map is named after the subject and uses the query coordinates,
    #       start and end being swapped if the subject is not on the direct strand
    #
    def getSubjectAsMapOfQuery(self):
        iMap = Map()
        iMap.name = self.range_subject.seqname
        iMap.seqname = self.range_query.seqname
        if self.range_subject.isOnDirectStrand():
            iMap.start = self.range_query.start
            iMap.end = self.range_query.end
        else:
            iMap.start = self.range_query.end
            iMap.end = self.range_query.start
        return iMap

    ## Return True if query is on direct strand
    #
    def isQueryOnDirectStrand( self ):
        return self.range_query.isOnDirectStrand()

    ## Return True if subject is on direct strand
    #
    def isSubjectOnDirectStrand( self ):
        return self.range_subject.isOnDirectStrand()

    ## Return True if query and subject are on the same strand, False otherwise
    #
    def areQrySbjOnSameStrand(self):
        return self.isQueryOnDirectStrand() == self.isSubjectOnDirectStrand()

    ## Return False if query and subject are on the same strand, True otherwise
    #
    def areQrySbjOnOppositeStrands(self):
        return not self.areQrySbjOnSameStrand()

    ## Set attributes from string
    #
    # @param string a string formatted like queryName queryStart queryEnd subjectName subjectStart subjectEnd E-value score identity
    # @param sep field separator
    #
    def setFromString(self, string, sep="\t"):
        # endswith() is safe on an empty string, unlike string[-1]
        if string.endswith("\n"):
            string = string[:-1]
        self.setFromTuple( string.split(sep) )

    ## Return a first Map instance for the query and a second for the subject
    #
    # @note both Map instances are named "repet"
    #
    def getMapsOfQueryAndSubject(self):
        iMapQuery = Map( name="repet",
                           seqname=self.range_query.seqname,
                           start=self.range_query.start,
                           end=self.range_query.end )
        iMapSubject = Map( name="repet",
                           seqname=self.range_subject.seqname,
                           start=self.range_subject.start,
                           end=self.range_subject.end )
        return iMapQuery, iMapSubject

    ## Write the subject mapped on the query (see getSubjectAsMapOfQuery) in a file
    #
    # @param fileHandler: file handler of the file being filled
    #
    def writeSubjectAsMapOfQuery( self, fileHandler ):
        m = self.getSubjectAsMapOfQuery()
        m.write( fileHandler )

    ## Return a bin for fast database access
    #
    # @note the bin is the one of the query range
    #
    def getBin(self):
        return self.range_query.getBin()

    ## Switch query and subject
    #
    # @note both ranges are then reversed if the new query is not on the direct strand
    #
    def switchQuerySubject( self ):
        self.range_query, self.range_subject = self.range_subject, self.range_query
        if not self.isQueryOnDirectStrand():
            self.reverse()

    ## Return True if the query overlaps with the query of another Align instance, False otherwise
    #
    def isQueryOverlapping( self, iAlign ):
        return self.getQueryAsRange().isOverlapping( iAlign.getQueryAsRange() )

    ## Return True if the subject overlaps with the subject of another Align instance, False otherwise
    #
    def isSubjectOverlapping( self, iAlign ):
        return self.getSubjectAsRange().isOverlapping( iAlign.getSubjectAsRange() )

    ## Return True if the Align instance overlaps with another Align instance, False otherwise
    #
    # @note both the queries and the subjects have to overlap
    #
    def isOverlapping( self, iAlign ):
        return bool( self.isQueryOverlapping( iAlign ) and self.isSubjectOverlapping( iAlign ) )

    ## Update the score
    #
    # @note the new score is the length on the query times the percentage of identity
    #
    def updateScore( self ):
        newScore = self.getLengthOnQuery() * self.getIdentity() / 100.0
        self.score = newScore
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Static methods manipulating Align instances
#
class AlignUtils( object ):

    ## Return a list with Align instances from the given file
    #
    # @param inFile name of a file in the Align format
    #
    @staticmethod
    def getAlignListFromFile( inFile ):
        lAlignInstances = []
        with open( inFile, "r" ) as inFileHandler:
            for line in inFileHandler:
                a = Align()
                a.setFromString( line )
                lAlignInstances.append( a )
        return lAlignInstances


    ## Return a list with all the scores
    #
    # @param lAlignInstances: list of Align instances
    #
    @staticmethod
    def getListOfScores( lAlignInstances ):
        return [ iAlign.score for iAlign in lAlignInstances ]


    ## Return a list with all the scores from the given file
    #
    # @param inFile name of a file in the Align format
    #
    # @note the score is read as an integer from the 8th tab-separated field; lines made of a single newline are skipped
    #
    @staticmethod
    def getScoreListFromFile(inFile):
        with open(inFile, "r") as inFileHandler:
            return [ int(line.split('\t')[7]) for line in inFileHandler if line != "\n" ]


    ## for each line of a given Align file, write the coordinates on the query and the subject as two distinct lines in a Map file
    #
    # @param alignFile: name of the input Align file
    # @param mapFile: name of the output Map file
    #
    @staticmethod
    def convertAlignFileIntoMapFileWithQueriesAndSubjects( alignFile, mapFile ):
        iAlign = Align()
        with open( alignFile, "r" ) as alignFileHandler:
            with open( mapFile, "w" ) as mapFileHandler:
                for line in alignFileHandler:
                    iAlign.setFromString( line )
                    iMapQ, iMapS = iAlign.getMapsOfQueryAndSubject()
                    iMapQ.write( mapFileHandler )
                    iMapS.write( mapFileHandler )


    ## for each line of a given Align file, write the coordinates of the subject on the query as one line in a Map file
    #
    # @param alignFile: name of the input Align file
    # @param mapFile: name of the output Map file
    #
    @staticmethod
    def convertAlignFileIntoMapFileWithSubjectsOnQueries( alignFile, mapFile ):
        iAlign = Align()
        with open( alignFile, "r" ) as alignFileHandler:
            with open( mapFile, "w" ) as mapFileHandler:
                for line in alignFileHandler:
                    iAlign.setFromString( line )
                    iMapQ = iAlign.getSubjectAsMapOfQuery()
                    iMapQ.write( mapFileHandler )


    ## return a list of Align instances sorted in decreasing order according to their score, then their length on the query and finally their initial order
    #
    # @param lAligns: list of Align instances
    #
    # @note the key negates the values instead of inverting them: inverting
    #       raises a ZeroDivisionError on a null score or length and puts
    #       negative scores first
    #
    @staticmethod
    def getAlignListSortedByDecreasingScoreThenLength( lAligns ):
        return sorted( lAligns, key=lambda iAlign: ( -float(iAlign.getScore()), -float(iAlign.getLengthOnQuery()) ) )


    ## Convert an Align file into a Path file
    #
    # @param alignFile string name of the input Align file
    # @param pathFile string name of the output Path file
    #
    # @note each output line is the input match prefixed by its rank in the file (starting at 1)
    #
    @staticmethod
    def convertAlignFileIntoPathFile( alignFile, pathFile ):
        iAlign = Align()
        with open( alignFile, "r" ) as alignFileHandler:
            with open( pathFile, "w" ) as pathFileHandler:
                for countAlign, line in enumerate( alignFileHandler, 1 ):
                    iAlign.setFromString( line, "\t" )
                    pathFileHandler.write( "%i\t%s\n" % ( countAlign, iAlign.toString() ) )


    ## Sort an Align file
    #
    # @param inFile string name of the input Align file
    # @param outFile string name of the output file (default: inFile + ".sort")
    #
    # @note the external program 'sort' is used; if it fails, a message is
    #       written on stderr and the process exits with its return code
    #
    @staticmethod
    def sortAlignFile( inFile, outFile="" ):
        # local import: the module header only provides os, sys and shutil
        import subprocess
        if outFile == "":
            outFile = "%s.sort" % ( inFile )
        prg = "sort"
        # arguments are given as a list, without any shell, so that file names
        # with spaces or shell metacharacters are handled properly
        lArgs = [ prg,
                  "-k", "1,1", "-k", "4,4",
                  "-k", "2,2n", "-k", "3,3n",
                  "-k", "5,5n", "-k", "6,6n",
                  "-k", "8,8n",
                  inFile ]
        with open( outFile, "w" ) as outFileHandler:
            try:
                # call() returns the real exit code, whereas os.system()
                # returns an encoded status (e.g. 256) that sys.exit() would
                # turn into 0, i.e. success
                exitStatus = subprocess.call( lArgs, stdout=outFileHandler )
            except OSError:
                # program not found or not executable (127 by shell convention)
                exitStatus = 127
        if exitStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % ( prg, exitStatus )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit( exitStatus )


    ## Write Align instances contained in the given list
    #
    # @param lAlign a list of Align instances
    # @param fileName name of the file to write the Align instances
    # @param mode the open mode of the file ""w"" or ""a""
    #
    @staticmethod
    def writeListInFile( lAlign, fileName, mode="w" ):
        with open( fileName, mode ) as fileHandler:
            for iAlign in lAlign:
                iAlign.write( fileHandler )


    ## Split a list of Align instances according to the name of the query
    #
    # @param lInAlign list of align instances
    # @return lOutAlignList list of align instances lists
    #
    @staticmethod
    def splitAlignListByQueryName( lInAlign ):
        lSortedAlign = sorted(lInAlign, key=lambda o: o.range_query.seqname)
        lOutAlignList = []
        if len(lSortedAlign) != 0 :
            lAlignForCurrentQuery = []
            previousQuery = lSortedAlign[0].range_query.seqname
            for align in lSortedAlign :
                currentQuery = align.range_query.seqname
                if previousQuery != currentQuery :
                    # new query name: close the current group
                    lOutAlignList.append(lAlignForCurrentQuery)
                    previousQuery = currentQuery
                    lAlignForCurrentQuery = []
                lAlignForCurrentQuery.append(align)

            lOutAlignList.append(lAlignForCurrentQuery)

        return lOutAlignList


    ## Create an Align file from each list of Align instances in the input list
    #
    # @param lAlignList list of lists with Align instances
    # @param pattern string prefix of the file names ("<pattern>_<number>.align", the number being zero-padded)
    # @param dirName string name of the output directory, created if needed (default: current directory)
    #
    @staticmethod
    def createAlignFiles( lAlignList, pattern, dirName="" ):
        nbFiles = len(lAlignList)
        if dirName != "" :
            try:
                os.makedirs(dirName)
            except OSError:
                # an already existing directory is fine, any other failure is not
                if not os.path.isdir(dirName):
                    raise

        for countFile, lAlign in enumerate(lAlignList, 1):
            fileName = "%s_%s.align" % (pattern, str(countFile).zfill(len(str(nbFiles))))
            if dirName != "" :
                # build the path rather than changing the working directory,
                # which would not be restored if the writing fails
                fileName = os.path.join(dirName, fileName)
            AlignUtils.writeListInFile(lAlign, fileName)


    ## Return a list with Align instances sorted by query name, subject name, query start, query end and score
    #
    @staticmethod
    def sortList( lAligns ):
        return sorted( lAligns, key=lambda iAlign: ( iAlign.getQueryName(),
                                                     iAlign.getSubjectName(),
                                                     iAlign.getQueryStart(),
                                                     iAlign.getQueryEnd(),
                                                     iAlign.getScore() ) )


    ## Return a list after merging all overlapping Align instances
    #
    # @note the Align instances of the input list are modified in place
    #
    @staticmethod
    def mergeList( lAligns ):
        lMerged = []

        lSorted = AlignUtils.sortList( lAligns )

        for index, iAlign in enumerate( lSorted ):
            # extend the current instance with each following overlapping one
            for iNext in lSorted[ index + 1: ]:
                if iAlign.isOverlapping( iNext ):
                    iAlign.merge( iNext )
            # merge() updates the stored instance in place, so there is no
            # need to put it back in the list
            isAlreadyInList = False
            for newAlign in lMerged:
                if newAlign.isOverlapping( iAlign ):
                    isAlreadyInList = True
                    newAlign.merge( iAlign )
            if not isAlreadyInList:
                lMerged.append( iAlign )

        return lMerged


    ## Merge all Align instance in a given Align file
    #
    # @param inFile string name of the input Align file
    # @param outFile string name of the output file (default: inFile + ".merged")
    #
    # @note the input is first sorted into the temporary file inFile + ".sorted",
    #       then the matches of each (query,subject) pair are merged and appended
    #       to the output file; no output file is created if the input is empty
    #
    @staticmethod
    def mergeFile( inFile, outFile="" ):
        if outFile == "":
            outFile = "%s.merged" % ( inFile )
        if os.path.exists( outFile ):
            os.remove( outFile )

        tmpFile = "%s.sorted" % ( inFile )
        AlignUtils.sortAlignFile( inFile, tmpFile )

        try:
            with open( tmpFile, "r" ) as tmpF:
                # the file being sorted, the matches of a pair are consecutive
                prevPairQrySbj = None
                lAlignsOfPair = []
                for line in tmpF:
                    iAlign = Align()
                    iAlign.setFromString( line )
                    # a tuple cannot be ambiguous, unlike "query_subject"
                    pairQrySbj = ( iAlign.getQueryName(), iAlign.getSubjectName() )
                    if pairQrySbj != prevPairQrySbj:
                        if lAlignsOfPair:
                            lMerged = AlignUtils.mergeList( lAlignsOfPair )
                            AlignUtils.writeListInFile( lMerged, outFile, "a" )
                        prevPairQrySbj = pairQrySbj
                        lAlignsOfPair = []
                    lAlignsOfPair.append( iAlign )
                if lAlignsOfPair:
                    lMerged = AlignUtils.mergeList( lAlignsOfPair )
                    AlignUtils.writeListInFile( lMerged, outFile, "a" )
        finally:
            os.remove( tmpFile )


    ## Update the scores of each match in the input file
    #
    # @param inFile string name of the input Align file
    # @param outFile string name of the output Align file
    #
    # @note the new score is the length on the query times the percentage of identity
    #
    @staticmethod
    def updateScoresInFile( inFile, outFile ):
        iAlign = Align()
        with open( inFile, "r" ) as inHandler:
            with open( outFile, "w" ) as outHandler:
                for line in inHandler:
                    iAlign.reset()
                    iAlign.setFromString( line, "\t" )
                    iAlign.updateScore()
                    iAlign.write( outHandler )
#
# usage: ConvCoord.py [ options ]
# options:
#      -h: this help
#      -i: input data with coordinates to convert (file or table)
#      -f: input data format (default='align'/'path')
#      -c: coordinates to convert (query, subject or both; default='q'/'s'/'qs')
#      -m: mapping of chunks on chromosomes (format='map')
#      -M: merge chunk overlaps (default=yes/no)
#      -x: convert from chromosomes to chunks (opposite by default)
#      -o: output data (file or table, same as input)
#      -C: configuration file (for database connection)
#      -v: verbosity level (default=0/1/2)


import os
import sys
import getopt
import time
from commons.core.sql.DbFactory import DbFactory
from commons.core.coord.MapUtils import MapUtils
from commons.core.sql.TableMapAdaptator import TableMapAdaptator
from commons.core.sql.TablePathAdaptator import TablePathAdaptator
from commons.core.coord.PathUtils import PathUtils
from commons.core.coord.Align import Align
from commons.core.coord.Path import Path
from commons.core.coord.Range import Range


## Class to handle coordinate conversion
#
class ConvCoord( object ):

    ## Constructor
    #
    # @param inData string input file or table with the coordinates to convert
    # @param mapData string file or table mapping the chunks on the chromosomes
    # @param outData string output file or table
    # @param configFile string configuration file for the database connection
    # @param verbosity int verbosity level
    #
    def __init__( self, inData="", mapData="", outData="", configFile="", verbosity=0):
        self._inData = inData
        self._formatInData = "align"
        self._coordToConvert = "q"
        self._mapData = mapData
        self._mergeChunkOverlaps = True
        self._convertChunks = True
        self._outData = outData
        self._configFile = configFile
        self._verbose = verbosity
        self._typeInData = "file"
        self._typeMapData = "file"
        self._tpa = None
        if self._configFile != "" and os.path.exists(self._configFile):
            self._iDb = DbFactory.createInstance(self._configFile)
        else:
            self._iDb = DbFactory.createInstance()


    ## Display the help on stdout
    #
    def help( self ):
        lLines = [ "",
                   "usage: ConvCoord.py [ options ]",
                   "options:",
                   " -h: this help",
                   " -i: input data with coordinates to convert (file or table)",
                   " -f: input data format (default='align'/'path')",
                   " -c: coordinates to convert (query, subject or both; default='q'/'s'/'qs')",
                   " -m: mapping of chunks on chromosomes (format='map')",
                   " -M: merge chunk overlaps (default=yes/no)",
                   " -x: convert from chromosomes to chunks (opposite by default)",
                   " -o: output data (file or table, same as input)",
                   " -C: configuration file (for database connection)",
                   " -v: verbosity level (default=0/1/2)",
                   "" ]
        for line in lLines:
            print( line )


    ## Set the attributes from the command-line
    #
    # Exits with status 1 on an unknown option and with status 0 after '-h'.
    #
    def setAttributesFromCmdLine( self ):
        try:
            opts, args = getopt.getopt(sys.argv[1:],"hi:f:c:m:M:xo:C:v:")
        except getopt.GetoptError as err:
            sys.stderr.write( "%s\n" % ( str(err) ) )
            self.help(); sys.exit(1)
        for o,a in opts:
            if o == "-h":
                self.help(); sys.exit(0)
            elif o == "-i":
                self.setInputData( a )
            elif o == "-f":
                self.setInputFormat( a )
            elif o == "-c":
                self.setCoordinatesToConvert( a )
            elif o == "-m":
                self.setMapData( a )
            elif o == "-M":
                self.setMergeChunkOverlaps( a )
            elif o == "-x":
                # documented in the help but previously ignored: request chromosomes -> chunks
                self._convertChunks = False
            elif o == "-o":
                self.setOutputData( a )
            elif o == "-C":
                self.setConfigFile( a )
            elif o == "-v":
                self.setVerbosityLevel( a )


    def setInputData( self, inData ):
        self._inData = inData

    def setInputFormat( self, formatInData ):
        self._formatInData = formatInData

    def setCoordinatesToConvert( self, coordToConvert ):
        self._coordToConvert = coordToConvert

    def setMapData( self, mapData ):
        self._mapData = mapData

    ## Enable the merge of chunk overlaps only when given the string "yes"
    #
    def setMergeChunkOverlaps( self, mergeChunkOverlaps ):
        self._mergeChunkOverlaps = ( mergeChunkOverlaps == "yes" )

    def setOutputData( self, outData ):
        self._outData = outData

    def setConfigFile( self, configFile ):
        self._configFile = configFile

    def setVerbosityLevel( self, verbose ):
        self._verbose = int(verbose)


    ## Write an error message on stderr, display the help and exit with status 1
    #
    def _abort( self, msg ):
        sys.stderr.write( "%s\n" % ( msg ) )
        self.help()
        sys.exit(1)


    ## Write a message on stdout when the verbosity is higher than 1
    #
    def _trace( self, msg ):
        if self._verbose > 1:
            sys.stdout.write( "%s\n" % ( msg ) )
            sys.stdout.flush()


    ## Check the attributes are valid before running the algorithm
    #
    # Also (re)creates the database handle, detects whether the input and the mapping
    # are files or tables, and sets a default output name when none was given.
    # Exits with status 1 on the first invalid attribute.
    #
    def checkAttributes( self ):
        if self._inData == "":
            self._abort( "ERROR: missing input data (-i)" )
        if self._formatInData not in ["align","path"]:
            self._abort( "ERROR: unrecognized format '%s' (-f)" % ( self._formatInData ) )
        if self._configFile == "":
            self._iDb = DbFactory.createInstance()
        elif not os.path.exists( self._configFile ):
            self._abort( "ERROR: configuration file '%s' doesn't exist" % ( self._configFile ) )
        else:
            self._iDb = DbFactory.createInstance(self._configFile)
        if not os.path.exists( self._inData ) and not self._iDb.doesTableExist( self._inData ):
            self._abort( "ERROR: input data '%s' doesn't exist" % ( self._inData ) )
        # a file takes precedence over a table having the same name
        if os.path.exists( self._inData ):
            self._typeInData = "file"
        elif self._iDb.doesTableExist( self._inData ):
            self._typeInData = "table"
        if self._coordToConvert == "":
            self._abort( "ERROR: missing coordinates to convert (-c)" )
        if self._coordToConvert not in [ "q", "s", "qs" ]:
            self._abort( "ERROR: unrecognized coordinates to convert '%s' (-c)" % ( self._coordToConvert ) )
        if self._mapData == "":
            self._abort( "ERROR: missing mapping coordinates of chunks on chromosomes (-m)" )
        if not os.path.exists( self._mapData ) and not self._iDb.doesTableExist( self._mapData ):
            self._abort( "ERROR: mapping data '%s' doesn't exist" % ( self._mapData ) )
        if os.path.exists( self._mapData ):
            self._typeMapData = "file"
        elif self._iDb.doesTableExist( self._mapData ):
            self._typeMapData = "table"
        if self._outData == "":
            if self._convertChunks:
                self._outData = "%s.onChr" % ( self._inData )
            else:
                self._outData = "%s.onChk" % ( self._inData )
            if self._typeInData == "table":
                # dots are replaced so that the default name can be used as a table name
                self._outData = self._outData.replace(".","_")


    ## Return a dictionary with the mapping of the chunks on the chromosomes
    #
    # @return dictionary whose keys are chunk names and values the corresponding Map instances
    #
    def getChunkCoordsOnChromosomes( self ):
        if self._typeMapData == "file":
            dChunks2CoordMaps = MapUtils.getDictPerNameFromMapFile( self._mapData )
        elif self._typeMapData == "table":
            tma = TableMapAdaptator( self._iDb, self._mapData )
            dChunks2CoordMaps = tma.getDictPerName()
        if self._verbose > 1:
            msg = "nb of chunks: %i" % ( len(dChunks2CoordMaps) )
            sys.stdout.write( "%s\n" % ( msg ) )
        return dChunks2CoordMaps


    ## Return a Range on the chromosome from a Range on a chunk
    #
    # @param chkRange Range instance whose seqname is a chunk name
    # @param dChunks2CoordMaps dictionary from chunk names to Map instances
    # @return new Range instance in chromosome coordinates
    # @note raises KeyError if the chunk is absent from the mapping
    #
    def getRangeOnChromosome( self, chkRange, dChunks2CoordMaps ):
        iChunkMap = dChunks2CoordMaps[ chkRange.seqname ]
        chrRange = Range()
        chrRange.seqname = iChunkMap.seqname
        # 1-based shift; a chunk starting at 1 keeps its coordinates unchanged
        chrRange.start = iChunkMap.start + chkRange.start - 1
        chrRange.end = iChunkMap.start + chkRange.end - 1
        return chrRange


    def convCoordsChkToChrFromAlignFile( self, inFile, dChunks2CoordMaps ):
        return self.convCoordsChkToChrFromAlignOrPathFile( inFile, dChunks2CoordMaps, "align" )


    def convCoordsChkToChrFromPathFile( self, inFile, dChunks2CoordMaps ):
        return self.convCoordsChkToChrFromAlignOrPathFile( inFile, dChunks2CoordMaps, "path" )



    ## Convert coordinates of a Path or Align file from chunks to chromosomes
    #
    # @param inFile string name of the input file
    # @param dChunks2CoordMaps dictionary from chunk names to Map instances
    # @param format string "align", anything else being handled as "path"
    # @return string name of the converted file ("<inFile>.tmp")
    #
    def convCoordsChkToChrFromAlignOrPathFile( self, inFile, dChunks2CoordMaps, format ):
        self._trace( "start method 'convCoordsChkToChrFromAlignOrPathFile'" )
        outFile = "%s.tmp" % ( inFile )
        if format == "align":
            iObject = Align()
        else:
            iObject = Path()
        convertQuery = self._coordToConvert in [ "q", "qs" ]
        convertSubject = self._coordToConvert in [ "s", "qs" ]

        inFileHandler = open( inFile, "r" )
        try:
            outFileHandler = open( outFile, "w" )
            try:
                for line in inFileHandler:
                    iObject.setFromString( line )
                    if convertQuery:
                        iObject.range_query = self.getRangeOnChromosome( iObject.range_query, dChunks2CoordMaps )
                    if convertSubject:
                        iObject.range_subject = self.getRangeOnChromosome( iObject.range_subject, dChunks2CoordMaps )
                    iObject.write( outFileHandler )
                    iObject.reset()
            finally:
                outFileHandler.close()
        finally:
            inFileHandler.close()

        self._trace( "end method 'convCoordsChkToChrFromAlignOrPathFile'" )
        return outFile

    ## Convert coordinates of a file from chunks to chromosomes
    #
    # @param inFile string name of the input file
    # @param format string "align" or "path"
    # @param dChunks2CoordMaps dictionary from chunk names to Map instances
    # @return string name of the temporary Path table holding the converted data
    #
    def convCoordsChkToChrFromFile( self, inFile, format, dChunks2CoordMaps ):
        self._trace( "start convCoordsChkToChrFromFile" )
        if format == "align":
            tmpAlignFile = self.convCoordsChkToChrFromAlignFile( inFile, dChunks2CoordMaps )
            tmpAlignTable = os.path.basename(tmpAlignFile.replace(".","_").replace("-","_"))
            self._iDb.createTable( tmpAlignTable, "align", tmpAlignFile, True)
            os.remove( tmpAlignFile )
            self._iDb.removeDoublons( tmpAlignTable )
            outTable = "%s_path" % ( tmpAlignTable )
            self._iDb.convertAlignTableIntoPathTable( tmpAlignTable, outTable )
            self._iDb.dropTable( tmpAlignTable )
        elif format == "path":
            tmpPathFile = self.convCoordsChkToChrFromPathFile( inFile, dChunks2CoordMaps )
            # basename as in the 'align' branch: a directory part cannot belong to a table name
            outTable = os.path.basename(tmpPathFile.replace(".","_").replace("-","_"))
            self._iDb.createTable( outTable, "path", tmpPathFile, True)
            os.remove( tmpPathFile )
        self._trace( "end convCoordsChkToChrFromFile" )
        return outTable


    ## Convert coordinates of a table from chunks to chromosomes
    #
    # The table is exported into a temporary file bearing the table name, which is
    # converted like any input file and then removed.
    #
    def convCoordsChkToChrFromTable( self, inTable, format, dChunks2CoordMaps ):
        tmpFile = inTable
        self._iDb.exportDataToFile( inTable, tmpFile, False )
        outTable = self.convCoordsChkToChrFromFile( tmpFile, format, dChunks2CoordMaps )
        os.remove( tmpFile )
        return outTable


    ## Split a list of Path instances according to their strands
    #
    # @return tuple (paths with query and subject on the direct strand, all the other paths)
    #
    def getListsDirectAndReversePaths( self, lPaths ):
        lDirectPaths = []
        lReversePaths = []
        for iPath in lPaths:
            if iPath.isQueryOnDirectStrand() and iPath.isSubjectOnDirectStrand():
                lDirectPaths.append( iPath )
            else:
                lReversePaths.append( iPath )
        return lDirectPaths, lReversePaths


    ## Merge in place the consecutive Path instances of a sorted list when they can be merged
    #
    # @param lPaths list of Path instances (modified: merged instances are removed)
    # @param lIdsToInsert list filled with the identifiers to (re)insert
    # @param lIdsToDelete list filled with the identifiers absorbed by a merge
    # @param dOldIdToNewId dictionary filled with absorbed identifier -> surviving identifier
    #
    def mergePaths( self, lPaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId ):
        if len(lPaths) == 0:
            # nothing to merge (previously an IndexError)
            return
        if len(lPaths) < 2:
            lIdsToInsert.append( lPaths[0].id )
            return
        i = 0
        while i < len(lPaths) - 1:
            i += 1
            if self._verbose > 1 and i==1 :
                print( lPaths[i-1] )
            if self._verbose > 1:
                print( lPaths[i] )
                sys.stdout.flush()
            idPrev = lPaths[i-1].id
            idNext = lPaths[i].id
            if lPaths[i-1].canMerge( lPaths[i] ):
                dOldIdToNewId[ idNext ] = idPrev
                if idPrev not in lIdsToInsert:
                    lIdsToInsert.append( idPrev )
                if idNext not in lIdsToDelete:
                    lIdsToDelete.append( idNext )
                lPaths[i-1].merge( lPaths[i] )
                del lPaths[i]
                # stay on the merged instance to compare it with the next one
                i -= 1


    ## Insert in the Path table the instances whose (possibly renamed) identifier has to be inserted
    #
    def insertPaths( self, lPaths, lIdsToInsert, dOldIdToNewId ):
        for iPath in lPaths:
            if iPath.id in dOldIdToNewId:
                iPath.id = dOldIdToNewId[ iPath.id ]
            if iPath.id in lIdsToInsert:
                self._tpa.insert( iPath )


    ## Merge Path instances in a Path table when they correspond to chunk overlaps
    #
    # @param dChunks2CoordMaps dictionary from chunk names to Map instances
    # @param tmpPathTable string name of the Path table to update
    # @note chunks are looked up as "chunk<N>", N being zero-padded to the width of the number of chunks
    #
    def mergeCoordsOnChunkOverlaps( self, dChunks2CoordMaps, tmpPathTable ):
        self._trace( "start method 'mergeCoordsOnChunkOverlaps'" )
        self._tpa = TablePathAdaptator( self._iDb, tmpPathTable )
        nbChunks = len(dChunks2CoordMaps)
        width = len(str(nbChunks))
        for numChunk in range(1,nbChunks):
            chunkName1 = "chunk%s" % ( str(numChunk).zfill( width ) )
            chunkName2 = "chunk%s" % ( str(numChunk+1).zfill( width ) )
            if chunkName2 not in dChunks2CoordMaps:
                break
            self._trace( "try merge on '%s' and '%s'" % ( chunkName1, chunkName2 ) )
            chrName = dChunks2CoordMaps[ chunkName1 ].seqname
            if dChunks2CoordMaps[ chunkName2 ].seqname != chrName:
                self._trace( "not on same chromosome (%s != %s)" % ( dChunks2CoordMaps[ chunkName2 ].seqname, chrName ) )
                continue
            # region between the end of the first chunk and the start of the second one
            minCoord = min( dChunks2CoordMaps[ chunkName1 ].end, dChunks2CoordMaps[ chunkName2 ].start )
            maxCoord = max( dChunks2CoordMaps[ chunkName1 ].end, dChunks2CoordMaps[ chunkName2 ].start )
            lPaths = self._tpa.getChainListOverlappingQueryCoord( chrName, minCoord, maxCoord )
            if len(lPaths) == 0:
                self._trace( "no overlapping matches on %s (%i->%i)" % ( chrName, minCoord, maxCoord ) )
                continue
            self._trace( "%i overlapping matche(s) on %s (%i->%i)" % ( len(lPaths), chrName, minCoord, maxCoord ) )
            lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
            lDirectPaths, lReversePaths = self.getListsDirectAndReversePaths( lSortedPaths )
            lIdsToInsert = []
            lIdsToDelete = []
            dOldIdToNewId = {}
            if len(lDirectPaths) > 0:
                self.mergePaths( lDirectPaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId )
            if len(lReversePaths) > 0:
                self.mergePaths( lReversePaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId )
            # both the absorbed and the surviving identifiers are removed, then the survivors are inserted back
            self._tpa.deleteFromIdList( lIdsToDelete )
            self._tpa.deleteFromIdList( lIdsToInsert )
            self.insertPaths( lDirectPaths, lIdsToInsert, dOldIdToNewId )
            self.insertPaths( lReversePaths, lIdsToInsert, dOldIdToNewId )
        self._trace( "end method 'mergeCoordsOnChunkOverlaps'" )


    ## Export the temporary Path table into the output file, in the format of the input
    #
    def saveChrCoordsAsFile( self, tmpPathTable, outFile ):
        # the export file bears the name of the table
        self._iDb.exportDataToFile( tmpPathTable, tmpPathTable, False )
        self._iDb.dropTable( tmpPathTable )
        if self._formatInData == "align":
            PathUtils.convertPathFileIntoAlignFile( tmpPathTable, outFile )
            os.remove( tmpPathTable )
        elif self._formatInData == "path":
            os.rename( tmpPathTable, outFile )


    ## Save the temporary Path table as the output table, in the format of the input
    #
    def saveChrCoordsAsTable( self, tmpPathTable, outTable ):
        if self._formatInData == "align":
            self._iDb.convertPathTableIntoAlignTable( tmpPathTable, outTable )
            self._iDb.dropTable( tmpPathTable )
        elif self._formatInData == "path":
            self._iDb.renameTable( tmpPathTable, outTable )


    ## Convert coordinates from chunks to chromosomes
    #
    def convertCoordinatesFromChunksToChromosomes( self ):
        dChunks2CoordMaps = self.getChunkCoordsOnChromosomes()

        if self._typeInData == "file":
            tmpPathTable = self.convCoordsChkToChrFromFile( self._inData, self._formatInData, dChunks2CoordMaps )
        elif self._typeInData == "table":
            tmpPathTable = self.convCoordsChkToChrFromTable( self._inData, self._formatInData, dChunks2CoordMaps )

        if self._mergeChunkOverlaps:
            self.mergeCoordsOnChunkOverlaps( dChunks2CoordMaps, tmpPathTable )

        if self._typeInData == "file":
            self.saveChrCoordsAsFile( tmpPathTable, self._outData )
        elif self._typeInData == "table":
            self.saveChrCoordsAsTable( tmpPathTable, self._outData )


    ## Convert coordinates from chromosomes to chunks
    #
    # Not implemented: writes an error on stderr and exits with status 1.
    #
    def convertCoordinatesFromChromosomesToChunks( self ):
        msg = "ERROR: convert coordinates from chromosomes to chunks not yet available"
        sys.stderr.write( "%s\n" % ( msg ) )
        sys.exit(1)


    ## Useful commands before running the program
    #
    # Checks the attributes, then writes the START banner (and the parameters when verbosity > 1).
    #
    def start( self ):
        self.checkAttributes()
        msg = "START ConvCoord.py (%s)\n" % ( time.strftime("%m/%d/%Y %H:%M:%S") )
        if self._verbose > 1:
            msg += "input data: %s" % ( self._inData )
            if self._typeInData == "file":
                msg += " (file)\n"
            else:
                msg += " (table)\n"
            msg += "format: %s\n" % ( self._formatInData )
            msg += "coordinates to convert: %s\n" % ( self._coordToConvert )
            msg += "mapping data: %s" % ( self._mapData )
            if self._typeMapData == "file":
                msg += " (file)\n"
            else:
                msg += " (table)\n"
            if self._mergeChunkOverlaps:
                msg += "merge chunk overlaps\n"
            else:
                msg += "don't merge chunk overlaps\n"
            if self._convertChunks:
                msg += "convert chunks to chromosomes\n"
            else:
                msg += "convert chromosomes to chunks\n"
            msg += "output data: %s" % ( self._outData )
            if self._typeInData == "file":
                msg += " (file)\n"
            else:
                msg += " (table)\n"
        sys.stdout.write( msg )


    ## Useful commands before ending the program
    #
    def end( self ):
        self._iDb.close()
        msg = "END ConvCoord.py (%s)" % ( time.strftime("%m/%d/%Y %H:%M:%S") )
        sys.stdout.write( "%s\n" % ( msg ) )

    ## Run the program
    #
    def run( self ):
        self.start()

        if self._convertChunks:
            self.convertCoordinatesFromChunksToChromosomes()
        else:
            self.convertCoordinatesFromChromosomesToChunks()

        self.end()


if __name__ == "__main__":
    i = ConvCoord()
    i.setAttributesFromCmdLine()
    i.run()

# ---- patch metadata (kept verbatim, commented out) ----
# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/CountOverlapping.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/CountOverlapping.py Wed Jul 20 08:18:51 2016 -0400
# @@ -0,0 +1,96 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and,  more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import bisect
from commons.core.checker.RepetException import RepetException
from commons.core.LoggerFactory import LoggerFactory

LOG_DEPTH = "commons.coord"
class CountOverlapping(object):

    ## lFeatures must be a list of objects implementing getStart, getEnd, getSeqname methods.
    #  If areFeaturesOnDirectStrandsOnly is set to False, isOnReverseStrand and reverse methods must be implemented too.
    #  Throws a RepetException if all the features in lFeatures don't share the same getSeqname() result
    #
    #  This implementation may not be very efficient but it works
    #
    # @param lFeatures list of features to count overlaps with
    # @param areFeaturesOnDirectStrandsOnly boolean True if no feature needs to be reversed
    # @param verbosity int verbosity level given to the logger
    #
    def __init__(self, lFeatures, areFeaturesOnDirectStrandsOnly = False, verbosity = 2):
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

        self._areFeaturesOnDirectStrandsOnly = areFeaturesOnDirectStrandsOnly
        self._lFeaturesToCheck = lFeatures
        self._prepareData()

    ## Check all the features share one sequence name and put them on the direct strand
    #
    # @note the features on the reverse strand are reversed in place
    #
    def _prepareData(self):
        self._nbFeatures = len(self._lFeaturesToCheck)
        sNames = set(iFeature.getSeqname() for iFeature in self._lFeaturesToCheck)

        if len(sNames) > 1:
            self._logAndRaise("ERROR: different sequence names in input features list")

        if not self._areFeaturesOnDirectStrandsOnly:
            for iFeature in self._lFeaturesToCheck:
                if iFeature.isOnReverseStrand():
                    iFeature.reverse()
            self._areFeaturesOnDirectStrandsOnly = True

    ## Log an error message then raise a RepetException carrying the same message
    #
    def _logAndRaise(self, errorMsg):
        self._log.error(errorMsg)
        raise RepetException(errorMsg)

    ## Count number of features overlapping with a given interval
    #
    # @param queryInterval feature to check overlaps number with (must implement getStart, getEnd, getSeqname, isOnReverseStrand and reverse methods)
    # @return int number of input features overlapping with queryInterval
    #             (0 if the feature list is empty or if the sequence names differ)
    # @note queryInterval is reversed in place if it is on the reverse strand
    #
    def count(self, queryInterval):
        if queryInterval.isOnReverseStrand():
            queryInterval.reverse()
        if self._nbFeatures == 0:
            self._log.warning("WARNING: empty feature list. Will return 0 overlap.")
            return 0

        featuresName = self._lFeaturesToCheck[0].getSeqname()
        queryName = queryInterval.getSeqname()
        if featuresName != queryName:
            self._log.warning("WARNING: different sequence names between feature '%s' and queryInterval '%s'. Will return 0 overlap." % (featuresName, queryName))
            # the announced result was never returned before: coordinates were still compared
            return 0

        lOrderedStart = sorted(iFeature.getStart() for iFeature in self._lFeaturesToCheck)
        lOrderedEnd = sorted(iFeature.getEnd() for iFeature in self._lFeaturesToCheck)

        # features starting at or before the end of the query ...
        first = bisect.bisect_right(lOrderedStart, queryInterval.getEnd())
        # ... minus those ending at or before the start of the query
        # NOTE(review): a feature ending exactly on the query start is counted as non-overlapping - confirm this is intended
        last = bisect.bisect_right(lOrderedEnd, queryInterval.getStart())
        return first - last

# ---- patch metadata (kept verbatim, commented out) ----
# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/Map.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/Map.py Wed Jul 20 08:18:51 2016 -0400
# @@ -0,0 +1,168 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge.
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and,  more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


from commons.core.coord.Range import Range


## Record a named region on a given sequence
#
class Map( Range ):

    # one-element tuple: a bare string in parentheses is not a tuple
    __slots__ = ("name",)

    ## Constructor
    #
    # @param name the name of the region
    # @param seqname the name of the sequence
    # @param start the start coordinate
    # @param end the end coordinate
    #
    def __init__(self, name="", seqname="", start=-1, end=-1):
        self.name = name
        Range.__init__( self, seqname, start, end )

    ## Equal operator
    #
    # Two instances are equal if they have the same type, the same name and equal Range parts.
    #
    # @param o a Map instance
    #
    def __eq__(self, o):
        if type(o) is not type(self):
            return False
        if self.name != o.name:
            return False
        return Range.__eq__(self, o)

    ## Not equal operator
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## Return name
    #
    def getName( self ):
        return self.name

    ## Set attributes from tuple
    #
    # @param tuple: a tuple with (name,seqname,start,end)
    #
    def setFromTuple(self, tuple):
        self.name = tuple[0]
        Range.setFromTuple(self, tuple[1:])

    ## Set attributes from string
    #
    # @param string a string formatted like name<sep>seqname<sep>start<sep>end
    # @param sep field separator
    #
    def setFromString(self, string, sep="\t"):
        # drop the end-of-line only (the result of strip() used to be discarded);
        # leading separators are kept since an empty name is a valid first field
        string = string.rstrip("\r\n")
        self.setFromTuple(tuple(string.split(sep)))

    ## Reset
    #
    def reset(self):
        self.setFromTuple(("", "", -1, -1))

    ## Read attributes from a Map file
    #
    # @param fileHandler: file handler of the file being read
    # @return: 1 on success, 0 at the end of the file or if the line has less than 4 tab-separated fields
    #
    def read(self, fileHandler):
        self.reset()
        line = fileHandler.readline()
        if line == "":
            return 0
        tokens = line.split("\t")
        if len(tokens) < 4:
            return 0
        self.setFromTuple(tuple(tokens))
        return 1

    ## Return the attributes as a formatted string
    #
    def toString(self):
        return "%s\t%s" % (self.name, Range.toString(self))

    ## Write attributes into a Map file
    #
    # @param fileHandler: file handler of the file being filled
    #
    def write(self, fileHandler):
        fileHandler.write("%s\n" % (self.toString()))

    ## Save attributes into a Map file
    #
    # @param file: name of the file being filled (opened in append mode)
    #
    def save(self, file):
        fileHandler = open( file, "a" )
        try:
            self.write( fileHandler )
        finally:
            fileHandler.close()

    ## Return a Range instance with the attributes
    #
    def getRange(self):
        return Range( self.seqname, self.start, self.end)

    ## Remove in the instance the region overlapping with another Map instance
    #
    # @param o a Map instance
    # @return a Map instance built from the Range returned by Range.diff (default Map if that Range is empty)
    #
    def diff(self, o):
        iRange = Range.diff(self, o.getRange())
        new = Map()
        if not iRange.isEmpty():
            new.name = self.name
            new.seqname = self.seqname
            new.start = iRange.start
            new.end = iRange.end
        return new

    ## Write attributes in a Path file, the name being the subject and the rest the Range query
    #
    # @param fileHandler: file handler of a Path file
    #
    def writeAsQueryOfPath(self, fileHandler):
        lFields = [ "0",
                    "%s" % ( self.seqname ),
                    "%i" % ( self.getMin() ),
                    "%i" % ( self.getMax() ),
                    "%s" % ( self.name ),
                    "0",
                    "0",
                    "0.0",
                    "0",
                    "0" ]
        fileHandler.write( "%s\n" % ( "\t".join( lFields ) ) )

# ---- patch metadata (kept verbatim, commented out) ----
# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/MapUtils.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/MapUtils.py Wed Jul 20 08:18:51 2016 -0400
# @@ -0,0 +1,223 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
#
http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 

import os
import sys
from commons.core.coord.Map import Map
from commons.core.coord.Set import Set
from commons.core.checker.CheckerUtils import CheckerUtils


## static methods manipulating Map instances
#
class MapUtils( object ):

    ## Return a Map list from a Map file
    #
    # @param mapFile string name of a Map file
    # @return a list of Map instances
    #
    @staticmethod
    def getMapListFromFile(mapFile):
        lMaps = []
        with open(mapFile, "r") as mapF:
            for line in mapF:
                iMap = Map()
                iMap.setFromString(line)
                lMaps.append(iMap)
        return lMaps


    ## Return a list of Map instances sorted in increasing order according to the min, then the max, and finally their initial order
    #
    # @param lMaps list of Map instances
    #
    @staticmethod
    def getMapListSortedByIncreasingMinThenMax(lMaps):
        return sorted(lMaps, key = lambda iMap: (iMap.getMin(), iMap.getMax()))


    ## Return a list of Map instances sorted in increasing order according to the name, then the seqname, then the min, then the max
    #
    # @param lMaps list of Map instances
    #
    @staticmethod
    def getMapListSortedByIncreasingNameThenSeqnameThenMinThenMax(lMaps):
        return sorted(lMaps, key = lambda iMap: (iMap.getName(), iMap.getSeqname(), iMap.getMin(), iMap.getMax()))


    ## Return a dictionary which keys are Map names and values the corresponding Map instances
    #
    # Exits with status 1 if two different Map instances share the same name.
    #
    # @param mapFile string input map file name
    #
    @staticmethod
    def getDictPerNameFromMapFile(mapFile):
        dName2Maps = {}
        with open(mapFile) as mapFileHandler:
            for line in mapFileHandler:
                iMap = Map()
                iMap.setFromString(line, "\t")
                if iMap.name not in dName2Maps:
                    dName2Maps[iMap.name] = iMap
                elif iMap != dName2Maps[iMap.name]:
                    # an exact duplicate is tolerated, a conflicting definition is not
                    msg = "ERROR: in file '%s' two different Map instances have the same name '%s'" % (mapFile, iMap.name)
                    sys.stderr.write("%s\n" % msg)
                    sys.exit(1)
        return dName2Maps


    ## Give a list of Set instances from a list of Map instances
    #
    # @param lMaps list of Map instances
    # @return lSets list of Set instances (identifiers start at 1 and follow the input order)
    #
    @staticmethod
    def mapList2SetList(lMaps):
        lSets = []
        for index, iMap in enumerate(lMaps):
            iSet = Set()
            iSet.id = index + 1
            iSet.name = iMap.getName()
            iSet.seqname = iMap.getSeqname()
            iSet.start = iMap.getStart()
            iSet.end = iMap.getEnd()
            lSets.append(iSet)
        return lSets


    ## Merge the Map instances in a Map file using 'mapOp'
    #
    # @param mapFile string input map file name
    # @param outFile string output file name (receives "<mapFile>.merge")
    # @note only a warning is written, and no output produced, if 'mapOp' is not in the user path
    #
    @staticmethod
    def mergeCoordsInFile(mapFile, outFile):
        if not CheckerUtils.isExecutableInUserPath("mapOp"):
            msg = "WARNING: can't find executable 'mapOp'"
            sys.stderr.write("%s\n" % msg)
        else:
            # NOTE(review): the file name is inserted unquoted in a shell command, and
            # "2>&1 > /dev/null" discards stdout only (stderr still reaches the console) -
            # confirm both are intended before changing the command
            cmd = "mapOp"
            cmd += " -q %s" % mapFile
            cmd += " -m"
            cmd += " 2>&1 > /dev/null"
            returnStatus = os.system(cmd)
            if returnStatus != 0:
                print("ERROR: mapOp returned %i" % returnStatus)
                sys.exit(1)
            os.rename("%s.merge" % mapFile, outFile)


    ## Return a dictionary which keys are Map seqnames and values the corresponding Map instances
    #
    # @param mapFile string input map file name
    # @return dictionary from each seqname to the list of its Map instances, in file order
    #
    @staticmethod
    def getDictPerSeqNameFromMapFile(mapFile):
        dSeqName2Maps = {}
        with open(mapFile) as mapFileHandler:
            for line in mapFileHandler:
                iMap = Map()
                iMap.setFromString(line, "\t")
                dSeqName2Maps.setdefault(iMap.seqname, []).append(iMap)
        return dSeqName2Maps


    ## Convert an Map file into a Set file
    #
    # @param mapFileName string input map file name
    # @param setFileName string output set file name (default = "<mapFileName>.set")
    #
    @staticmethod
    def convertMapFileIntoSetFile(mapFileName, setFileName = ""):
        if not setFileName:
            setFileName = "%s.set" % mapFileName
        setFileHandler = open(setFileName, "w")
        try:
            iMap = Map()
            with open(mapFileName, "r") as mapFileHandler:
                for index, line in enumerate(mapFileHandler):
                    iMap.setFromString(line)
                    iSet = Set()
                    iSet.id = index + 1
                    iSet.name = iMap.getName()
                    iSet.seqname = iMap.getSeqname()
                    iSet.start = iMap.getStart()
                    iSet.end = iMap.getEnd()
                    iSet.write(setFileHandler)
        finally:
            setFileHandler.close()


    ## Write Map instances contained in the given list
    #
    # @param lMaps list of Map instances
    # @param fileName a file name
    # @param mode the open mode of the file '"w"' or '"a"'
    #
    @staticmethod
    def writeListInFile(lMaps, fileName, mode = "w"):
        fileHandler = open(fileName, mode)
        try:
            for iMap in lMaps:
                iMap.write(fileHandler)
        finally:
            fileHandler.close()


    ## Get the length of the shorter annotation in map file
    #
    # @param mapFileName string input map file name
    # @return int smallest value of end - start + 1
    # @note raises ValueError if the file has no line
    #
    @staticmethod
    def getMinLengthOfMapFile(mapFileName):
        lSizes = []
        with open(mapFileName) as fH:
            for line in fH:
                lFields = line.split('\t')
                lSizes.append(int(lFields[3]) - int(lFields[2]) + 1)
        return min(lSizes)


    ## Get the length of the longest annotation in map file
    #
    # @param mapFileName string input map file name
    # @return int largest value of end - start + 1 (0 if the file has no line)
    #
    @staticmethod
    def getMaxLengthOfMapFile(mapFileName):
        maxLength = 0
        with open(mapFileName) as fH:
            for line in fH:
                lFields = line.split('\t')
                maxLength = max(maxLength, int(lFields[3]) - int(lFields[2]) + 1)
        return maxLength

# ---- patch metadata (kept verbatim, commented out) ----
# \ No newline at end of file
# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/Match.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/Match.py Wed Jul 20 08:18:51 2016 -0400
# @@ -0,0 +1,213 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Handle a chain of match(es) between two sequences, query and subject, with an identifier and the length of the input sequences
#
class Match( Path ):

    __slots__ = ("query_length", "query_length_perc", "query_seqlength", "match_length_perc", "subject_length", "subject_length_perc", "subject_seqlength")

    ## Constructor
    #
    # All the attributes specific to Match are initialized to -1.
    #
    def __init__(self):
        Path.__init__(self)
        self.query_length = -1
        self.query_length_perc = -1    # length of the match on the query / length of the query
        self.query_seqlength = -1
        self.match_length_perc = -1    # length of the match on the query / total length of the subject
        self.subject_length = -1
        self.subject_length_perc = -1  # length of the match on the subject / length of the subject
        self.subject_seqlength = -1

    ## Equal operator
    #
    # @param o the object to compare with
    # @return True if 'o' has the same type, the same Match attributes and is equal as a Path
    #
    def __eq__(self, o):
        if type(o) is not type(self):
            return False
        lAttributes = ("query_length", "query_length_perc", "query_seqlength",
                       "subject_length", "subject_length_perc", "subject_seqlength",
                       "match_length_perc")
        for attribute in lAttributes:
            if getattr(self, attribute) != getattr(o, attribute):
                return False
        return Path.__eq__(self, o)

    ## Not equal operator
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## Return the length of the match on the query divided by the total length of the query
    #
    def getLengthPercOnQuery(self):
        return self.query_length_perc

    ## Return the length of the match on the subject divided by the total length of the subject
    #
    def getLengthPercOnSubject(self):
        return self.subject_length_perc

    ## Return the length of the match on the subject
    #
    def getLengthMatchOnSubject(self):
        return self.subject_length

    ## Deduce the length of a sequence from the length of a match and the fraction of the sequence it covers
    #
    # @param matchLength integer length of the match
    # @param lengthPerc float matchLength / sequence length
    # @return the sequence length, or -1 (the 'unset' value of the constructor) when lengthPerc is 0
    #
    @staticmethod
    def _getSeqLength( matchLength, lengthPerc ):
        if lengthPerc == 0:
            # a very small ratio can be written as 0.000000 in the input
            return -1
        return int( matchLength / lengthPerc )

    ## Set attributes from a tuple
    #
    # The query coordinates are stored in increasing order; when they are given
    # in decreasing order, the subject coordinates are swapped as well.
    #
    # @param tuple: a tuple with (query name,query start,query end,
    #  query length, query length perc (between 0-1), match length perc (between 0-1), subject name,
    #  subject start,subject end,subject length, subject length percentage (between 0-1), e_value,score,identity,id)
    #
    def setFromTuple( self, tuple ):
        lFields = tuple
        queryStart = int(lFields[1])
        queryEnd = int(lFields[2])
        subjectStart = int(lFields[7])
        subjectEnd = int(lFields[8])
        if queryStart < queryEnd:
            self.range_query = Range(lFields[0],queryStart,queryEnd)
            self.range_subject = Range(lFields[6],subjectStart,subjectEnd)
        else:
            self.range_query = Range(lFields[0],queryEnd,queryStart)
            self.range_subject = Range(lFields[6],subjectEnd,subjectStart)
        self.query_length = int(lFields[3])
        self.query_length_perc = float(lFields[4])
        self.query_seqlength = Match._getSeqLength( self.query_length, self.query_length_perc )
        self.match_length_perc = float(lFields[5])
        self.subject_length = int(lFields[9])
        self.subject_length_perc = float(lFields[10])
        self.subject_seqlength = Match._getSeqLength( self.subject_length, self.subject_length_perc )
        self.e_value = float(lFields[11])
        self.score = float(lFields[12])
        self.identity = float(lFields[13])
        self.id = int(lFields[14])

    ## Reset
    #
    # Reset the Path part, then set the Match attributes back to -1.
    #
    def reset( self ):
        Path.reset( self )
        self.query_length = -1
        self.query_length_perc = -1
        self.query_seqlength = -1
        self.match_length_perc = -1
        self.subject_length = -1
        self.subject_length_perc = -1
        self.subject_seqlength = -1

    ## Return a formated string of the attribute data
    #
    # @return the tab-separated fields, without end of line
    #
    def toString( self ):
        lParts = [ "%s" % ( self.range_query.toString() ),
                   "%i\t%f" % ( self.query_length, self.query_length_perc ),
                   "%f" % ( self.match_length_perc ),
                   "%s" % ( self.range_subject.toString() ),
                   "%i\t%f" % ( self.subject_length, self.subject_length_perc ),
                   # the score is written with %i, i.e. as an integer
                   "%g\t%i\t%f" % ( self.e_value, self.score, self.identity ),
                   "%i" % ( self.id ) ]
        return "\t".join( lParts )

    ## Return a Path instance
    #
    # @return a new Path set with the identifier, both ranges, the E-value, the score and the identity
    #
    def getPathInstance( self ):
        p = Path()
        pathData = ( self.id,
                     self.range_query.seqname,
                     self.range_query.start,
                     self.range_query.end,
                     self.range_subject.seqname,
                     self.range_subject.start,
                     self.range_subject.end,
                     self.e_value,
                     self.score,
                     self.identity )
        p.setFromTuple( pathData )
        return p

    ## Give information about a match whose query is included in the subject
    #
    # @return string
    #
    def getQryIsIncluded( self ):
        string = "query %s (%d bp: %d-%d) is contained in subject %s (%d bp: %d-%d): id=%.2f - %.3f - %.3f - %.3f" %\
                 ( self.range_query.seqname, self.query_seqlength, self.range_query.start, self.range_query.end,
                   self.range_subject.seqname, self.subject_seqlength, self.range_subject.start, self.range_subject.end,
                   self.identity, self.query_length_perc, self.match_length_perc, self.subject_length_perc )
        return string

    ## Add the given coverage to the length percentage on the query
    #
    # @param coverage float value added to query_length_perc
    #
    def increaseLengthPercOnQuery(self, coverage):
        self.query_length_perc += coverage

    ## Print the reason why two matches differ (only if verbose > 0)
    #
    # @param msg string message to print
    # @param verbose int verbosity
    # @return always False, so that the caller can return it directly
    #
    @staticmethod
    def _reportDifference( msg, verbose ):
        if verbose > 0:
            print(msg)
            sys.stdout.flush()
        return False

    ## Compare the object with another match and see if they are equal
    # (same identity, E-value and score + same subsequences whether in query or subject)
    #
    # @param match a Match instance
    # @param verbose int if > 0, the reason of a difference is printed
    # @return True if objects are equals False otherwise
    #
    def isDoublonWith( self, match, verbose=0 ):

        # both matches must have same identity, score and E-value
        if not ( self.identity == match.identity and self.score == match.score and self.e_value == match.e_value ):
            return Match._reportDifference( "different match numbers", verbose )

        # query and subject are identical
        if self.range_query.seqname == match.range_query.seqname \
        and self.range_subject.seqname == match.range_subject.seqname:
            if self.range_query == match.range_query and self.range_subject == match.range_subject:
                return True
            return Match._reportDifference( "different coordinates", verbose )

        # query and subject are reversed but identical
        if self.range_query.seqname == match.range_subject.seqname \
        and self.range_subject.seqname == match.range_query.seqname:
            if self.range_query == match.range_subject and self.range_subject == match.range_query:
                return True
            return Match._reportDifference( "different coordinates", verbose )

        return Match._reportDifference( "different sequence names", verbose )
## Static methods for the manipulation of Match instances
#
class MatchUtils (object):

    ## Return a list with Match instances from the given file
    #
    # Lines starting with "query.name" (header lines) are skipped.
    #
    # @param inFile name of a file in the Match format
    # @return a list of Match instances
    #
    @staticmethod
    def getMatchListFromFile(inFile):
        lMatchInstances = []
        with open(inFile, "r") as inFileHandler:
            for line in inFileHandler:
                if line[0:10] == "query.name":
                    continue
                m = Match()
                m.setFromString(line)
                lMatchInstances.append(m)
        return lMatchInstances

    ## Split a Match list in several Match lists according to the subject
    #
    # @param lMatches a list of Match instances
    # @return a dictionary which keys are subject names and values Match lists
    #
    @staticmethod
    def getDictOfListsWithSubjectAsKey(lMatches):
        dSubject2MatchList = {}
        for iMatch in lMatches:
            dSubject2MatchList.setdefault(iMatch.range_subject.seqname, []).append(iMatch)
        return dSubject2MatchList

    ## Split a Match list in several Match lists according to the query
    #
    # @param lMatches a list of Match instances
    # @return a dictionary which keys are query names and values Match lists
    #
    @staticmethod
    def getDictOfListsWithQueryAsKey (lMatches):
        dQuery2MatchList = {}
        for iMatch in lMatches:
            dQuery2MatchList.setdefault(iMatch.range_query.seqname, []).append(iMatch)
        return dQuery2MatchList

    ## Write Match instances contained in the given list
    #
    # @param lMatches a list of Match instances
    # @param fileName name of the file to write the Match instances
    # @param mode the open mode of the file ""w"" or ""a""
    # @param header string written as is before the matches (nothing if None or empty)
    #
    @staticmethod
    def writeListInFile(lMatches, fileName, mode = "w", header = None):
        with open(fileName, mode) as fileHandler:
            if header:
                fileHandler.write(header)
            for iMatch in lMatches:
                iMatch.write(fileHandler)

    ## Give path id list from a list of Match instances
    #
    # @param lMatch list of Match instances
    # @return lId integer list
    #
    @staticmethod
    def getIdListFromMatchList(lMatch):
        return [iMatch.id for iMatch in lMatch]

    ## Remove duplicated matches in a match list
    # replace old PyRepet.MatchDB.rmvDoublons()
    #
    # The first occurrence of each match is kept. A RepetException is raised if
    # the final check still finds two matches with different ids that are doublons.
    #
    # @param lMatch list of Match instances
    # @return lMatchesUniq match unique list
    #
    @staticmethod
    def rmvDuplicateMatches(lMatch):
        lMatchesUniq = []
        for match in lMatch:
            if not any(match.isDoublonWith(m) for m in lMatchesUniq):
                lMatchesUniq.append(match)

        # sanity check on the result
        for match1 in lMatchesUniq:
            for match2 in lMatchesUniq:
                if match1.id != match2.id and match1.isDoublonWith(match2):
                    raise RepetException ("*** Error: doublon not removed")
        return lMatchesUniq

    ## Return the list of queries 'included' in subjects when two different databanks are used.
    # replace old pyRepet.MatchDB.filterDiffQrySbj()
    #
    # A query is kept when it has no match, or when none of its matches reaches
    # both thresholds.
    #
    # @param iBioseqDB bioseqDB databank of queries
    # @param matchFile string name of a file in the Match format
    # @param thresIdentity float identity threshold
    # @param thresLength float length threshold
    # @param verbose int verbosity
    #
    # @return lQryToKeep list of the headers of the queries to keep
    # TODO: don't take into account match for sequence against itself. To do ?
    @staticmethod
    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity = 0.95, thresLength = 0.98, verbose = 0):
        if verbose > 0:
            print("filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength))
            sys.stdout.flush()

        # the threshold is given as a fraction, the identity of a match is compared to it times 100
        thresIdentityPerc = math.floor(thresIdentity * 100)
        lQryToKeep = []
        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))

        for seqH in iBioseqDB.idx.keys():
            isConditionsMet = False
            # a query without any match is kept
            for match in dQry2Matches.get(seqH, []):
                if match.identity >= thresIdentityPerc and match.query_length_perc >= thresLength:
                    isConditionsMet = True
                    break
            if not isConditionsMet and seqH not in lQryToKeep:
                lQryToKeep.append(seqH)
        return lQryToKeep

    ## Count the number of distinct sequences involved in at least one match above the thresholds.
    # replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres()
    #
    # @param lMatches a list of Match instances
    # @param thresIdentity float identity threshold
    # @param thresLength float length threshold
    # @param whatToCount string "query" (case-insensitive) to group by query, any other value groups by subject;
    #                    its lower-case form is also the prefix of the '<prefix>_length_perc' attribute that is tested
    # @return the number of sequences with at least one match above both thresholds
    #
    @staticmethod
    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity = 0.95, thresLength = 0.98, whatToCount = "query"):
        thresIdentityPerc = math.floor(thresIdentity * 100)
        what = whatToCount.lower()
        lengthPercAttribute = what + "_length_perc"
        if what == "query":
            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
        else:
            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)

        countSbj = 0
        for lMatchesOfSeq in dMatches.values():
            for match in lMatchesOfSeq:
                if match.identity >= thresIdentityPerc and getattr(match, lengthPercAttribute) >= thresLength:
                    countSbj += 1
                    break
        return countSbj

    ## Convert a 'match' file (output from Matcher) into an 'align' file
    # replace old parser.tab2align
    #
    # The output file name is the input file name with its extension replaced by '.align'.
    #
    # @param inFileName a string input file name
    #
    @staticmethod
    def convertMatchFileToAlignFile(inFileName):
        basename = os.path.splitext(inFileName)[0]
        outFileName = "%s.align" % basename

        with open(outFileName, "w") as outFile:
            for match in MatchUtils.getMatchListFromFile(inFileName):
                string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity())
                outFile.write(string)

    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
    # for each match, compute coverage on the smallest seq using this expression
    #             Min (|QMatchEnd - QMatchStart + 1|,|SMatchEnd - SMatchStart + 1|)
    # coverage = -----------------------------------------------------------------
    #                              Min (QLength,SLength)
    #
    # Use this coverage (rounded to 2 decimals) for arc value. Matches of a sequence on itself are filtered.
    # The first line of the input file is skipped. The names of the sequences that
    # appear in no written arc are listed, sorted, in '<outFileName>.unused'.
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverageThreshold float coverage filter threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileWithCoverageComputeOnSmallestSeq(matchFileName, outFileName, coverageThreshold = 0):
        sSeqNames = set()
        sSeqNamesUsed = set()
        with open(outFileName, "w") as outF:
            with open(matchFileName) as inF:
                inF.readline()
                for inLine in inF:
                    splittedLine = inLine.split("\t")
                    qName = splittedLine[0]
                    sName = splittedLine[6]
                    sSeqNames.add(qName)
                    sSeqNames.add(sName)
                    if qName == sName:
                        continue
                    matchMin = min(abs(float(splittedLine[2]) - float(splittedLine[1]) + 1), abs(float(splittedLine[8]) - float(splittedLine[7]) + 1))
                    # sequence length = match length / fraction of the sequence covered by the match
                    seqMin = min(int(splittedLine[3]) / float(splittedLine[4]), int(splittedLine[9]) / float(splittedLine[10]))
                    coverage = round(matchMin / seqMin, 2)
                    if coverage >= coverageThreshold:
                        outF.write("%s\n" % "\t".join([qName, sName, str(coverage)]))
                        sSeqNamesUsed.add(qName)
                        sSeqNamesUsed.add(sName)

        with open("%s.unused" % outFileName, "w") as outSeqNameOfUnusedMatchesF:
            for seqName in sorted(sSeqNames - sSeqNamesUsed):
                outSeqNameOfUnusedMatchesF.write("%s\n" % seqName)

        if FileUtils.isEmpty(outFileName):
            print("WARNING: '%s' is empty." % outFileName)

    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
    # Use coverage on query for arc value
    #
    # The first line of the input file is skipped.
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverage float query coverage filter threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
        with open(matchFileName) as inF:
            with open(outFileName, "w") as outF:
                inF.readline()
                for inLine in inF:
                    splittedLine = inLine.split("\t")
                    if float(splittedLine[4]) >= coverage:
                        outLine = "\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])
                        outLine += "\n"
                        outF.write(outLine)

        if FileUtils.isEmpty(outFileName):
            print("WARNING: '%s' is empty." % outFileName)

    ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines.
    # replace old parser.tabnum2id
    #
    # A new identifier, numbered from 1, is given to each distinct combination of
    # (old identifier, query name, subject name), in order of first appearance.
    #
    # @param fileName a string input file name
    # @param outputFileName a string output file name (optional, the input file is overwritten if None)
    #
    @staticmethod
    def generateMatchFileWithNewPathId(fileName, outputFileName = None):
        # the matches have to be loaded BEFORE the output file is opened:
        # by default the output is the input file, and opening it for writing empties it
        lMatches = MatchUtils.getMatchListFromFile(fileName)
        if outputFileName is None:
            outputFileName = fileName

        dMatchKeyIdcount = {}
        with open(outputFileName, "w") as outFile:
            outFile.write("query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n")
            for match in lMatches:
                key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()
                if key_id not in dMatchKeyIdcount:
                    dMatchKeyIdcount[ key_id ] = len(dMatchKeyIdcount) + 1
                match.id = dMatchKeyIdcount[ key_id ]
                outFile.write(match.toString() + "\n")
TEisotools-1.0/TEisotools-1.0/commons/core/coord/MergeFromOverlaps.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/MergeFromOverlaps.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,52 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Cluster features that overlap each other
#
class MergeFromOverlaps(object):

    ## Constructor
    #
    # @param lFeatures list of features implementing isOverlapping() and merge() (e.g Range-based and MergedRange-based objects)
    #
    def __init__(self, lFeatures):
        self._lFeaturesToCluster = lFeatures

    ## Merge the features given to the constructor into clusters.
    #
    # Each feature absorbs, through its own merge() method, every cluster built
    # so far that it overlaps; the features are therefore modified in place.
    # The clusters are returned in the order in which they were last extended,
    # no sort on the coordinates is done here.
    #
    # @return list of the features that represent the clusters
    #
    def clusterize(self):
        lClusters = []
        for iFeature in self._lFeaturesToCluster:
            lAbsorbed = []
            for iCluster in lClusters:
                if not iFeature.isOverlapping(iCluster):
                    continue
                lAbsorbed.append(iCluster)
                # the feature grows here, which can make it overlap the next clusters
                iFeature.merge(iCluster)
            for iCluster in lAbsorbed:
                lClusters.remove(iCluster)
            lClusters.append(iFeature)
        return lClusters
LOG_DEPTH = "repet.commons.core.coord"

## Merge Path objects with the same ID in a single object
#
class MergedPath(object):

    __slots__ = ("_lPaths", "_length", "_ID", "_nbPaths", "_log")

    ## Constructor
    #
    # @param lPaths list of Path instances added one by one with add() (optional)
    # @param verbosity int verbosity given to the logger
    #
    def __init__(self, lPaths = None, verbosity = 0):
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), verbosity = verbosity)
        self._lPaths = []
        self._length = 0
        self._ID = None
        self._nbPaths = 0

        if lPaths:
            for p in lPaths:
                self.add(p)

    ## repr
    #
    def __repr__(self):
        return self.toString()

    ## Equal operator
    #
    # Two instances are equal if they have the same ID, length, number of paths
    # and the same paths, whatever the order of the paths.
    #
    # @param o the object to compare with
    #
    def __eq__(self, o):
        if type(o) is not type(self):
            return False
        if self.getID() != o.getID() or self.getLength() != o.getLength() or self._nbPaths != o._nbPaths:
            return False
        # order-insensitive comparison of the two lists, relying only on the
        # equality of the paths (they do not need to be hashable)
        lRemaining = list(o.getPathsList())
        for path in self.getPathsList():
            if path not in lRemaining:
                return False
            lRemaining.remove(path)
        return not lRemaining

    ## Not equal operator
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## Try to add a Path instance to this MergedPath instance
    #
    # The path is refused, with a warning in the log, if it is not a Path or if
    # its identifier differs from the one of the paths already added.
    #
    # @param path a Path instance
    #
    def add(self, path):
        if not isinstance(path, Path):
            msg = "Error when adding to MergedPath : Not a Path"
            self._log.warning(msg)
        elif self._ID is not None and self._ID != path.getIdentifier():
            msg = "Couldn't add Path (%i) to MergedPath (%i) : Different IDs" % (path.getIdentifier(), self._ID)
            self._log.warning(msg)
        else:
            self._ID = path.getIdentifier()
            self._lPaths.append(path)
            self._length += path.getAlignInstance().getLengthOnQuery()
            self._nbPaths = len(self._lPaths)

    ## Get the total length of the Paths in this MergedPath instance
    #
    def getLength(self):
        return self._length

    ## Get the list of Paths objects in this MergedPath instance
    #
    def getPathsList(self):
        return self._lPaths

    ## Get the ID of the Path
    #
    def getID(self):
        return self._ID

    ## Get the number of paths (fragments)
    #
    def getNbPaths(self):
        return self._nbPaths

    ## Get the weighted identity
    #
    # @return the mean of the identities of the paths weighted by their length on the query,
    #         rounded to 2 decimals, or 0 if there is no path
    #
    def getWeightedIdentity(self):
        res = 0
        if self._nbPaths > 0:
            lSizes = [abs(path.range_query.end - path.range_query.start) + 1 for path in self._lPaths]
            sumIdentity = sum(size * path.identity for size, path in zip(lSizes, self._lPaths))
            sumSizes = sum(lSizes)
            res = sumIdentity / sumSizes
            res = float("{0:.2f}".format(res)) # Round to 2 decimals
        return res

    ## Return the attributes as a formatted string
    #
    # @return "Empty MergedPath()" as long as no path has been added
    #
    def toString(self):
        string = "Empty MergedPath()"
        if self._ID is not None:
            string = "ID: %i" % self._ID
            string += "\tLength: %i" % self._length
            string += "\tNbPaths: %i" % self._nbPaths
            string += "\tPaths:\n"
            string += "\n".join([p.toString() for p in self._lPaths])
        return string
INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Record a region on multiple sequences using Path ID information
#
# A MergedRange is a (start, end) interval together with the list of the
# identifiers of the matches that were folded into it.
#
class MergedRange(object):

    __slots__ = ("_lId", "_start", "_end")

    ## Constructor
    #
    # @param lId list of Path ID (a falsy value, e.g. None or [], gives a new empty list)
    # @param start the start coordinate
    # @param end the end coordinate
    #
    def __init__(self, lId = None, start = -1, end = -1):
        self._lId = lId if lId else []
        self._start = start
        self._end = end

    ## Equal operator
    #
    # Two instances are equal when they have exactly the same type, the same
    # ID list and the same coordinates.
    #
    # @param o a MergedRange instance
    #
    def __eq__(self, o):
        if type(o) is not type(self):
            return False
        if o._lId != self._lId:
            return False
        return o._start == self._start and o._end == self._end

    ## Not equal operator
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## Return True if the MergedRange instance overlaps with another MergedRange instance, False otherwise
    #
    # @param o a MergedRange instance
    # @return boolean False or True
    #
    def isOverlapping(self, o):
        # o covers this instance entirely
        isIncluded = o._start <= self._start and o._end >= self._end
        # one of the two bounds of o falls inside this instance
        startInside = self._start <= o._start <= self._end
        endInside = self._start <= o._end <= self._end
        return isIncluded or startInside or endInside

    ## Merge coordinates and ID of two Merged Range
    #
    # The instance is enlarged to span both ranges; the identifiers of o are
    # added to its own list, which is then sorted in place.
    #
    # @param o a MergedRange instance
    #
    def merge(self, o):
        lowest = min(self._start, o._start)
        highest = max(self._end, o._end)
        self._start = lowest
        self._end = highest
        self._lId.extend(o._lId)
        self._lId.sort()

    ## Set a Merged Range instance using a Match instance
    #
    # @param iMatch instance Match instance
    #
    def setFromMatch(self, iMatch):
        queryRange = iMatch.range_query
        self._lId = [iMatch.id]
        self._start = queryRange.start
        self._end = queryRange.end

    ## Get a Merged Range instance list using a Match instance list
    #
    # @param lIMatch list Match instance list
    # @return lMergedRange list MergedRange instance list (one per Match, same order)
    #
    @staticmethod
    def getMergedRangeListFromMatchList(lIMatch):
        lMergedRange = []
        for iMatch in lIMatch:
            iMergedRange = MergedRange()
            iMergedRange.setFromMatch(iMatch)
            lMergedRange.append(iMergedRange)
        return lMergedRange
# diff -r   (changeset-dump separator, continued on the next line)
782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/Path.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/Path.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,161 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.coord.Set import Set
from commons.core.coord.Align import Align
from commons.core.coord.Range import Range


## Handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity) with an identifier
#
class Path( Align ):

    # One extra attribute: the identifier. The original `("id")` was a plain
    # string, not a tuple; the trailing comma states the intent explicitly
    # without changing the slot that is declared.
    __slots__ = ("id",)

    ## Constructor
    #
    # @param id identifier (converted with int())
    # @param range_q: a Range instance for the query (a new empty Range when omitted)
    # @param range_s: a Range instance for the subject (a new empty Range when omitted)
    # @param e_value: E-value of the match
    # @param score: score of the match
    # @param identity: identity percentage of the match
    #
    def __init__( self, id=-1, range_q=None, range_s=None, e_value=0, score=0, identity=0 ):
        # The default ranges are built per call: a `Range()` written in the
        # signature is evaluated only once, so every Path created without
        # explicit ranges would receive the very same two Range objects.
        if range_q is None:
            range_q = Range()
        if range_s is None:
            range_s = Range()
        self.id = int( id )
        Align.__init__( self, range_q, range_s, e_value, score, identity )

    ## Equal operator
    #
    # Two Path instances are equal when they have exactly the same type, the
    # same identifier and are equal according to Align.__eq__().
    #
    def __eq__(self, o):
        if type(o) is not type(self) or self.id != o.id:
            return False
        return Align.__eq__(self, o)

    ## Not equal operator
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## repr
    #
    def __repr__(self):
        return self.toString()

    ## Set attributes from tuple
    #
    # @param tuple a tuple with (id,queryName,queryStart,queryEnd,subjectName,subjectStart,subjectEnd,E-value,score,identity)
    # @note the first item is the identifier, the remaining ones are handed over to Align.setFromTuple()
    # @note data are loaded such that the query is always on the direct strand
    #
    def setFromTuple(self, tuple):
        self.id = int(tuple[0])
        Align.setFromTuple(self, tuple[1:])

    ## Reset
    #
    # Put the identifier back to -1 and reset the Align part.
    #
    def reset(self):
        self.id = -1
        Align.reset(self)

    ## Return the attributes as a formatted string
    #
    # @return the identifier, a tabulation, then the string built by Align.toString()
    #
    def toString(self):
        return "%i\t%s" % ( self.id, Align.toString(self) )


    ## Return the identifier of the Path instance
    #
    def getIdentifier( self ):
        return self.id

    ## Return a Set instance with the subject mapped on the query
    #
    # The Set is named after the subject and located on the query sequence,
    # with the query coordinates; they are swapped when the subject is not on
    # the direct strand.
    #
    def getSubjectAsSetOfQuery(self):
        iSet = Set()
        iSet.id = self.id
        iSet.name = self.range_subject.seqname
        iSet.seqname = self.range_query.seqname
        if self.range_subject.isOnDirectStrand():
            iSet.start = self.range_query.start
            iSet.end = self.range_query.end
        else:
            iSet.start = self.range_query.end
            iSet.end = self.range_query.start
        return iSet

    #TODO: add tests !!!!
    #WARNING: subject always in direct strand !!!
    ## Return a Set instance with the query mapped on the subject
    #
    # The Set is named after the query and located on the subject sequence,
    # with the subject coordinates; they are swapped when the subject is not
    # on the direct strand.
    #
    def getQuerySetOfSubject(self):
        iSet = Set()
        iSet.id = self.id
        iSet.name = self.range_query.seqname
        iSet.seqname = self.range_subject.seqname
        if self.range_subject.isOnDirectStrand():
            iSet.start = self.range_subject.start
            iSet.end = self.range_subject.end
        else:
            iSet.start = self.range_subject.end
            iSet.end = self.range_subject.start
        return iSet

    ## Return True if the instance can be merged with another Path instance, False otherwise
    #
    # All of the following must hold: different identifiers, same query and
    # subject names, same strand on query and on subject, overlapping query
    # ranges and overlapping subject ranges.
    #
    # @param o a Path instance
    #
    def canMerge(self, o):
        return o.id != self.id \
               and o.range_query.seqname == self.range_query.seqname \
               and o.range_subject.seqname == self.range_subject.seqname \
               and o.range_query.isOnDirectStrand() == self.range_query.isOnDirectStrand() \
               and o.range_subject.isOnDirectStrand() == self.range_subject.isOnDirectStrand() \
               and o.range_query.isOverlapping(self.range_query) \
               and o.range_subject.isOverlapping(self.range_subject)

    ## Return an Align instance with the same attributes, except the identifier
    #
    def getAlignInstance(self):
        iAlign = Align()
        # same field order as the one consumed by Path.setFromTuple(), minus the identifier
        lAttributes = [
            self.range_query.seqname,
            self.range_query.start,
            self.range_query.end,
            self.range_subject.seqname,
            self.range_subject.start,
            self.range_subject.end,
            self.e_value,
            self.score,
            self.identity,
        ]
        iAlign.setFromTuple( lAttributes )
        return iAlign
# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/PathUtils.py ---   (changeset-dump separator, continued on the next line)
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/PathUtils.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,922 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import os
import sys
import copy
from commons.core.coord.Map import Map
from commons.core.coord.Path import Path
from commons.core.coord.Align import Align
from commons.core.coord.Range import Range
from commons.core.coord.SetUtils import SetUtils
from commons.core.coord.AlignUtils import AlignUtils
from commons.core.checker.RepetException import RepetDataException

## Static methods for the manipulation of Path instances
#
class PathUtils ( object ):

    ## Change the identifier of each Path instance in the given list
    #
    # @param lPaths list of Path instances (modified in place)
    # @param newId new identifier
    #
    @staticmethod
    def changeIdInList(lPaths, newId):
        for iPath in lPaths:
            iPath.id = newId


    ## Return a list of Set instances containing the query range from a list of Path instances
    #
    # @param lPaths a list of Path instances
    #
    @staticmethod
    def getSetListFromQueries(lPaths):
        return [ iPath.getSubjectAsSetOfQuery() for iPath in lPaths ]


    ## Return a list of Set instances containing the subject range from a list of Path instances
    #
    # @param lPaths a list of Path instances
    #
    @staticmethod
    def getSetListFromSubjects(lPaths):
        return [ iPath.getQuerySetOfSubject() for iPath in lPaths ]


    ## Return a sorted list of Range instances containing the subjects from a list of Path instances
    #
    # @param lPaths a list of Path instances
    # @note meaningful only if all Path instances have same identifier
    # @note the strand of the first subject decides the sort key; an empty input gives an empty list
    #
    @staticmethod
    def getRangeListFromSubjects( lPaths ):
        lRanges = [ iPath.range_subject for iPath in lPaths ]
        if not lRanges:
            # nothing to sort, and no first element to read the strand from
            return []
        if lRanges[0].isOnDirectStrand():
            return sorted( lRanges, key=lambda iRange: ( iRange.getMin(), iRange.getMax() ) )
        else:
            return sorted( lRanges, key=lambda iRange: ( iRange.getMax(), iRange.getMin() ) )


    ## Return a tuple with min and max of query coordinates from Path instances in the given list
    #
    # @param lPaths a list of Path instances
    # @note (-1, -1) is returned for an empty list
    #
    @staticmethod
    def getQueryMinMaxFromPathList(lPaths):
        qmin = -1
        qmax = -1
        for iPath in lPaths:
            if qmin == -1:
                # -1 means "not initialised yet"
                qmin = iPath.range_query.start
            qmin = min(qmin, iPath.range_query.getMin())
            qmax = max(qmax, iPath.range_query.getMax())
        return (qmin, qmax)


    ## Return a tuple with min and max of subject coordinates from Path instances in the given list
    #
    # @param lPaths lists of Path instances
    # @note (-1, -1) is returned for an empty list
    #
    @staticmethod
    def getSubjectMinMaxFromPathList(lPaths):
        smin = -1
        smax = -1
        for iPath in lPaths:
            if smin == -1:
                # -1 means "not initialised yet"
                smin = iPath.range_subject.start
            smin = min(smin, iPath.range_subject.getMin())
            smax = max(smax, iPath.range_subject.getMax())
        return (smin, smax)


    ## Returns a Path objects list where Paths query coordinates overlapping with
    #  any Path in a list are removed.
    #
    # WARNING: input Path lists are modified (sort)
    #
    # Paths to clean are handled chain by chain (same query name, same
    # identifier): a whole chain is dropped as soon as one of its Paths
    # overlaps, on the query, a Path of the reference list.
    #
    # @param lRefPaths list of paths to check overlaps
    # @param lPathsToClean list of paths to remove overlapping Paths on query coordinates
    # @return path list
    @staticmethod
    def removeOverlappingPathsOnQueriesBetweenPathLists(lRefPaths, lPathsToClean):
        if not lRefPaths:
            print("WARNING: empty reference Paths list")
            return lPathsToClean

        lRefQueries = PathUtils.getListOfDistinctQueryNames(lRefPaths)
        lToCleanQueries = PathUtils.getListOfDistinctQueryNames(lPathsToClean)

        lCommonQueries = sorted(set(lRefQueries) & set(lToCleanQueries))
        # only used for membership tests below, hence no need to sort it
        lSpecificToCleanQueries = list(set(lToCleanQueries) - set(lCommonQueries))

        sortKey = lambda iPath: (iPath.getQueryName(), iPath.getIdentifier(), iPath.getQueryMin(), iPath.getQueryMax())
        lRefPaths.sort(key=sortKey)
        lPathsToClean.sort(key=sortKey)

        # queries absent from the reference cannot overlap: keep their Paths as they are
        lCleanedPaths = []
        lCleanedPaths.extend(PathUtils.extractPathsFromQueryNameList(lPathsToClean, lSpecificToCleanQueries))

        dRefQueryToPathList = PathUtils.getDictOfListsWithQueryNameAsKey(lRefPaths)
        dToCleanQueryToPathList = PathUtils.getDictOfListsWithQueryNameAsKey(lPathsToClean)

        for queryName in lCommonQueries:

            refQueryHash = PathUtils.getDictOfListsWithIdAsKey(dRefQueryToPathList[queryName])
            toCleanQueryHash = PathUtils.getDictOfListsWithIdAsKey(dToCleanQueryToPathList[queryName])

            for lCleanPathById in toCleanQueryHash.values():
                isOverlapping = any(
                    PathUtils.areQueriesOverlappingBetweenPathLists(lRefPathById, lCleanPathById, areListsAlreadySort = True)
                    for lRefPathById in refQueryHash.values()
                )
                if not isOverlapping:
                    lCleanedPaths.extend(lCleanPathById)

        return lCleanedPaths


    ## Return True if the query range of any Path instance from the first list overlaps with the query range of any Path instance from the second list
    #
    # @param lPaths1: list of Path instances
    # @param lPaths2: list of Path instances
    # @param areListsAlreadySort: if True, the two lists are used as given instead of being sorted first
    # @return boolean
    #
    @staticmethod
    def areQueriesOverlappingBetweenPathLists( lPaths1, lPaths2, areListsAlreadySort = False):
        if not areListsAlreadySort:
            lSortedPaths1 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths1 )
            lSortedPaths2 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths2 )
        else:
            lSortedPaths1 = lPaths1
            lSortedPaths2 = lPaths2
        # every pair is compared, stopping at the first overlap
        for iPath1 in lSortedPaths1:
            for iPath2 in lSortedPaths2:
                if iPath1.range_query.isOverlapping( iPath2.range_query ):
                    return True
        return False


    ## Show Path instances contained in the given list
    #
    # @param lPaths a list of Path instances
    #
    @staticmethod
    def showList(lPaths):
        for iPath in lPaths:
            iPath.show()


    ## Write Path instances contained in the given list
    #
    # @param lPaths a list of Path instances
    # @param fileName name of the file to write the Path instances
    # @param mode the open mode of the file ""w"" or ""a""
    #
    @staticmethod
    def writeListInFile(lPaths, fileName, mode="w"):
        AlignUtils.writeListInFile(lPaths, fileName, mode)


    ## Return new list of Path instances with no duplicate
    #
    # @param lPaths a list of Path instances
    # @param useOnlyCoord boolean if True, check only coordinates and sequence names
    # @return lUniqPaths a path instances list
    # @note a list with fewer than two elements is returned as is (same object)
    # @note only consecutive Paths of the sorted list are compared
    #
    @staticmethod
    def getPathListWithoutDuplicates(lPaths, useOnlyCoord = False):
        if len(lPaths) < 2:
            return lPaths
        lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
        lUniqPaths = [ lSortedPaths[0] ]
        if useOnlyCoord:
            for iPath in lSortedPaths[1:]:
                iLast = lUniqPaths[-1]
                if iPath.range_query.start != iLast.range_query.start \
                or iPath.range_query.end != iLast.range_query.end \
                or iPath.range_query.seqname != iLast.range_query.seqname \
                or iPath.range_subject.start != iLast.range_subject.start \
                or iPath.range_subject.end != iLast.range_subject.end \
                or iPath.range_subject.seqname != iLast.range_subject.seqname:
                    lUniqPaths.append( iPath )
        else:
            for iPath in lSortedPaths[1:]:
                if iPath != lUniqPaths[-1]:
                    lUniqPaths.append( iPath )
        return lUniqPaths


    ## Return new list of Path instances with no duplicate on the query (start, end and name of the query only)
    #
    # @param lPaths a list of Path instances
    # @return lUniqPaths a path instances list
    # @note a list with fewer than two elements is returned as is (same object)
    #
    @staticmethod
    def getPathListWithoutDuplicatesOnQueryCoord(lPaths):
        if len(lPaths) < 2:
            return lPaths
        lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
        lUniqPaths = [ lSortedPaths[0] ]
        for iPath in lSortedPaths[1:]:
            iLast = lUniqPaths[-1]
            if iPath.range_query.start != iLast.range_query.start \
            or iPath.range_query.end != iLast.range_query.end \
            or iPath.range_query.seqname != iLast.range_query.seqname:
                lUniqPaths.append( iPath )
        return lUniqPaths


    ## Split a Path list in several Path lists according to the identifier
    #
    # @param lPaths a list of Path instances
    # @return a dictionary which keys are identifiers and values Path lists
    #
    @staticmethod
    def getDictOfListsWithIdAsKey(lPaths):
        dId2PathList = dict((ident, []) for ident in PathUtils.getListOfDistinctIdentifiers(lPaths))
        for iPath in lPaths:
            dId2PathList[iPath.id].append(iPath)
        return dId2PathList


    ## Split a Path list in several Path lists according to the query name
    #
    # @param lPaths a list of Path instances
    # @return a dictionary which keys are query_names and values Path lists
    #
    @staticmethod
    def getDictOfListsWithQueryNameAsKey(lPaths):
        dId2PathList = dict((qn, []) for qn in PathUtils.getListOfDistinctQueryNames(lPaths))
        for iPath in lPaths:
            dId2PathList[iPath.getQueryName()].append(iPath)
        return dId2PathList


    ## Split a Path file in several Path lists according to the identifier
    #
    # @param pathFile name of the input Path file
    # @return a dictionary which keys are identifiers and values Path lists
    #
    @staticmethod
    def getDictOfListsWithIdAsKeyFromFile( pathFile ):
        dId2PathList = {}
        with open(pathFile, "r") as pathFileHandler:
            for line in pathFileHandler:
                iPath = Path()
                iPath.setFromString(line)
                dId2PathList.setdefault(iPath.id, []).append(iPath)
        return dId2PathList


    ## Return a list of Path list(s) obtained while splitting a list of connected Path instances according to another based on query coordinates
    #  Only the path instance of lToKeep between path instance of lToUnjoin are used to split lToUnjoin
    # @param lToKeep: a list of Path instances to keep (reference)
    # @param lToUnjoin: a list of Path instances to unjoin
    # @return: list of Path list(s) (can be empty if one of the input lists is empty)
    # @warning: all the path instances in a given list MUST be connected (i.e. same identifier)
    # @warning: if the path instances in a given list overlap neither within each other nor with the Path instances of the other list, these path instances are not used to split the lToUnjoin
    #
    @staticmethod
    def getPathListUnjoinedBasedOnQuery( lToKeep, lToUnjoin ):
        if not lToUnjoin:
            return []
        if not lToKeep:
            return [ lToUnjoin ]

        lSortedToKeep = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToKeep )
        lSortedToUnjoin = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToUnjoin )
        nbToKeep = len(lSortedToKeep)

        lLists = []
        k = 0
        while k < nbToKeep:
            # lSortedToUnjoin shrinks below: its length must be read again each
            # time (a length cached before the loop lets the index run past the end)
            minQueryOfKeepK = lSortedToKeep[k].range_query.getMin()
            j1 = 0
            while j1 < len(lSortedToUnjoin) and minQueryOfKeepK > lSortedToUnjoin[j1].range_query.getMax():
                j1 += 1
            if j1 == len(lSortedToUnjoin):
                # everything left lies before the current Path to keep
                break
            if j1 != 0:
                lLists.append( lSortedToUnjoin[:j1] )
                del lSortedToUnjoin[:j1]
            if k+1 == nbToKeep:
                break
            # Paths lying before the next Path to keep form a group of their own;
            # the bound is tested before indexing so that j2 may reach the end
            minQueryOfKeepKplus1 = lSortedToKeep[k+1].range_query.getMin()
            j2 = 0
            while j2 < len(lSortedToUnjoin) and minQueryOfKeepKplus1 > lSortedToUnjoin[j2].range_query.getMax():
                j2 += 1
            if j2 != 0:
                lLists.append( lSortedToUnjoin[:j2] )
                del lSortedToUnjoin[:j2]
            k += 1

        if lLists != [] or k == 0:
            lLists.append( lSortedToUnjoin )
        else:
            # NOTE(review): this branch returns the flat list of Paths, not a
            # list of lists as documented - kept as is, confirm with callers
            lLists = lSortedToUnjoin

        return lLists


    ## Return the identity of the Path list, the identity of each instance being weighted by the length of each query range
    #  All Paths should have the same query and subject.
    #  The Paths are merged using query coordinates only.
    #
    # @param lPaths list of Path instances
    # @param checkSubjects if True, Paths on several distinct subjects are refused
    # @note Exception is raised for mixed queries or subjects and for a negative result, RepetDataException for a result above 100
    #
    @staticmethod
    def getIdentityFromPathList( lPaths, checkSubjects=True ):
        if len( PathUtils.getListOfDistinctQueryNames( lPaths ) ) > 1:
            msg = "ERROR: try to compute identity from Paths with different queries"
            sys.stderr.write( "%s\n" % msg )
            sys.stderr.flush()
            raise Exception(msg)
        if checkSubjects and len( PathUtils.getListOfDistinctSubjectNames( lPaths ) ) > 1:
            msg = "ERROR: try to compute identity from Paths with different subjects"
            sys.stderr.write( "%s\n" % msg )
            sys.stderr.flush()
            raise Exception(msg)
        identity = 0
        lMergedPaths = PathUtils.mergePathsInListUsingQueryCoordsOnly( lPaths )
        lQuerySets = PathUtils.getSetListFromQueries( lMergedPaths )
        lMergedQuerySets = SetUtils.mergeSetsInList( lQuerySets )
        totalLengthOnQry = SetUtils.getCumulLength( lMergedQuerySets )
        for iPath in lMergedPaths:
            identity += iPath.identity * iPath.getLengthOnQuery()
        weightedIdentity = identity / float(totalLengthOnQry)
        if weightedIdentity < 0:
            msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
            sys.stderr.write("%s\n" % msg)
            sys.stderr.flush()
            raise Exception(msg)
        elif weightedIdentity > 100:
            msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
            sys.stderr.write("%s\n" % msg)
            sys.stderr.flush()
            raise RepetDataException(msg)
        return weightedIdentity


    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, and finally their initial order.
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQuery(lPaths):
        return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax() ) )


    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then their identifier, and finally their initial order.
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths):
        return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax(), iPath.getIdentifier() ) )


    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then the min of the subject, then the max of the subject and finally their initial order.
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQueryThenMinSubjectThenMaxSubject(lPaths):
        return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax(), iPath.getSubjectMin(), iPath.getSubjectMax()))


    ## Return a list of Path instances sorted in increasing order according to the min, then the inverse of the query length, and finally their initial order
    #
    # @param lPaths: list of Path instances
    # @note for a same min, the longest Path comes first
    #
    @staticmethod
    def getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ):
        return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), 1 / float(iPath.getLengthOnQuery()) ) )


    ## Return a list of the distinct identifiers
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getListOfDistinctIdentifiers( lPaths ):
        return list(set(iPath.id for iPath in lPaths))


    ## Return a list of the distinct query names present in the collection
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getListOfDistinctQueryNames( lPaths ):
        return list(set(iPath.range_query.seqname for iPath in lPaths))


    ## Return a list of the distinct subject names present in the collection
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getListOfDistinctSubjectNames( lPaths ):
        return list(set(iPath.range_subject.seqname for iPath in lPaths))


    ## Return a list of paths with matching query names
    #
    # @param lPaths list of Path instances
    # @param queryName query name to extract
    @staticmethod
    def extractPathsFromQueryName(lPaths, queryName):
        return [iPath for iPath in lPaths if iPath.getQueryName() == queryName]


    ## Return a list of paths with matching query names
    #
    # @param lPaths list of Path instances
    # @param lQueryNames query name list to extract
    @staticmethod
    def extractPathsFromQueryNameList(lPaths, lQueryNames):
        sQueryNames = set(lQueryNames)
        return [iPath for iPath in lPaths if iPath.getQueryName() in sQueryNames]


    ## Return a list of paths with matching subject names
    #
    # @param lPaths list of Path instances
    # @param subjectName subject name to extract
    @staticmethod
    def extractPathsFromSubjectName(lPaths, subjectName):
        return [iPath for iPath in lPaths if iPath.getSubjectName() == subjectName]


    ## Return a list of paths with coordinates overlap a given range
    #
    # @param lPaths list of Path instances
    # @param queryName query name to extract
    # @param start starting position
    # @param end ending position
    # @return list of Path instance
    @staticmethod
    def extractPathsFromQueryCoord(lPaths, queryName, start, end):
        iAlign = Align(range_q = Range(queryName, start, end))
        return [iPath for iPath in PathUtils.extractPathsFromQueryName(lPaths, queryName)
                if iPath.isQueryOverlapping(iAlign)]


    ## Return a list of lists containing query coordinates of the connections sorted in increasing order.
    #
    # @param lConnectedPaths: list of Path instances having the same identifier
    # @param minLength: threshold below which connections are not reported (default= 0 bp)
    # @note: return only connections longer than threshold
    # @note: if coordinate on query ends at 100, return 101
    # @warning: Path instances MUST be sorted in increasing order according to query coordinates
    # @warning: Path instances MUST be on direct query strand (and maybe on reverse subject strand)
    #
    @staticmethod
    def getListOfJoinCoordinatesOnQuery(lConnectedPaths, minLength=0):
        lJoinCoordinates = []
        for i in range(1, len(lConnectedPaths)):
            startJoin = lConnectedPaths[i-1].range_query.end
            endJoin = lConnectedPaths[i].range_query.start
            # NOTE(review): the threshold is compared with endJoin - startJoin + 1
            # whereas the reported interval spans endJoin - startJoin - 1
            # positions - confirm which length is intended
            if endJoin - startJoin + 1 > minLength:
                lJoinCoordinates.append( [ startJoin + 1, endJoin - 1 ] )
        return lJoinCoordinates


    ## Return the length on the query of all Path instance in the given list
    #
    # @param lPaths list of Path instances
    # @note overlapping ranges are not summed but truncated.
    #
    @staticmethod
    def getLengthOnQueryFromPathList( lPaths ):
        lSets = PathUtils.getSetListFromQueries( lPaths )
        lMergedSets = SetUtils.mergeSetsInList( lSets )
        return SetUtils.getCumulLength( lMergedSets )


    ## Convert a Path file into an Align file
    #
    # @param pathFile: name of the input Path file
    # @param alignFile: name of the output Align file
    #
    @staticmethod
    def convertPathFileIntoAlignFile(pathFile, alignFile):
        iPath = Path()
        with open(pathFile, "r") as pathFileHandler:
            with open(alignFile, "w") as alignFileHandler:
                for line in pathFileHandler:
                    iPath.setFromString(line)
                    iAlign = iPath.getAlignInstance()
                    iAlign.write(alignFileHandler)


    #TODO: duplicated method => to rename with the name of the next method (which is called) ?
    ## Convert a Path File into a Map file with query coordinates only
    #
    # @param pathFile: name of the input Path file
    # @param mapFile: name of the output Map file
    #
    @staticmethod
    def convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile ):
        p = Path()
        with open(pathFile, "r") as pathFileHandler:
            with open(mapFile, "w") as mapFileHandler:
                for line in pathFileHandler:
                    p.reset()
                    p.setFromTuple(line.split("\t"))
                    p.writeSubjectAsMapOfQuery(mapFileHandler)


    ## for each line of a given Path file, write the coordinates of the subject on the query as one line in a Map file
    #
    # @param pathFile: name of the input Path file
    # @param mapFile: name of the output Map file
    #
    @staticmethod
    def convertPathFileIntoMapFileWithSubjectsOnQueries( pathFile, mapFile ):
        PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile )


    ## Merge matches on queries
    #
    # The Path file is converted into "<inFile>.map", the external program
    # 'mapOp' is run on it, and its result "<inFile>.map.merge" is rewritten
    # as a Path file. Both intermediate files are removed.
    #
    # @param inFile: name of the input Path file
    # @param outFile: name of the output Path file
    # @note the process exits with status 1 if 'mapOp' fails
    #
    @staticmethod
    def mergeMatchesOnQueries(inFile, outFile):
        mapFile = "%s.map" % inFile
        PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly(inFile, mapFile)
        # NOTE(review): the file name is pasted unquoted into a shell command
        # line - only safe for names without shell metacharacters
        cmd = "mapOp"
        cmd += " -q %s" % mapFile
        cmd += " -m"
        cmd += " 2>&1 > /dev/null"
        exitStatus = os.system(cmd)
        if exitStatus != 0:
            print("ERROR: mapOp returned %i" % exitStatus)
            sys.exit(1)
        os.remove(mapFile)
        mergeFile = "%s.merge" % mapFile
        m = Map()
        with open(mergeFile, "r") as mergeFileHandler:
            with open(outFile, "w") as outFileHandler:
                for line in mergeFileHandler:
                    m.reset()
                    m.setFromString(line, "\t")
                    m.writeAsQueryOfPath(outFileHandler)
        os.remove(mergeFile)


    ## Filter chains of Path(s) which length is below a given threshold
    #
    # @param lPaths: list of Path instances
    # @param minLengthChain: minimum length of a chain to be kept
    # @note: a chain may contain a single Path instance
    # @return: a list of Path instances
    #
    @staticmethod
    def filterPathListOnChainLength( lPaths, minLengthChain ):
        lFilteredPaths = []
        dPathnum2Paths = PathUtils.getDictOfListsWithIdAsKey( lPaths )
        for lChain in dPathnum2Paths.values():
            if PathUtils.getLengthOnQueryFromPathList( lChain ) >= minLengthChain:
                lFilteredPaths += lChain
        return lFilteredPaths


    ## Return a Path list from a Path file
    #
    # @param pathFile string name of a Path file
    # @return a list of Path instances
    #
    @staticmethod
    def getPathListFromFile( pathFile ):
        lPaths = []
        with open(pathFile, "r") as pathFileHandler:
            for line in pathFileHandler:
                iPath = Path()
                iPath.setFromString(line)
                lPaths.append(iPath)
        return lPaths


    ## Convert a chain into a 'pathrange'
    #
    # @param lPaths a list of Path instances with the same identifier
    # @return None for an empty list, the Path itself for a single-element list, otherwise a new Path spanning the chain
    # @note: the min and max of each Path is used
    # @note: the E-value is the lowest one, the score is summed, the identity is weighted by the length on the query
    # @note: the process exits with status 1 if identifiers or subject strands differ
    #
    @staticmethod
    def convertPathListToPathrange( lPaths ):
        if len(lPaths) == 0:
            return
        if len(lPaths) == 1:
            return lPaths[0]
        iPathrange = copy.deepcopy( lPaths[0] )
        # 'identity' accumulates identity * length, it is normalised at the end
        iPathrange.identity = lPaths[0].identity * lPaths[0].getLengthOnQuery()
        cumulQueryLength = iPathrange.getLengthOnQuery()
        for iPath in lPaths[1:]:
            if iPath.id != iPathrange.id:
                msg = "ERROR: two Path instances in the chain have different identifiers"
                sys.stderr.write( "%s\n" % ( msg ) )
                sys.exit(1)
            if iPathrange.range_subject.isOnDirectStrand() != iPath.range_subject.isOnDirectStrand():
                msg = "ERROR: two Path instances in the chain are on different strands"
                sys.stderr.write( "%s\n" % ( msg ) )
                sys.exit(1)
            iPathrange.range_query.start = min( iPathrange.range_query.start, iPath.range_query.start )
            iPathrange.range_query.end = max( iPathrange.range_query.end, iPath.range_query.end )
            if iPathrange.range_subject.isOnDirectStrand():
                iPathrange.range_subject.start = min( iPathrange.range_subject.start, iPath.range_subject.start )
                iPathrange.range_subject.end = max( iPathrange.range_subject.end, iPath.range_subject.end )
            else:
                iPathrange.range_subject.start = max( iPathrange.range_subject.start, iPath.range_subject.start )
                iPathrange.range_subject.end = min( iPathrange.range_subject.end, iPath.range_subject.end )
            iPathrange.e_value = min( iPathrange.e_value, iPath.e_value )
            iPathrange.score += iPath.score
            iPathrange.identity += iPath.identity * iPath.getLengthOnQuery()
            cumulQueryLength += iPath.getLengthOnQuery()
        iPathrange.identity = iPathrange.identity / float(cumulQueryLength)
        return iPathrange


    ## Convert a Path file into an Align file via 'pathrange'
    #
    # @param pathFile: name of the input Path file
    # @param alignFile: name of the output Align file
    # @param verbose integer verbosity level
    # @note: the min and max of each Path is used
    # @note: chains are written by increasing identifier
    #
    @staticmethod
    def convertPathFileIntoAlignFileViaPathrange( pathFile, alignFile, verbose=0 ):
        lPaths = PathUtils.getPathListFromFile( pathFile )
        dId2PathList = PathUtils.getDictOfListsWithIdAsKey( lPaths )
        lIds = sorted( dId2PathList.keys() )
        if verbose > 0:
            msg = "number of chains: %i" % ( len(lIds) )
            sys.stdout.write( "%s\n" % ( msg ) )
            sys.stdout.flush()
        with open( alignFile, "w" ) as alignFileHandler:
            for identifier in lIds:
                iPath = PathUtils.convertPathListToPathrange( dId2PathList[ identifier ] )
                iAlign = iPath.getAlignInstance()
                iAlign.write( alignFileHandler )


    ## Split a list of Path instances according to the name of the query
    #
    # @param lInPaths list of align instances
    # @return lOutPathLists list of align instances lists
    #
    @staticmethod
    def splitPathListByQueryName( lInPaths ):
        lInSortedPaths = sorted( lInPaths, key=lambda o: o.range_query.seqname )
        lOutPathLists = []
        if len(lInSortedPaths) != 0 :
            lPathsForCurrentQuery = []
            previousQuery = lInSortedPaths[0].range_query.seqname
            for iPath in lInSortedPaths :
                currentQuery = iPath.range_query.seqname
                if previousQuery != currentQuery :
                    # the query changes: close the current group and open a new one
                    lOutPathLists.append( lPathsForCurrentQuery )
                    previousQuery = currentQuery
                    lPathsForCurrentQuery = []
                lPathsForCurrentQuery.append( iPath )

            lOutPathLists.append(lPathsForCurrentQuery)

        return lOutPathLists


    ## Create an Path file from each list of Path instances in the input list
    #
    # Files are named "<dirName><pattern>_<rank>.path", the rank starting at 1
    # and being zero-padded to the width of the number of files.
    #
    # @param lPathList list of lists with Path instances
    # @param pattern string
    # @param dirName string (the directory is created; a "/" is added at its end if missing)
    #
    @staticmethod
    def createPathFiles( lPathList, pattern, dirName="" ):
        nbFiles = len(lPathList)
        countFile = 1
        if dirName != "" :
            if dirName[-1] != "/":
                dirName = dirName + '/'
            os.mkdir( dirName )

        for lPath in lPathList:
            fileName = dirName + pattern + "_%s.path" % ( str(countFile).zfill( len(str(nbFiles)) ) )
            PathUtils.writeListInFile( lPath, fileName )
            countFile += 1


    ## Merge all overlapping Path instances in a list without considering the identifiers
    #  Start by sorting the Path instances by their increasing min coordinate
    #
    # @note: the Path instances of the input list are modified by the merges
    # @return: a new list with the merged Path instances
    #
    @staticmethod
    def mergePathsInList( lPaths ):
        lMergedPaths = []
        if len(lPaths)==0:
            return lMergedPaths

        lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths )

        prev_count = 0
        for iPath in lSortedPaths[0:]:
            if prev_count != len(lSortedPaths):
                # first absorb every following Path which overlaps the current one
                for i in lSortedPaths[ prev_count + 1: ]:
                    if iPath.isOverlapping( i ):
                        iPath.merge( i )
                # then fold the current Path into the results it overlaps, if any
                isAlreadyInList = False
                for newPath in lMergedPaths:
                    if newPath.isOverlapping( iPath ):
                        isAlreadyInList = True
                        newPath.merge( iPath )
                        lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath
                if not isAlreadyInList:
                    lMergedPaths.append( iPath )
                prev_count += 1
        return lMergedPaths


    ## Merge all overlapping Path instances in a list without considering if subjects are overlapping.
+ # Start by sorting the Path instances by their increasing min coordinate. + # + # @return: a new list with the merged Path instances + # + @staticmethod + def mergePathsInListUsingQueryCoordsOnly( lPaths ): + lMergedPaths = [] + if len(lPaths)==0: + return lMergedPaths + + lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ) + + prev_count = 0 + for iPath in lSortedPaths[0:]: + if prev_count != len(lSortedPaths): + for i in lSortedPaths[ prev_count + 1: ]: + if iPath.isQueryOverlapping( i ): + iPath.merge( i ) + isAlreadyInList = False + for newPath in lMergedPaths: + if newPath.isQueryOverlapping( iPath ): + isAlreadyInList = True + newPath.merge( iPath ) + lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath + if not isAlreadyInList: + lMergedPaths.append( iPath ) + prev_count += 1 + return lMergedPaths + + + ## Convert a Path file into a GFF file + # + # @param pathFile: name of the input Path file + # @param gffFile: name of the output GFF file + # @param source: source to write in the GFF file (column 2) + # + # @note the 'path' query is supposed to correspond to the 'gff' first column + # + @staticmethod + def convertPathFileIntoGffFile( pathFile, gffFile, source="REPET", verbose=0 ): + dId2PathList = PathUtils.getDictOfListsWithIdAsKeyFromFile( pathFile ) + if verbose > 0: + msg = "number of chains: %i" % ( len(dId2PathList.keys()) ) + sys.stdout.write( "%s\n" % msg ) + sys.stdout.flush() + gffFileHandler = open( gffFile, "w" ) + for id in dId2PathList.keys(): + if len( dId2PathList[ id ] ) == 1: + iPath = dId2PathList[ id ][0] + string = iPath.toStringAsGff( ID="%i" % iPath.getIdentifier(), + source=source ) + gffFileHandler.write( "%s\n" % string ) + else: + iPathrange = PathUtils.convertPathListToPathrange( dId2PathList[ id ] ) + string = iPathrange.toStringAsGff( ID="ms%i" % iPathrange.getIdentifier(), + source=source ) + gffFileHandler.write( "%s\n" % string ) + count = 0 + for iPath in dId2PathList[ id ]: + 
count += 1 + string = iPath.toStringAsGff( type="match_part", + ID="mp%i-%i" % ( iPath.getIdentifier(), count ), + Parent="ms%i" % iPathrange.getIdentifier(), + source=source ) + gffFileHandler.write( "%s\n" % string ) + gffFileHandler.close() + + + ## Convert a Path file into a Set file + # replace old parser.pathrange2set + # @param pathFile: name of the input Path file + # @param setFile: name of the output Set file + # + @staticmethod + def convertPathFileIntoSetFile( pathFile, setFile ): + pathFileHandler = open(pathFile, "r") + setFileHandler = open(setFile, "w") + iPath = Path() + for line in pathFileHandler: + iPath.setFromString(line) + iSet = iPath.getSubjectAsSetOfQuery() + iSet.write(setFileHandler) + pathFileHandler.close() + setFileHandler.close() + + ## Write Path File without duplicated Path (same query, same subject and same coordinate) + # + # @param inputFile: name of the input Path file + # @param outputFile: name of the output Path file + # + @staticmethod + def removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName(inputFile, outputFile): + f = open(inputFile, "r") + line = f.readline() + previousQuery = "" + previousSubject = "" + lPaths = [] + while line: + iPath = Path() + iPath.setFromString(line) + query = iPath.getQueryName() + subject = iPath.getSubjectName() + if (query != previousQuery or subject != previousSubject) and lPaths != []: + lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths) + PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a") + lPaths = [] + lPaths.append(iPath) + previousQuery = query + previousSubject = subject + line = f.readline() + lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths) + PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a") + f.close() diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/Range.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/Range.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,363 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 


## Record a region on a given sequence
#
# Coordinates are kept as given: 'start' greater than 'end' denotes the
# reverse strand.
#
class Range( object ):

    __slots__ = ("seqname", "start", "end", '__dict__')

    ## Constructor
    #
    # @param seqname the name of the sequence
    # @param start the start coordinate (converted with int())
    # @param end the end coordinate (converted with int())
    #
    def __init__(self, seqname="", start=-1, end=-1):
        self.seqname = seqname
        self.start = int(start)
        self.end = int(end)

    ## Equal operator
    #
    # @param o a Range instance
    # @return True if 'o' has exactly the same type, sequence name, start and end
    #
    def __eq__(self, o):
        if type(o) is type(self) and self.seqname == o.seqname and self.start == o.start and self.end == o.end:
            return True
        return False

    ## Unequal operator
    #
    # @param o a Range instance
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## Convert the object into a string
    #
    # @note used in 'print myObject'
    #
    def __str__( self ):
        return self.toString()

    ## Convert the object into a string
    #
    # @note used in 'repr(myObject)' for debugging; tabulations are replaced by ';'
    #
    def __repr__( self ):
        return self.toString().replace("\t",";")

    ## Set the start coordinate (stored as given, without conversion)
    #
    def setStart(self, start):
        self.start = start

    ## Set the end coordinate (stored as given, without conversion)
    #
    def setEnd(self, end):
        self.end = end

    ## Set the name of the sequence
    #
    def setSeqName(self, seqName):
        self.seqname = seqName

    ## Reset
    #
    def reset(self):
        self.seqname = ""
        self.start = -1
        self.end = -1

    ## Return the attributes as a formatted string (seqname, start, end separated by tabulations)
    #
    def toString(self):
        string = "%s" % (self.seqname)
        string += "\t%d" % (self.start)
        string += "\t%d" % (self.end)
        return string

    ## Show the attributes
    #
    def show(self):
        # parenthesised form prints the same text under Python 2 and Python 3
        print(self.toString())

    ## Return seqname
    #
    def getSeqname(self):
        return self.seqname

    ## Return the start coordinate
    #
    def getStart(self):
        return self.start

    ## Return the end coordinate
    #
    def getEnd(self):
        return self.end

    ## Return the lowest value between start and end coordinates
    #
    def getMin(self):
        return min(self.start, self.end)

    ## Return the greatest value between start and end attributes
    #
    def getMax(self):
        return max(self.start, self.end)

    ## Return True if the instance is on the direct strand, False otherwise
    #
    # @note start == end is considered as the direct strand
    #
    def isOnDirectStrand(self):
        return self.start <= self.end

    ## Return True if the instance is on the reverse strand, False otherwise
    #
    def isOnReverseStrand(self):
        return not self.isOnDirectStrand()

    ## Return '+' if the instance is on the direct strand, '-' otherwise
    #
    def getStrand(self):
        if self.isOnDirectStrand():
            return '+'
        else:
            return '-'

    ## Exchange start and end coordinates
    #
    def reverse(self):
        self.start, self.end = self.end, self.start

    ## Return the length of the instance
    #
    # @warning old name is 'length'
    #
    def getLength(self):
        return int(abs(self.start-self.end))+1

    ## Return True if the instance is empty, False otherwise
    #
    # @note empty means start == end, both being 0 or -1
    #
    def isEmpty(self):
        if self.start==self.end and (self.start==0 or self.start==-1):
            return True
        return False

    ## Set attributes from tuple
    #
    # @param tuple a tuple with (name,start,end)
    #
    def setFromTuple(self, tuple):
        self.seqname = tuple[0]
        self.start = int(tuple[1])
        self.end = int(tuple[2])

    ## Set attributes from string
    #
    # @param string a string formatted like name<sep>start<sep>end
    # @param sep field separator
    # @note a single trailing newline, if any, is removed before splitting
    #
    def setFromString(self, string, sep="\t"):
        if string.endswith("\n"):
            string = string[:-1]
        self.setFromTuple( string.split(sep) )

    ## Merge the instance with another Range instance
    #
    # @param o a Range instance
    # @note nothing is done if the sequence names differ; the strand of the
    #       instance is kept
    #
    def merge(self, o):
        if self.seqname != o.seqname:
            return
        if self.isOnDirectStrand():
            self.start = min(self.getMin(), o.getMin())
            self.end = max(self.getMax(), o.getMax())
        else:
            self.start = max(self.getMax(), o.getMax())
            self.end = min(self.getMin(), o.getMin())

    ## Return True if the instance overlaps with another Range instance, False otherwise
    #
    # @param o a Range instance
    # @note strands are not considered, only min and max coordinates
    #
    def isOverlapping(self, o):
        if o.seqname != self.seqname:
            return False
        smin = self.getMin()
        smax = self.getMax()
        omin = o.getMin()
        omax = o.getMax()
        # 'o' spans the whole instance
        if omin <= smin and omax >= smax:
            return True
        # one boundary of 'o' falls within the instance
        if omin >= smin and omin <= smax or omax >= smin and omax <= smax:
            return True
        return False


    ## Return the length of the overlap between the instance and another Range, 0 if no overlap
    #
    # @param o a Range instance
    #
    def getOverlapLength( self, o ):
        if self.isOverlapping( o ):
            if self.isIncludedIn( o ):
                return self.getLength()
            elif o.isIncludedIn( self ):
                return o.getLength()
            elif o.getMin() <= self.getMax() and o.getMin() >= self.getMin():
                return self.getMax() - o.getMin() + 1
            elif o.getMax() <= self.getMax() and o.getMax() >= self.getMin():
                return o.getMax() - self.getMin() + 1
        return 0


    ## Return True if the instance is included within another Range, False otherwise
    #
    # @param o a Range instance
    #
    # @note the min (respectively max) coordinates can be equal
    #
    def isIncludedIn( self, o ):
        if o.seqname != self.seqname:
            return False
        if self.getMin() >= o.getMin() and self.getMax() <= o.getMax():
            return True
        else:
            return False


    ## Return the distance between the instance and another Range instance
    #
    # @param o a Range instance
    # @return 0 if both overlap, -1 if they are on different strands, otherwise
    #         the difference between the facing start and end coordinates
    #
    def getDistance(self, o):
        if self.isOnDirectStrand() == o.isOnDirectStrand():
            if self.isOverlapping(o):
                return 0
            elif self.isOnDirectStrand():
                if self.start > o.start:
                    return self.start - o.end
                else:
                    return o.start - self.end
            else:
                if self.start > o.start:
                    return self.end - o.start
                else:
                    return o.end - self.start
        return -1

    ## Remove in the instance the region overlapping with another Range instance
    #
    # @param o a Range instance
    # @return a new Range: the second remaining piece when 'o' splits the
    #         instance in two, an unset Range (start = end = -1) otherwise
    # @note the instance is modified in place; it is set to (0, 0) when it is
    #       entirely covered by 'o'
    #
    def diff(self, o):
        new_range = Range(self.seqname)
        if not self.isOverlapping(o) or self.seqname != o.seqname:
            return new_range

        istart = min(self.start, self.end)
        iend = max(self.start, self.end)
        jstart = min(o.start, o.end)
        jend = max(o.start, o.end)
        if istart < jstart:
            if iend <= jend:
                # 'o' covers the right part of the instance
                if self.isOnDirectStrand():
                    self.start = istart
                    self.end = jstart - 1
                else:
                    self.start = jstart - 1
                    self.end = istart
            else:
                # 'o' lies strictly inside: two pieces remain
                if self.isOnDirectStrand():
                    self.start = istart
                    self.end = jstart - 1
                    new_range.start = jend + 1
                    new_range.end = iend
                else:
                    self.start = jstart - 1
                    self.end = istart
                    new_range.start = iend
                    new_range.end = jend + 1
        else: #istart>=jstart
            if iend <= jend:
                # the instance is entirely covered
                self.start = 0
                self.end = 0
            else:
                # 'o' covers the left part of the instance
                if self.isOnDirectStrand():
                    self.start = jend + 1
                    self.end = iend
                else:
                    self.start = iend
                    self.end = jend + 1
        return new_range

    ## Find the bin that contains the instance and compute its index
    #
    # @note Required for coordinate indexing via a hierarchical bin system
    # @note relies on the module-level functions getBin() and getIdx()
    #
    def findIdx(self):
        min_lvl = 3
        max_lvl = 6
        for bin_lvl in range(min_lvl, max_lvl):
            if getBin(self.start, bin_lvl) == getBin(self.end, bin_lvl):
                return getIdx(self.start, bin_lvl)
        return getIdx(self.start, max_lvl)

    ## Get a bin for fast database access
    #
    # @return bin number (float): the bin size (a power of 10 between 10^3 and
    #         10^8) plus the rank of the bin divided by 1e10
    #
    def getBin(self):
        for i in range(3, 8):
            bin_lvl = pow(10, i)
            # '//' keeps the integer division whatever the Python version
            if self.start // bin_lvl == self.end // bin_lvl:
                return float(bin_lvl + ((self.start // bin_lvl) / 1e10))
        bin_lvl = pow(10, 8)
        return float(bin_lvl + ((self.start // bin_lvl) / 1e10))


# Functions

# Get the bin number of a coordinate according to the bin level. Required for coordinate indexing with hierarchical bin system
#
def getBin(val, bin_lvl):
    bin_size = pow(10, bin_lvl)
    # int() replaces the Python-2-only long(); '//' keeps the integer division
    return int(val // bin_size)

# Get an index from a coordinate according to the bin level.
# Required for coordinate indexing with hierarchical bin system
#
def getIdx(val, bin_lvl):
    """Return the index of coordinate 'val' at bin level 'bin_lvl'.

    The index is (bin_lvl - 2) * 10^6, plus the bin number of 'val' when
    'bin_lvl' is lower than the highest level (6).
    """
    min_lvl = 3
    max_lvl = 6
    # int() replaces the Python-2-only long()
    if bin_lvl >= max_lvl:
        return int((bin_lvl-min_lvl+1)*pow(10,max_lvl))
    return int(((bin_lvl-min_lvl+1)*pow(10,max_lvl))+getBin(val,bin_lvl))
# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/Set.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/Set.py Wed Jul 20 08:18:51 2016 -0400
# @@ -0,0 +1,132 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and,  more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


from commons.core.coord.Map import Map


## Record a named region on a given sequence with an identifier
#
class Set( Map ):

    # one-element tuple (the original parenthesised string declared the same single slot)
    __slots__ = ("id",)

    ## Constructor
    #
    # @param id identifier
    # @param name the name of the region
    # @param seqname the name of the sequence
    # @param start the start coordinate
    # @param end the end coordinate
    #
    def __init__(self, id=-1, name="", seqname="", start=-1, end=-1):
        Map.__init__( self, name, seqname, start, end )
        self.id = id

    ## Equal operator
    #
    # @param o a Set instance
    # @return False if the types or the identifiers differ, otherwise the
    #         result of the comparison as Map instances
    #
    def __eq__(self, o):
        if type(o) is not type(self) or self.id != o.id:
            return False
        else:
            return Map.__eq__(self, o)

    ## Not equal operator
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## Return the identifier
    #
    def getId(self):
        return self.id

    ## Reset
    #
    # @note the identifier, start and end are set to -1, the names to ""
    #
    def reset(self):
        self.setFromTuple([-1, "", "", -1, -1 ])

    ## Set attributes from tuple
    #
    # @param tuple: a tuple with (id, name, seqname, start, end)
    #
    def setFromTuple(self, tuple):
        self.id = int(tuple[0])
        Map.setFromTuple(self, tuple[1:])

    ## Return the attributes as a formatted string (identifier, then the Map fields, separated by tabulations)
    #
    def toString(self):
        string = "%i" % (self.id)
        string += "\t%s" % (Map.toString(self))
        return string

    ## Merge the instance with another Set instance
    #
    # @param o a Set instance
    # @note nothing is done if the sequence names differ; otherwise the lowest
    #       identifier is kept
    #
    def merge(self, o):
        if self.seqname == o.seqname:
            Map.merge(self, o)
            self.id = min(self.id, o.id)

    ## Return a Map instance with the attributes
    #
    def getMap(self):
        return Map(self.name, self.seqname, self.start, self.end)

    ## Remove in the instance the region overlapping with another Set instance
    #
    # @param o a Set instance
    # @return a new Set: filled with the identifier and names of the instance
    #         and the coordinates returned by Map.diff() when these are not
    #         empty, a default Set otherwise
    #
    def diff(self, o):
        iMap = Map.diff(self, o.getMap())
        new = Set()
        if not iMap.isEmpty():
            new.id = self.id
            new.name = self.name
            new.seqname = self.seqname
            new.start = iMap.start
            new.end = iMap.end
        return new

    ## Return a Map instance with the identifier in the name
    #
    # @note the name is formatted as '<name>::<id>'
    #
    def set2map(self):
        return Map(self.name+"::"+str(self.id),self.seqname,self.start,self.end)


    ## Return a Map instance built from the name, seqname, start and end via Map.setFromTuple()
    #
    def getMapInstance( self ):
        iMap = Map()
        lAttributes = [ self.name, self.seqname, self.start, self.end ]
        iMap.setFromTuple( lAttributes )
        return iMap
# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/SetUtils.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/SetUtils.py Wed Jul 20 08:18:51 2016 -0400
# @@ -0,0 +1,553 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge.
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and,  more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


from commons.core.coord.Set import Set

## Static methods for the manipulation of Set instances
#
class SetUtils( object ):

    ## Change the identifier of each Set instance in the given list
    #
    # @param lSets list of Set instances
    # @param newId new identifier
    #
    def changeIdInList(lSets, newId):
        for iSet in lSets:
            iSet.id = newId

    changeIdInList = staticmethod( changeIdInList )

    ## Return the length of the overlap between two lists of Set instances
    #
    # @param lSets1 list of Set instances
    # @param lSets2 list of Set instances
    # @return length of overlap
    # @warning sequence names are supposed to be identical
    #
    def getOverlapLengthBetweenLists(lSets1, lSets2):
        lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1)
        lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2)
        osize = 0
        j = 0
        for iSet1 in lSet1Sorted:
            # skip the Sets of the second list lying entirely before the current one
            while j != len(lSet2Sorted) and iSet1.getMin() > lSet2Sorted[j].getMax()\
                  and not(iSet1.isOverlapping(lSet2Sorted[j])):
                j += 1
            # then cumulate the overlap with the consecutive overlapping Sets
            jj = j
            while jj != len(lSet2Sorted) and iSet1.isOverlapping(lSet2Sorted[jj]):
                osize += iSet1.getOverlapLength(lSet2Sorted[jj])
                jj += 1
        return osize

    getOverlapLengthBetweenLists = staticmethod( getOverlapLengthBetweenLists )

    ## Return True if the two lists of Set instances overlap, False otherwise
    #
    # @param lSets1 list of Set instances
    # @param lSets2 list of Set instances
    #
    def areSetsOverlappingBetweenLists( lSets1, lSets2 ):
        lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1)
        lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2)
        j = 0
        for iSet1 in lSet1Sorted:
            # skip the Sets of the second list lying entirely before the current one
            while j != len(lSet2Sorted) and iSet1.getMin() > lSet2Sorted[j].getMax()\
                  and not(iSet1.isOverlapping(lSet2Sorted[j])):
                j += 1
            if j != len(lSet2Sorted) and iSet1.isOverlapping(lSet2Sorted[j]):
                return True
        return False

    areSetsOverlappingBetweenLists = staticmethod( areSetsOverlappingBetweenLists )

    ## Merge all overlapping Set instances between two lists of Set and give the next identifier
    #
    # @param lSets1 list of Set instances
    # @param lSets2 list of Set instances
    # @param max_id start id value for inserting new Set; when 0, the highest
    #        identifier of lSets1 plus 1 is used (1 if lSets1 is empty)
    # @return a new list of the merged Set instances and the next identifier
    # @note in the groups returned by getListOfIdListOfOverlappingSets(),
    #       identifiers from lSets2 are negated; a merged group takes the lowest
    #       identifier it holds from lSets1
    #
    def getListOfMergedSetsAndNextId(lSets1, lSets2, max_id=0):
        lSets_merged = []
        list2merge = SetUtils.getListOfIdListOfOverlappingSets ( lSets1,lSets2 )
        idlist1 = SetUtils.getDictOfListsWithIdAsKey(lSets1)
        idlist2 = SetUtils.getDictOfListsWithIdAsKey(lSets2)
        if max_id == 0:
            # an empty lSets1 used to raise ValueError on max() of an empty sequence
            if idlist1:
                max_id = max(idlist1.keys()) + 1
            else:
                max_id = 1
        for lIds in list2merge:
            if lIds == []:
                continue
            lSetsToMerge = []
            min_id = max(lIds)
            for signedId in lIds:
                if signedId > 0:
                    # identifier from the first list
                    if min_id > signedId:
                        min_id = signedId
                    lSetsToMerge.extend(idlist1[signedId])
                    del idlist1[signedId]
                else:
                    # negated identifier from the second list
                    lSetsToMerge.extend(idlist2[signedId*-1])
                    del idlist2[signedId*-1]
            lSetsToMerge = SetUtils.mergeSetsInList(lSetsToMerge)
            SetUtils.changeIdInList(lSetsToMerge, min_id)
            lSets_merged.extend(lSetsToMerge)
        # Sets of the first list without overlap keep their identifier
        for alist in idlist1.values():
            lSets_merged.extend(alist)
        # Sets of the second list without overlap get new identifiers
        for alist in idlist2.values():
            SetUtils.changeIdInList(alist,max_id)
            lSets_merged.extend(alist)
            max_id+=1
        return lSets_merged, max_id

    getListOfMergedSetsAndNextId = staticmethod ( getListOfMergedSetsAndNextId )

#    ## Concatenate two Set instance lists and give the next identifier
#    #
#    # @param lSets1 list of Set instances
#    # @param lSets2 list of Set instances
#    # @param maxId start id value for inserting new Set
#    # @return a new list of Set
instances and the next identifier +# # +# @staticmethod +# def getSetsListOfTwoConcatenatedSetsListAndNextId(lSets1, lSets2, maxId = 0): +# lOutSets = lSets1 +# dId2SetsList2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) +# if maxId == 0: +# dId2SetsList1 = SetUtils.getDictOfListsWithIdAsKey(lSets1) +# maxId = max(dId2SetsList1.keys()) +# for lSets in dId2SetsList2.values(): +# SetUtils.changeIdInList(lSets, maxId) +# lOutSets.extend(lSets) +# maxId += 1 +# return lOutSets, maxId + + ## Return the sum of the length of each Set instance in the given list + # + # @param lSets: list of Set instances + # + def getCumulLength(lSets): + length = 0 + for i in lSets: + length += i.getLength() + return length + + getCumulLength = staticmethod( getCumulLength ) + + ## Return a tuple with min and max coordinates of Set instances in the given list + # + # @param lSets list of Set instances + # + def getListBoundaries(lSets): + qmin = -1 + qmax = -1 + for iSet in lSets: + if qmin == -1: + qmin = iSet.start + qmin = min(qmin, iSet.getMin()) + qmax = max(qmax, iSet.getMax()) + return (qmin, qmax) + + getListBoundaries = staticmethod( getListBoundaries ) + + ## Show Set instances contained in the given list + # + # @param lSets list of Set instances + # + def showList(lSets): + for iSet in lSets: + iSet.show() + + showList = staticmethod( showList ) + + ## Write Set instances contained in the given list + # + # @param lSets list of Set instances + # @param fileName a file name + # @param mode the open mode of the file '"w"' or '"a"' + # + def writeListInFile(lSets, fileName, mode="w"): + fileHandler = open(fileName, mode) + for iSet in lSets: + iSet.write(fileHandler) + fileHandler.close() + + writeListInFile = staticmethod( writeListInFile ) + + ## Split a Set list in several Set lists according to the identifier + # + # @param lSets list of Set instances + # @return a dictionary which keys are identifiers and values Set lists + # + def getDictOfListsWithIdAsKey(lSets): + 
dId2SetList = {} + for iSet in lSets: + if dId2SetList.has_key(iSet.id): + dId2SetList[iSet.id].append(iSet) + else: + dId2SetList[iSet.id] = [iSet] + return dId2SetList + + getDictOfListsWithIdAsKey = staticmethod( getDictOfListsWithIdAsKey ) + + + ## Split a Set list in several Set lists according to the identifier + # + # @param lSets list of Set instances + # @return a dictionary which keys are identifiers and values Set lists + # + def getDictOfListsWithIdAsKeyFromFile( setFile ): + dId2SetList = {} + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromTuple( line[:-1].split("\t") ) + if not dId2SetList.has_key( iSet.id ): + dId2SetList[ iSet.id ] = [] + dId2SetList[ iSet.id ].append( iSet ) + setFileHandler.close() + return dId2SetList + + getDictOfListsWithIdAsKeyFromFile = staticmethod( getDictOfListsWithIdAsKeyFromFile ) + + + ## Return a Map list from the given Set List + # + # @param lSets list of Set instances + # + def getMapListFromSetList(lSets): + lMaps = [] + for iSet in lSets: + lMaps.append(iSet.set2map()) + return lMaps + + getMapListFromSetList = staticmethod( getMapListFromSetList ) + + ## Construct a Set list from a Map list + # + # @param lMaps list of Map instances + # + def getSetListFromMapList(lMaps): + lSets = [] + c = 0 + for iMap in lMaps: + c += 1 + lSets.append( Set(c, iMap.name, iMap.seqname, iMap.start, iMap.end) ) + return lSets + + getSetListFromMapList = staticmethod( getSetListFromMapList ) + + ## Merge all overlapping Set instances in a list without considering the identifiers. + # Start by sorting Set instances by their increasing Min coordinate. 
+ # + # @return: a new list of the merged Set instances + # + def mergeSetsInList(lSets): + l=[] + if len(lSets)==0: + return l + + lSortedSets = SetUtils.getSetListSortedByIncreasingMinThenInvLength( lSets ) + + prev_count = 0 + for iSet in lSortedSets[0:]: + if prev_count != len(lSortedSets): + for i in lSortedSets[ prev_count + 1: ]: + if iSet.isOverlapping( i ): + iSet.merge( i ) + IsAlreadyInList = False + for newSet in l: + if newSet.isOverlapping( iSet ): + IsAlreadyInList = True + newSet.merge( iSet ) + l [ l.index( newSet ) ] = newSet + if not IsAlreadyInList: + l.append( iSet ) + prev_count += 1 + return l + + mergeSetsInList = staticmethod( mergeSetsInList ) + + ## Unjoin a Set list according to another + # + # @param lToKeep: a list of Set instances to keep + # @param lToUnjoin: a list of Set instances to unjoin + # @return: lToUnjoin split in several list + # + def getSetListUnjoined(lToKeep, lToUnjoin): + lSortedToKeep = SetUtils.getSetListSortedByIncreasingMinThenMax( lToKeep ) + lSortedToUnjoin = SetUtils.getSetListSortedByIncreasingMinThenMax( lToUnjoin ) + if lSortedToUnjoin == []: + return [] + if lSortedToKeep == []: + return [ lSortedToUnjoin ] + + i=0 + resultListSet=[] + while i lSortedToUnjoin[j1].getMax(): + j1+=1 + if j1==len(lSortedToUnjoin): + break + if j1!=0: + resultListSet.append(lSortedToUnjoin[:j1]) + del lSortedToUnjoin[:j1] + j1=0 + if i+1==len(lSortedToKeep): + break + j2=j1 + if j2 lSortedToUnjoin[j2].getMax(): + while j2 lSortedToUnjoin[j2].getMax(): + j2+=1 + resultListSet.append(lSortedToUnjoin[j1:j2]) + del lSortedToUnjoin[j1:j2] + i+=1 + + if resultListSet!=[] or i == 0: + resultListSet.append(lSortedToUnjoin) + return resultListSet + + getSetListUnjoined = staticmethod(getSetListUnjoined) + + ## Return new list of Set instances with no duplicate + # + # @param lSets list of Set instances + # + def getSetListWithoutDuplicates( lSets ): + if len(lSets) < 2: + return lSets + lSortedSet = 
SetUtils.getSetListSortedByIncreasingMinThenMax( lSets ) + lUniqSet = [ lSortedSet[0] ] + for iSet in lSortedSet[1:]: + if iSet != lUniqSet[-1]: + lUniqSet.append( iSet ) + return lUniqSet + + getSetListWithoutDuplicates = staticmethod( getSetListWithoutDuplicates ) + + ## Return a list of Set instances sorted in increasing order according to the Min, then the Max, and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedByIncreasingMinThenMax( lSets ): + return sorted( lSets, key=lambda iSet: ( iSet.getMin(), iSet.getMax() ) ) + + getSetListSortedByIncreasingMinThenMax = staticmethod( getSetListSortedByIncreasingMinThenMax ) + + ## Return a list of Set instances sorted in increasing order according to the min, then the inverse of the length, and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedByIncreasingMinThenInvLength( lSets ): + return sorted( lSets, key=lambda iSet: ( iSet.getMin(), 1 / float(iSet.getLength()) ) ) + + getSetListSortedByIncreasingMinThenInvLength = staticmethod( getSetListSortedByIncreasingMinThenInvLength ) + + ## Return a list of Set instances sorted in increasing order according to the SeqName, then the Name, then the Min, then the Max and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedBySeqThenRegionThenMinThenMax(lSets): + return sorted(lSets, key=lambda iSet: (iSet.getSeqname(), iSet.getName(), iSet.getMin(), iSet.getMax())) + + getSetListSortedBySeqThenRegionThenMinThenMax = staticmethod(getSetListSortedBySeqThenRegionThenMinThenMax) + + ## Return a list of identifier lists of overlapping Sets from the subject list, according to the reference list + # + # @param lRef list of Set instances + # @param lSubject list of Set instances + # + def getListOfIdListOfOverlappingSets(lRef,lSubject): + lSortedRef = SetUtils.getSetListSortedByIncreasingMinThenMax( lRef ) + lSortedSubject = 
SetUtils.getSetListSortedByIncreasingMinThenMax( lSubject ) + + lOverlappingSet = [] + lOverlappingSetCounter = 0 + + id2LOverlappingSet_pos = {} + + i = 0 + j = 0 + while i!= len(lSortedRef): + while j!= len(lSortedSubject) and lSortedRef[i].getMin()>lSortedSubject[j].getMax()\ + and not(lSortedRef[i].isOverlapping(lSortedSubject[j])\ + and lSortedRef[i].isOnDirectStrand()==lSortedSubject[j].isOnDirectStrand()): + j+=1 + jj=j + while jj!= len(lSortedSubject) and lSortedRef[i].isOverlapping(lSortedSubject[jj])\ + and lSortedRef[i].isOnDirectStrand()==lSortedSubject[jj].isOnDirectStrand(): + id1=lSortedRef[i].id + id2=lSortedSubject[jj].id*-1 + if id2LOverlappingSet_pos.has_key(id1) \ + and not id2LOverlappingSet_pos.has_key(id2): + lOverlappingSet[id2LOverlappingSet_pos[id1]].append(id2) + id2LOverlappingSet_pos[id2]=id2LOverlappingSet_pos[id1] + if id2LOverlappingSet_pos.has_key(id2) \ + and not id2LOverlappingSet_pos.has_key(id1): + lOverlappingSet[id2LOverlappingSet_pos[id2]].append(id1) + id2LOverlappingSet_pos[id1]=id2LOverlappingSet_pos[id2] + if not id2LOverlappingSet_pos.has_key(id2) \ + and not id2LOverlappingSet_pos.has_key(id1): + lOverlappingSet.append([id1,id2]) + id2LOverlappingSet_pos[id1]=lOverlappingSetCounter + id2LOverlappingSet_pos[id2]=lOverlappingSetCounter + lOverlappingSetCounter+=1 + jj+=1 + i+=1 + + return lOverlappingSet + + getListOfIdListOfOverlappingSets = staticmethod (getListOfIdListOfOverlappingSets) + + ## Return a list of sets without overlapping between two lists of sets + # + # @param lSet1 and lSet2 + # + def getListOfSetWithoutOverlappingBetweenTwoListOfSet(lSet1, lSet2): + for i in lSet1: + for idx,j in enumerate(lSet2): + n=j.diff(i) + if not n.isEmpty() and n.getLength()>=20: + lSet2.append(n) + lSet2WithoutOverlaps=[] + for i in lSet2: + if not i.isEmpty() and i.getLength()>=20: + lSet2WithoutOverlaps.append(i) + return lSet2WithoutOverlaps + + getListOfSetWithoutOverlappingBetweenTwoListOfSet = staticmethod 
(getListOfSetWithoutOverlappingBetweenTwoListOfSet) + + ## Return a Set list from a Set file + # + # @param setFile string name of a Set file + # @return a list of Set instances + # + def getSetListFromFile( setFile ): + lSets = [] + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromString( line ) + lSets.append( iSet ) + setFileHandler.close() + return lSets + + getSetListFromFile = staticmethod( getSetListFromFile ) + + + def convertSetFileIntoMapFile( setFile, mapFile ): + setFileHandler = open( setFile, "r" ) + mapFileHandler = open( mapFile, "w" ) + iSet = Set() + while True: + line = setFileHandler.readline() + if line == "": + break + iSet.setFromString( line ) + iMap = iSet.getMapInstance() + iMap.write( mapFileHandler ) + setFileHandler.close() + mapFileHandler.close() + + convertSetFileIntoMapFile = staticmethod( convertSetFileIntoMapFile ) + + + def getDictOfListsWithSeqnameAsKey( lSets ): + dSeqnamesToSetList = {} + for iSet in lSets: + if not dSeqnamesToSetList.has_key( iSet.seqname ): + dSeqnamesToSetList[ iSet.seqname ] = [] + dSeqnamesToSetList[ iSet.seqname ].append( iSet ) + return dSeqnamesToSetList + + getDictOfListsWithSeqnameAsKey = staticmethod( getDictOfListsWithSeqnameAsKey ) + + + def filterOnLength( lSets, minLength=0, maxLength=10000000000 ): + if minLength == 0 and maxLength == 0: + return lSets + lFiltered = [] + for iSet in lSets: + if minLength <= iSet.getLength() <= maxLength: + lFiltered.append( iSet ) + return lFiltered + + filterOnLength = staticmethod( filterOnLength ) + + + def getListOfNames( setFile ): + lNames = [] + setFileHandler = open( setFile, "r" ) + iSet = Set() + while True: + line = setFileHandler.readline() + if line == "": + break + iSet.setFromTuple( line[:-1].split("\t") ) + if iSet.name not in lNames: + lNames.append( iSet.name ) + setFileHandler.close() + return lNames + + getListOfNames = staticmethod( getListOfNames ) 
+ + + def getDictOfDictsWithNamesThenIdAsKeyFromFile( setFile ): + dNames2DictsId = {} + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromTuple( line[:-1].split("\t") ) + if not dNames2DictsId.has_key( iSet.name ): + dNames2DictsId[ iSet.name ] = { iSet.id: [ iSet ] } + else: + if not dNames2DictsId[ iSet.name ].has_key( iSet.id ): + dNames2DictsId[ iSet.name ][ iSet.id ] = [ iSet ] + else: + dNames2DictsId[ iSet.name ][ iSet.id ].append( iSet ) + setFileHandler.close() + return dNames2DictsId + + getDictOfDictsWithNamesThenIdAsKeyFromFile = staticmethod( getDictOfDictsWithNamesThenIdAsKeyFromFile ) diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/SlidingWindow.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/SlidingWindow.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,106 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +class SlidingWindow(object): + + def __init__( self, length = 1, overlap = 1 ): + self._length = length + self._overlap = overlap + self._start = 1 + self._end = length + self._step = length - overlap + + def slideWindowOnce(self): + self._start = self._start + self._step + self._end = self._end + self._step + + def getStart(self): + return self._start + + def getEnd(self): + return self._end + + def setStart(self, start): + self._start = start + + def setEnd(self, end): + self._end = end + + def getLength(self): + return self._length + + def getOverlap(self): + return self._overlap + + def getMiddle(self): + return self._start + ((self._end - self._start - 1) / 2) + + def setLength(self, length): + self._length = length + + def setOverlap(self, overlap): + self._overlap = overlap + + def getSlidingMsg(self): + return "Window is sliding : %s %s" % (self._start, self._end) + +class SlidingWindowToCountMatchingBases(SlidingWindow): + + def getSetLengthOnWindow( self, iSet ): + if self._isSetIncludedInTheWindow(iSet): + return iSet.getLength() + if self._isWindowIncludedInTheSet(iSet): + return self._length + elif 
self._isSetOverlapTheRightSideOfTheWindow(iSet): + return self._end - iSet.getMin()+1 + elif self._isSetOverlapTheLeftSideOfTheWindow(iSet): + return iSet.getMax() - self._start+1 + + def getCoordSetOnWindow( self, iSet ): + if self._isSetIncludedInTheWindow(iSet): + return iSet.getStart(), iSet.getEnd() + if self._isWindowIncludedInTheSet(iSet): + return self.getStart(), self.getEnd() + elif self._isSetOverlapTheRightSideOfTheWindow(iSet): + return iSet.getStart(), self.getEnd() + elif self._isSetOverlapTheLeftSideOfTheWindow(iSet): + return self.getStart(), iSet.getEnd() + + def _isSetIncludedInTheWindow(self, feature): + return feature.getMin() >= self._start and feature.getMax() <= self._end + + def _isWindowIncludedInTheSet(self, feature): + return self._start >= feature.getMin() and self._end <= feature.getMax() + + def _isSetOverlapTheRightSideOfTheWindow(self, feature): + return feature.getMin() <= self._end and feature.getMin() >= self._start + + def _isSetOverlapTheLeftSideOfTheWindow(self, feature): + return feature.getMax() <= self._end and feature.getMax() >= self._start diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/SplitOnLength.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/SplitOnLength.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,73 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +from commons.core.LoggerFactory import LoggerFactory + +LOG_DEPTH = "repet.commons.core.coord" + +## Splits a list of objects implementing getLength() based on a list of length thresholds +# +class SplitOnLength(object): + + def __init__(self, lObjects, lThresholds, verbosity = 0): + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), verbosity = verbosity) + self._lObjects = lObjects + self._lThresholds = lThresholds + + ## Splits the list of objects over the list of thresholds. 
+ # + # @return a list of lists (groups) of objects + # + def split(self): + lSplit = [self._lObjects] + doObjectsImplementGetLength = False not in set([hasattr(o, "getLength") for o in self._lObjects]) + + if not self._lObjects: + self._log.warning("Empty input objects list, no split.") + elif not doObjectsImplementGetLength: + self._log.warning("At least one object in the list does not implement getLength(), no split.") + elif not self._lThresholds: + self._log.warning("Empty input thresholds list, no split.") + elif not self._lThresholds == sorted(self._lThresholds): + self._log.warning("Input thresholds list isn't sorted, no split. (%s)" % self._lThresholds) + else: + lSplit = [[] for i in xrange(len(self._lThresholds) + 1)] + + for obj in self._lObjects: + if obj.getLength() <= self._lThresholds[0]: + lSplit[0].append(obj) + elif self._lThresholds[-1] < obj.getLength(): + lSplit[-1].append(obj) + else: + for i in range(0, len(self._lThresholds) - 1): + if self._lThresholds[i] < obj.getLength() <= self._lThresholds[i + 1]: + lSplit[i + 1].append(obj) + break + return lSplit diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/__init__.py diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/coord/align2set.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/coord/align2set.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +import sys +import getopt +from commons.core.coord.Align import Align + +def help(): + print + print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ) + print "options:" + print " -h: this help" + print " -i: input file name (format='align')" + print " -o: output file name (format='set', default=inFileName+'.set')" + print " -v: verbosity level (default=0/1)" + print + + +def align2set( inFileName, outFileName ): + alignFileHandler = open( inFileName, "r" ) + setFileHandler = open( outFileName, "w" ) + 
iAlign = Align() + countAlign = 0 + while True: + line = alignFileHandler.readline() + if line == "": + break + countAlign += 1 + iAlign.setFromString( line, "\t" ) + setFileHandler.write( "%i\t%s\t%s\t%i\t%i\n" % ( countAlign, + iAlign.getSubjectName(), + iAlign.getQueryName(), + iAlign.getQueryStart(), + iAlign.getQueryEnd() ) ) + alignFileHandler.close() + setFileHandler.close() + + +def main(): + + inFileName = "" + outFileName = "" + verbose = 0 + + try: + opts, args = getopt.getopt( sys.argv[1:], "hi:o:v:" ) + except getopt.GetoptError, err: + print str(err) + help() + sys.exit(1) + for o,a in opts: + if o == "-h": + help() + sys.exit(0) + elif o == "-i": + inFileName = a + elif o == "-o": + outFileName = a + elif o == "-v": + verbose = int(a) + + if inFileName == "": + print "ERROR: missing input file name" + help() + sys.exit(1) + + if verbose > 0: + print "START %s" % ( sys.argv[0].split("/")[-1] ) + sys.stdout.flush() + + if outFileName == "": + outFileName = "%s.set" % ( inFileName ) + +#TODO: move 'align2set' into 'AlignUtils.convertAlignFileIntoPSetFile' with a test +# AlignUtils.convertAlignFileIntoPSetFile( inFileName, outFileName ) + + align2set( inFileName, outFileName ) + + if verbose > 0: + print "END %s" % ( sys.argv[0].split("/")[-1] ) + sys.stdout.flush() + + return 0 + + +if __name__ == "__main__": + main() diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/parsing/GffParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/parsing/GffParser.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,149 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +import sys +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.parsing.TranscriptListParser import TranscriptListParser + + +class GffParser(TranscriptListParser): + """A class that parses a GFF file and create a transcript list""" + + + def __init__(self, fileName, verbosity = 0): + super(GffParser, self).__init__(fileName, verbosity) + + + def __del__(self): + super(GffParser, self).__del__() + + + def getFileFormats(): + return ["gff", "gff2", "gff3"] + getFileFormats = staticmethod(getFileFormats) + + + def skipFirstLines(self): + pass + + + def getInfos(self): + self.chromosomes = set() + self.nbTranscripts = 0 + self.size = 0 + self.reset() + if self.verbosity >= 10: + print "Getting information on %s." 
% (self.fileName) + self.reset() + for line in self.handle: + line = line.strip() + if line == "" or line[0] == "#": + continue + parts = line.split("\t") + if len(parts) != 9: + raise Exception("Error! Line '%s' has %d tab-separated fields instead of 9!" % (line, len(parts))) + self.chromosomes.add(parts[0]) + if parts[8].find("Parent") == -1: + self.nbTranscripts += 1 + else: + self.size += max(int(parts[3]), int(parts[4])) - min(int(parts[3]), int(parts[4])) + 1 + if self.verbosity >= 10 and self.nbTranscripts % 100000 == 0: + sys.stdout.write(" %d transcripts read\r" % (self.nbTranscripts)) + sys.stdout.flush() + self.reset() + if self.verbosity >= 10: + print " %d transcripts read" % (self.nbTranscripts) + print "Done." + + + def parseLine(self, line): + if not line or line[0] == "#": + return None + m = re.search(r"^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-.])\s+(\S+)\s+(\S.*)$", line) + if m == None: + raise Exception("\nLine %d '%s' does not have a GFF format\n" % (self.currentLineNb, line)) + interval = Interval() + interval.setChromosome(m.group(1)) + interval.setName("unnamed transcript") + interval.setStart(min(int(m.group(4)), int(m.group(5)))) + interval.setEnd(max(int(m.group(4)), int(m.group(5)))) + if m.group(7) == ".": + interval.setDirection("+") + else: + interval.setDirection(m.group(7)) + interval.setTagValue("feature", m.group(3)) + if m.group(6).isdigit(): + interval.setTagValue("score", m.group(6)) + + remainings = m.group(9).split(";") + for remaining in remainings: + remaining = remaining.strip() + if remaining == "": + continue + posSpace = remaining.find(" ") + posEqual = remaining.find("=") + if posEqual != -1 and (posEqual < posSpace or posSpace == -1): + parts = remaining.split("=") + else: + parts = remaining.split() + field = parts[0].strip() + value = " ".join(parts[1:]).strip(" \"") + if field in ("Name", "name", "Sequence", "TE", "SAT"): + interval.setName(value) + else: + try: + intValue = int(value) + 
interval.setTagValue(field, intValue) + except ValueError: + interval.setTagValue(field, value) + + self.currentTranscriptAddress = self.previousTranscriptAddress + if "Parent" in interval.getTagNames(): + if self.currentTranscript == None: + raise Exception("GFF file does not start with a transcript! First line is '%s'." % (line.strip())) + if interval.getTagValue("Parent") != self.currentTranscript.getTagValue("ID"): + raise Exception("Exon '%s' is not right after its transcript in GFF file!" % (interval)) + self.currentTranscript.addExon(interval) + if interval.name == None: + interval.name = self.currentTranscript.name + return None + + if interval is not None and interval.name.startswith("unnamed"): + if "ID" in interval.getTagNames(): + interval.name = interval.getTagValue("ID") + else: + interval.name = "unnamed transcript %s" % (self.currentLineNb) + + transcript = self.currentTranscript + self.currentTranscript = Transcript() + self.currentTranscript.copy(interval) + self.previousTranscriptAddress = self.currentAddress + return transcript diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/parsing/GtfParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/parsing/GtfParser.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,167 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +import sys +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.parsing.TranscriptListParser import TranscriptListParser + + +class GtfParser(TranscriptListParser): + """A class that parses a GTF file and create a transcript list""" + + + def __init__(self, fileName, verbosity = 0, assemblyTools=False): + super(GtfParser, self).__init__(fileName, verbosity) + self._assemblyTools=assemblyTools + + + def __del__(self): + super(GtfParser, self).__del__() + + + def getFileFormats(): + return ["gtf", "gtf2"] + getFileFormats = staticmethod(getFileFormats) + + + def skipFirstLines(self): + pass + + + def parseLine(self, line): + if line[0] == "#": + return None + m = re.search(r"^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-.])\s+(\S+)\s+(\S.*)$", line) + if m == None: + raise Exception("\nLine %d '%s' does not have a GTF format\n" % (self.currentLineNb, line)) + if(self._assemblyTools==False): + interval = Interval() + interval.setChromosome(m.group(1)) + interval.setName("unnamed transcript") + interval.setStart(min(int(m.group(4)), 
int(m.group(5)))) + interval.setEnd(max(int(m.group(4)), int(m.group(5)))) + if m.group(7) == ".": + interval.setDirection("+") + else: + interval.setDirection(m.group(7)) + if m.group(6).isdigit(): + interval.setTagValue("score", m.group(6)) + type = m.group(3) + + if type not in ("transcript", "exon"): + return None + + remainings = m.group(9).split(";") + for remaining in remainings: + remaining = remaining.strip() + if remaining == "": + continue + parts = remaining.split(" ", 1) + field = parts[0].strip() + value = " ".join(parts[1:]).strip(" \"") + if field == "transcript_id": + interval.setTagValue("ID", value) + elif field == "gene_name": + interval.setName(value) + elif field == "transcript_name": + interval.setName(value) + elif field == "exon_number": + continue + else: + try: + intValue = int(value) + interval.setTagValue(field, intValue) + except ValueError: + interval.setTagValue(field, value) + + self.currentTranscriptAddress = self.previousTranscriptAddress + if self.currentTranscript == None or interval.getTagValue("ID") != self.currentTranscript.getTagValue("ID"): + transcript = self.currentTranscript + self.currentTranscript = Transcript() + self.currentTranscript.copy(interval) + self.currentTranscript.setTagValue("feature", "transcript") + self.previousTranscriptAddress = self.currentAddress + return transcript + if type == "exon": + self.currentTranscript.addExon(interval) + return None + else: + if m.group(7) != ".": + interval = Interval() + interval.setChromosome(m.group(1)) + interval.setName("unnamed transcript") + interval.setStart(min(int(m.group(4)), int(m.group(5)))) + interval.setEnd(max(int(m.group(4)), int(m.group(5)))) + if m.group(7) == ".": + interval.setDirection("+") + else: + interval.setDirection(m.group(7)) + if m.group(6).isdigit(): + interval.setTagValue("score", m.group(6)) + type = m.group(3) + + if type not in ("transcript", "exon"): + return None + + remainings = m.group(9).split(";") + for remaining in remainings: + 
remaining = remaining.strip() + if remaining == "": + continue + parts = remaining.split(" ", 1) + field = parts[0].strip() + value = " ".join(parts[1:]).strip(" \"") + if field == "transcript_id": + interval.setTagValue("ID", value) + elif field == "gene_name": + interval.setName(value) + elif field == "transcript_name": + interval.setName(value) + elif field == "exon_number": + continue + else: + try: + intValue = int(value) + interval.setTagValue(field, intValue) + except ValueError: + interval.setTagValue(field, value) + + self.currentTranscriptAddress = self.previousTranscriptAddress + if self.currentTranscript == None or interval.getTagValue("ID") != self.currentTranscript.getTagValue("ID"): + transcript = self.currentTranscript + self.currentTranscript = Transcript() + self.currentTranscript.copy(interval) + self.currentTranscript.setTagValue("feature", "transcript") + self.previousTranscriptAddress = self.currentAddress + return transcript + if type == "exon": + self.currentTranscript.addExon(interval) + return None diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/parsing/TranscriptListParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/parsing/TranscriptListParser.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,185 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import sys
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


class TranscriptListParser(object):
    """A (quite generic) class that reads a list of transcripts.

    This class calls ``self.skipFirstLines()`` and ``self.parseLine(line)``
    but does not define them; they have to be supplied by the concrete
    parser built on top of it.
    """

    def __init__(self, fileName, verbosity = 0):
        """Open 'fileName' and skip its first lines.

        Raises Exception if the file cannot be opened.
        """
        self.verbosity                 = verbosity
        self.fileName                  = fileName
        self.nbTranscripts             = None
        self.size                      = None
        self.chromosomes               = None
        self.currentTranscript         = None
        self.currentLineNb             = 0
        self.previousTranscriptAddress = None
        # Defined up front so that the address getters work before reset()
        self.currentTranscriptAddress  = None
        self.currentAddress            = None
        # Defined before open() so that close()/__del__ stay safe if open() fails
        self.handle                    = None
        try:
            self.handle = open(self.fileName)
        except IOError:
            raise Exception("Error! Transcript file '%s' does not exist! Exiting..." % (self.fileName))
        self.skipFirstLines()


    def __del__(self):
        self.close()


    @staticmethod
    def getFileFormats():
        """Placeholder: returns None here."""
        pass


    def close(self):
        """Close the underlying file (safe to call several times)."""
        # getattr(): __del__ may run on a partially initialised instance
        handle = getattr(self, "handle", None)
        # 'closed' is the file state flag; the former test used the bound
        # method 'close', which is always true, so the file was never closed
        if handle is not None and not handle.closed:
            handle.close()
        self.handle = None


    def reset(self):
        """Rewind the file, skip its first lines and clear the reading state."""
        self.handle.seek(0)
        self.skipFirstLines()
        self.currentTranscript        = None
        self.currentLineNb            = 0
        self.currentTranscriptAddress = self.handle.tell()
        self.currentAddress           = self.handle.tell()


    def gotoAddress(self, address):
        """Reset the parser, then move the file cursor to 'address'."""
        self.reset()
        self.handle.seek(address)
        self.currentTranscriptAddress = address
        self.currentAddress           = address


    def parse(self):
        """Parse every remaining line and return them as a TranscriptList.

        The value returned by parseLine() is added for each line read.
        """
        transcriptList = TranscriptList()
        progress       = Progress(self.getNbTranscripts(), "Reading %s" % (self.fileName), self.verbosity)
        for line in self.handle:
            self.currentLineNb += 1
            transcript = self.parseLine(line)
            transcriptList.addTranscript(transcript)
            progress.inc()
        progress.done()
        return transcriptList


    def getIterator(self):
        """Yield the transcripts one by one, starting from the top of the file."""
        self.reset()
        transcript = self.getNextTranscript()
        while transcript is not None:
            yield transcript
            transcript = self.getNextTranscript()


    def getCurrentAddress(self):
        return self.currentAddress


    def getCurrentTranscriptAddress(self):
        return self.currentTranscriptAddress


    def getNextTranscript(self):
        """Read lines until parseLine() returns a transcript.

        At the end of the file, the pending 'currentTranscript' (possibly
        None) is returned and cleared.
        """
        self.currentAddress = self.handle.tell()
        line = self.handle.readline()
        while line != "":
            line = line.strip()
            self.currentLineNb += 1
            transcript = self.parseLine(line)
            if transcript is not None:
                return transcript
            self.currentAddress = self.handle.tell()
            line = self.handle.readline()
        # end of file: flush the transcript still being built, if any
        transcript = self.currentTranscript
        self.currentTranscriptAddress = self.previousTranscriptAddress
        self.currentTranscript = None
        return transcript


    def getInfos(self):
        """Scan the whole file to count transcripts, sizes and chromosomes."""
        self.chromosomes   = set()
        self.nbTranscripts = 0
        self.size          = 0
        self.reset()
        progress = UnlimitedProgress(100000, "Getting information on %s." % (self.fileName), self.verbosity-9)
        # getIterator() resets the parser itself, so no transcript has to be
        # read beforehand
        for transcript in self.getIterator():
            self.chromosomes.add(transcript.getChromosome())
            self.nbTranscripts += 1
            self.size          += transcript.getSize()
            progress.inc()
        progress.done()
        self.reset()


    def getNbTranscripts(self):
        """Return the number of transcripts (computed on first use)."""
        if self.nbTranscripts is None:
            self.getInfos()
        return self.nbTranscripts


    def getNbItems(self):
        return self.getNbTranscripts()


    def getChromosomes(self):
        """Return the set of chromosomes (computed on first use)."""
        if self.chromosomes is None:
            self.getInfos()
        return self.chromosomes


    def getSize(self):
        """Return the sum of the transcript sizes (computed on first use)."""
        if self.size is None:
            self.getInfos()
        return self.size


    def getNbNucleotides(self):
        return self.getSize()


    def setDefaultTagValue(self, name, value):
        """Call setTag(name, value) on every transcript produced by the iterator."""
        for transcript in self.getIterator():
            transcript.setTag(name, value)

    def __eq__(self, o):
        if type(o) is type(self):
            return self.fileName == o.fileName and self.nbTranscripts == o.nbTranscripts and self.size == o.size and self.chromosomes == o.chromosomes
        return False

    def __ne__(self, o):
        return not self.__eq__(o)

# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/parsing/__init__.py
# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/seq/AlignedBioseqDB.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/seq/AlignedBioseqDB.py Wed Jul 20 08:18:51 2016 -0400
# @@ -0,0 +1,440 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 


import sys
from commons.core.seq.BioseqDB import BioseqDB
from commons.core.seq.Bioseq import Bioseq
from commons.core.coord.Align import Align
from commons.core.coord.Range import Range
from commons.core.stat.Stat import Stat
from math import log


## Multiple Sequence Alignment Representation
#
# The sequences are held in 'self.db' (filled by BioseqDB) and are expected
# to all have the same length, gaps being written '-'.
#
class AlignedBioseqDB( BioseqDB ):

    ## Constructor
    #
    # @param name string name of the fasta file to load (passed to BioseqDB)
    # @note an error message is printed for each sequence whose length differs from the first one
    #
    def __init__( self, name="" ):
        BioseqDB.__init__( self, name )
        seqLength = self.getLength()
        if self.getSize() > 1:
            for bs in self.db[1:]:
                if bs.getLength() != seqLength:
                    print("ERROR: aligned sequences have different length")


    ## Get length of the alignment
    #
    # @return length of the first sequence, 0 if there is no sequence
    # @warning name before migration was 'length'
    #
    def getLength( self ):
        if self.db:
            return self.db[0].getLength()
        return 0


    ## Get the true length of a given sequence (without gaps)
    #
    # @param header string header of the sequence to analyze
    # @return length integer
    # @warning name before migration was 'true_length'
    #
    def getSeqLengthWithoutGaps( self, header ):
        sequence = self.fetch( header ).sequence
        return len(sequence) - sequence.count( "-" )


    ## Clean the MSA
    #
    # For each pair of sequences sharing no site where both carry a residue,
    # the one with the fewest residues is removed; 'self.idx' is rebuilt afterwards.
    #
    def cleanMSA( self ):
        #TODO: Refactoring
        nbSeq = self.getSize()
        lSites = range( self.getLength() )   # built once, reused for every pair
        i2del = set()

        # for each sequence in the MSA
        for seqi in range( nbSeq ):
            if seqi in i2del:
                continue
            # define it as the reference
            ref = self.db[seqi].sequence
            refHeader = self.db[seqi].header
            # for each following sequence
            for seq_next in range( seqi + 1, nbSeq ):
                if seq_next in i2del:
                    continue
                seq = self.db[seq_next].sequence
                seqHeader = self.db[seq_next].header
                # is there at least one site without gap in both sequences?
                keep = any( seq[posx] != '-' and ref[posx] != '-' for posx in lSites )
                # if not, keep track of the shortest by recording it in "i2del"
                if not keep:
                    if self.getSeqLengthWithoutGaps( refHeader ) < self.getSeqLengthWithoutGaps( seqHeader ):
                        i2del.add( seqi )
                    else:
                        i2del.add( seq_next )

        # delete from the MSA each seq present in "i2del" (highest index first)
        for i in sorted( i2del, reverse=True ):
            del self.db[i]

        self.idx = {}
        for count, bs in enumerate( self.db ):
            self.idx[bs.header] = count


    ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
    #
    # @return: list of dico whose keys are symbols (upper case) and values are their occurrences
    #
    def getListOccPerSite( self ):
        lOccPerSite = []   # list of dictionaries, one per position on the sequence
        firstSeq = True

        # for each sequence in the bank
        for bs in self.db:
            if bs.sequence is None:
                break

            # if it is the first to be parsed, create a dico at each site
            if firstSeq:
                lOccPerSite = [ {} for _ in bs.sequence ]
                firstSeq = False

            # for each site, add its nucleotide
            for i, nuc in enumerate( bs.sequence.upper() ):
                dOcc = lOccPerSite[i]
                dOcc[nuc] = dOcc.get( nuc, 0 ) + 1

        return lOccPerSite


    #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus...
    ## Make a consensus from the MSA
    #
    # @param minNbNt: minimum nb of nucleotides to edit a consensus
    # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
    # @param verbose: level of information sent to stdout (default=0/1)
    # @param isHeaderSAtannot: if True, 'pile' and 'pyramid' are extracted from the first header ("...Gr<pyramid>Cl<pile> ...")
    # @return: consensus (Bioseq instance), or None if no consensus can be built
    # @warning calls sys.exit(1) with less than 2 sequences or if minPropNt >= 1.0
    #
    def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False):

        maxPropN = 0.40  # discard consensus if more than 40% of N's

        nbInSeq = self.getSize()
        if verbose > 0:
            print("nb of aligned sequences: %i" % ( nbInSeq )); sys.stdout.flush()
        if nbInSeq < 2:
            print("ERROR: can't make a consensus with less than 2 sequences")
            sys.exit(1)
        if minNbNt >= nbInSeq:
            minNbNt = nbInSeq - 1
            print("minNbNt=%i" % ( minNbNt ))
        if minPropNt >= 1.0:
            print("ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt ))
            sys.exit(1)

        lOccPerSite = self.getListOccPerSite()
        nbSites = len(lOccPerSite)
        if verbose > 0:
            print("nb of sites: %i" % ( nbSites )); sys.stdout.flush()

        lConsensus = []

        # for each site (i.e. each column of the MSA)
        nbRmvColumns = 0
        countSites = 0
        for dNt2Occ in lOccPerSite:
            countSites += 1
            if verbose > 1:
                print("site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
                                         nbSites ))
                sys.stdout.flush()
            occMaxNt = 0   # occurrences of the predominant nucleotide at this site
            lBestNt = []
            nbNt = 0   # total nb of A, T, G and C (no gap)

            # for each distinct symbol at this site (A, T, G, C, N, -,...)
            for j in dNt2Occ:
                if j == "-":
                    continue
                nbNt += dNt2Occ[j]
                if verbose > 1:
                    print("%s: %i" % ( j, dNt2Occ[j] ))
                if dNt2Occ[j] > occMaxNt:
                    occMaxNt = dNt2Occ[j]
                    lBestNt = [ j ]
                elif dNt2Occ[j] == occMaxNt:
                    lBestNt.append( j )

            if nbNt == 0:   # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
                nbRmvColumns += 1
                # a column made only of gaps has no best nucleotide: never let it
                # contribute to the consensus, whatever the value of minNbNt
                continue

            bestNt = lBestNt[0]

            # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
            if minPropNt > 0.0 and float(occMaxNt)/float(nbNt) < minPropNt:
                bestNt = "N"

            if int(nbNt) >= int(minNbNt):
                lConsensus.append( bestNt )
                if verbose > 1:
                    print("-> %s" % ( bestNt ))

        seqConsensus = "".join( lConsensus )

        if nbRmvColumns:
            if nbRmvColumns == 1:
                print("WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100))
            else:
                print("WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 ))
            sys.stdout.flush()
        if seqConsensus == "":
            print("WARNING: no consensus can be built (no sequence left)")
            return

        propN = seqConsensus.count("N") / float(len(seqConsensus))
        if propN >= maxPropN:
            print("WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 ))
            return
        elif propN >= maxPropN * 0.5:
            print("WARNING: %i%% of N's" % ( propN * 100 ))

        consensus = Bioseq()
        consensus.sequence = seqConsensus
        if isHeaderSAtannot:
            header = self.db[0].header
            pyramid = header.split("Gr")[1].split("Cl")[0]
            pile = header.split("Cl")[1].split(" ")[0]
            consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid)
        else:
            consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )

        if verbose > 0:
            statEntropy = self.getEntropy( verbose - 1 )
            print("entropy: %s" % ( statEntropy.stringQuantiles() ))
            sys.stdout.flush()

        return consensus


    ## Get the entropy of the whole multiple alignment (only for A, T, G and C)
    #
    # @param verbose level of verbosity
    #
    # @return statistics about the entropy of the MSA
    #
    def getEntropy( self, verbose=0 ):

        stats = Stat()

        # get the occurrences of symbols at each site
        lOccPerSite = self.getListOccPerSite()

        countSite = 0

        # for each site
        for dSymbol2Occ in lOccPerSite:
            countSite += 1

            # count the number of nucleotides (A, T, G and C, doesn't count gap '-')
            nbNt = 0
            dATGC2Occ = {}
            for base in ["A","T","G","C"]:
                dATGC2Occ[ base ] = 0.0
            for nt in dSymbol2Occ:
                if nt == "-":
                    continue
                # weight by the occurrences of the symbol actually seen at this
                # site ('nt'), not by those of the base it is converted into:
                # this keeps the sum of the four counts equal to nbNt
                occ = dSymbol2Occ[ nt ]
                nbNt += occ
                checkedNt = self.getATGCNFromIUPAC( nt )
                if checkedNt in dATGC2Occ:
                    dATGC2Occ[ checkedNt ] += occ
                else:   # for 'N'
                    for base in dATGC2Occ:
                        dATGC2Occ[ base ] += 0.25 * occ
            if verbose > 2:
                for base in dATGC2Occ:
                    print("%s: %i" % ( base, dATGC2Occ[ base ] ))

            # compute the entropy for the site
            entropySite = 0.0
            for nt in dATGC2Occ:
                entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt )
            if verbose > 1:
                print("site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite ))
            stats.add( entropySite )

        return stats


    ## Get A, T, G, C or N from an IUPAC letter
    #  IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
    #
    # @return A, T, G, C or N
    #
    def getATGCNFromIUPAC( self, nt ):
        iBs = Bioseq()
        return iBs.getATGCNFromIUPAC( nt )


    ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides
    #
    # @param nbOcc number of occurrences of the nucleotide
    # @param nbNt total number of nucleotides
    # @return -f*log2(f) with f = nbOcc/nbNt, 0.0 if nbOcc is null
    #
    def computeEntropy( self, nbOcc, nbNt ):
        if nbOcc == 0.0:
            return 0.0
        freq = nbOcc / float(nbNt)
        return - freq * log(freq) / log(2)


    ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise
    #
    # @param outFile string name of the output file (one tab-separated line per sequence, header first)
    #
    def saveAsBinaryMatrix( self, outFile ):
        # 'with' closes the file even if a write fails
        with open( outFile, "w" ) as outFileHandler:
            for bs in self.db:
                lFields = [ "%s" % ( bs.header ) ]
                for nt in bs.sequence:
                    if nt != "-":
                        lFields.append( "1" )
                    else:
                        lFields.append( "0" )
                outFileHandler.write( "%s\n" % ( "\t".join( lFields ) ) )


    ## Return a list of Align instances corresponding to the aligned regions (without gaps)
    #
    # @param query string header of the sequence considered as query
    # @param subject string header of the sequence considered as subject
    # @note coordinates are 1-based positions on the ungapped sequences; for each region, 'score'
    #       is the number of identical sites and 'identity' is 100 * matches / length on the query
    #
    def getAlignList( self, query, subject ):
        lAligns = []
        alignQ = self.fetch( query ).sequence
        alignS = self.fetch( subject ).sequence
        createNewAlign = True
        indexQ = 0
        indexS = 0
        for indexAlign, ntQ in enumerate( alignQ ):
            ntS = alignS[ indexAlign ]
            if ntQ != "-" and ntS != "-":
                indexQ += 1
                indexS += 1
                isMatch = int( ntQ == ntS )
                if createNewAlign:
                    iAlign = Align( Range( query, indexQ, indexQ ),
                                    Range( subject, indexS, indexS ),
                                    0,
                                    isMatch,
                                    isMatch )
                    lAligns.append( iAlign )
                    createNewAlign = False
                else:
                    lAligns[-1].range_query.end += 1
                    lAligns[-1].range_subject.end += 1
                    lAligns[-1].score += isMatch
                    lAligns[-1].identity += isMatch
            else:
                # a gap closes the current region: turn its match count into a percentage
                if not createNewAlign:
                    lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
                    createNewAlign = True
                if ntQ != "-":
                    indexQ += 1
                elif ntS != "-":
                    indexS += 1
        if not createNewAlign:
            lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
        return lAligns


    ## Remove the gaps of every sequence
    #
    def removeGaps(self):
        for iBs in self.db:
            iBs.removeSymbol( "-" )

    ## Compute mean per cent identity for MSA.
    # First sequence in MSA is considered as reference sequence.
    #
    # @return rounded mean of the identity percentages between the reference and each other sequence
    # @warning raises ZeroDivisionError if the MSA holds a single sequence
    #
    def computeMeanPcentIdentity(self):
        seqRef = self.db[0]
        sumPcentIdentity = 0

        for seq in self.db[1:]:
            sumPcentIdentity += self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq)

        nbSeq = len(self.db[1:])
        meanPcentIdentity = round (sumPcentIdentity/nbSeq)

        return meanPcentIdentity

    ## Percentage of sites where 'seq' carries the same residue as 'seqRef' (gaps of 'seqRef' never match),
    # relative to the full length of 'seqRef'
    #
    def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):
        sumIdentity = 0
        for indexOnSeqRef, nuclSeq in enumerate(seq.sequence):
            nuclRef = seqRef.sequence[indexOnSeqRef]
            if nuclRef != "-" and nuclRef == nuclSeq:
                sumIdentity += 1

        return float(sumIdentity) / float(seqRef.getLength()) * 100


# diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/seq/Bioseq.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/seq/Bioseq.py Wed Jul 20 08:18:51 2016 -0400
# @@ -0,0 +1,738 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability.
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import re +import sys +import random +import string +import cStringIO +from commons.core.coord.Map import Map +from commons.core.checker.RepetException import RepetException + +DNA_ALPHABET_WITH_N = set(['A', 'T', 'G', 'C', 'N']) +IUPAC = set(['A', 'T', 'G', 'C', 'U', 'R', 'Y', 'M', 'K', 'W', 'S', 'B', 'D', 'H', 'V', 'N']) + + +## Record a sequence with its header +# +class Bioseq(object): + + __slots__ = ("header", "sequence", '__dict__') + + ## constructor + # + # @param name the header of sequence + # @param seq sequence (DNA, RNA, protein) + # + def __init__(self, name = "", seq = ""): + self.header = name + self.sequence = seq + + + ## Equal operator + # + def __eq__(self, o): + if type(o) is type(self) and self.header == o.header and self.sequence == o.sequence: + return True + return False + + ## Not equal operator + # + def __ne__(self, o): + return not self.__eq__(o) + + ## overload __repr__ + # + def __repr__(self): + return "%s;%s" % (self.header, self.sequence) + + + ## set attribute header + # + # @param header a string + # + def setHeader(self, header): + self.header = header + + + ## get attribute header + # + # @return 
header + def getHeader(self): + return self.header + + + ## set attribute sequence + # + # @param sequence a string + # + def setSequence(self, sequence): + self.sequence = sequence + + + def getSequence(self): + return self.sequence + + ## reset + # + def reset(self): + self.setHeader("") + self.setSequence("") + + + ## Test if bioseq is empty + # + def isEmpty(self): + return self.header == "" and self.sequence == "" + + + ## Reverse the sequence + # + def reverse(self): + tmp = self.sequence + self.sequence = tmp[::-1] + + + ## Turn the sequence into its complement + # Force upper case letters + # @warning: old name in pyRepet.Bioseq realComplement + # + def complement(self): + complement = "" + self.upCase() + for i in xrange(0, len(self.sequence), 1): + if self.sequence[i] == "A": + complement += "T" + elif self.sequence[i] == "T": + complement += "A" + elif self.sequence[i] == "C": + complement += "G" + elif self.sequence[i] == "G": + complement += "C" + elif self.sequence[i] == "M": + complement += "K" + elif self.sequence[i] == "R": + complement += "Y" + elif self.sequence[i] == "W": + complement += "W" + elif self.sequence[i] == "S": + complement += "S" + elif self.sequence[i] == "Y": + complement += "R" + elif self.sequence[i] == "K": + complement += "M" + elif self.sequence[i] == "V": + complement += "B" + elif self.sequence[i] == "H": + complement += "D" + elif self.sequence[i] == "D": + complement += "H" + elif self.sequence[i] == "B": + complement += "V" + elif self.sequence[i] == "N": + complement += "N" + elif self.sequence[i] == "-": + complement += "-" + else: + print "WARNING: unknown symbol '%s', replacing it by N" % (self.sequence[i]) + complement += "N" + self.sequence = complement + + + ## Reverse and complement the sequence + # + # Force upper case letters + # @warning: old name in pyRepet.Bioseq : complement + # + def reverseComplement(self): + self.reverse() + self.complement() + + + ## Remove gap in the sequence + # + def cleanGap(self): 
+ self.sequence = self.sequence.replace("-", "") + + + ## Copy current Bioseq Instance + # + # @return: a Bioseq instance, a copy of current sequence. + # + def copyBioseqInstance(self): + seq = Bioseq() + seq.sequence = self.sequence + seq.header = self.header + return seq + + + ## Add phase information after the name of sequence in header + # + # @param phase integer representing phase (1, 2, 3, -1, -2, -3) + # + def setFrameInfoOnHeader(self, phase): + if " " in self.header: + name, desc = self.header.split(" ", 1) + name = name + "_" + str(phase) + self.header = name + " " + desc + else: + self.header = self.header + "_" + str(phase) + + + ## Fill Bioseq attributes with fasta file + # + # @param faFileHandler file handler of a fasta file + # + def read(self, faFileHandler): + line = faFileHandler.readline() + if line == "": + self.header = None + self.sequence = None + return + while line == "\n": + line = faFileHandler.readline() + if line[0] == '>': + self.header = string.rstrip(line[1:]) + else: + print "error, line is", string.rstrip(line) + return + line = " " + seq = cStringIO.StringIO() + while line: + prev_pos = faFileHandler.tell() + line = faFileHandler.readline() + if line == "": + break + if line[0] == '>': + faFileHandler.seek(prev_pos) + break + seq.write(string.rstrip(line)) + self.sequence = seq.getvalue() + + + ## Create a subsequence with a modified header + # + # @param s integer start a required subsequence + # @param e integer end a required subsequence + # + # @return a Bioseq instance, a subsequence of current sequence + # + def subseq(self, s, e = 0): + if e == 0 : + e = len(self.sequence) + if s > e : + print "error: start must be < or = to end" + return + if s <= 0 : + print "error: start must be > 0" + return + sub = Bioseq() + sub.header = self.header + " fragment " + str(s) + ".." 
+ str(e) + sub.sequence = self.sequence[(s - 1):e] + return sub + + + ## Get the nucleotide or aminoacid at the given position + # + # @param pos integer nucleotide or aminoacid position + # + # @return a string + # + def getNtFromPosition(self, pos): + result = None + if not (pos < 1 or pos > self.getLength()): + result = self.sequence[pos - 1] + return result + + + ## Print in stdout the Bioseq in fasta format with 60 characters lines + # + # @param l length of required sequence default is whole sequence + # + def view(self, l = 0): + print '>' + self.header + i = 0 + if(l == 0): + l = len(self.sequence) + seq = self.sequence[0:l] + + while i < len(seq): + print seq[i:i + 60] + i = i + 60 + + + ## Get length of sequence + # + # @param avoidN boolean don't count 'N' nucleotides + # + # @return length of current sequence + # + def getLength(self, countN = True): + if countN: + return len(self.sequence) + else: + return len(self.sequence) - self.countNt('N') + + + ## Return the proportion of a specific character + # + # @param nt character that we want to count + # + def propNt(self, nt): + return self.countNt(nt) / float(self.getLength()) + + + ## Count occurrence of specific character + # + # @param nt character that we want to count + # + # @return nb of occurrences + # + def countNt(self, nt): + return self.sequence.count(nt) + + + ## Count occurrence of each nucleotide in current seq + # + # @return a dict, keys are nucleotides, values are nb of occurrences + # + def countAllNt(self): + dNt2Count = {} + for nt in ["A", "T", "G", "C", "N"]: + dNt2Count[ nt ] = self.countNt(nt) + return dNt2Count + + + ## Return a dict with the number of occurrences for each combination of ATGC of specified size and number of word found + # + # @param size integer required length word + # + def occ_word(self, size): + occ = {} + if size == 0: + return occ, 0 + nbword = 0 + srch = re.compile('[^ATGC]+') + wordlist = self._createWordList(size) + for i in wordlist: + occ[i] = 0 + 
lenseq = len(self.sequence) + i = 0 + while i < lenseq - size + 1: + word = self.sequence[i:i + size].upper() + m = srch.search(word) + if m == None: + occ[word] = occ[word] + 1 + nbword = nbword + 1 + i = i + 1 + else: + i = i + m.end(0) + return occ, nbword + + + ## Return a dictionary with the frequency of occurs for each combination of ATGC of specified size + # + # @param size integer required length word + # + def freq_word(self, size): + dOcc, nbWords = self.occ_word(size) + freq = {} + for word in dOcc.keys(): + freq[word] = float(dOcc[word]) / nbWords + return freq + + + ## Find ORF in each phase + # + # @return: a dict, keys are phases, values are stop codon positions. + # + def findORF (self): + orf = {0:[], 1:[], 2:[]} + length = len (self.sequence) + for i in xrange(0, length): + triplet = self.sequence[i:i + 3] + if (triplet == "TAA" or triplet == "TAG" or triplet == "TGA"): + phase = i % 3 + orf[phase].append(i) + return orf + + + ## Convert the sequence into upper case + # + def upCase(self): + self.sequence = self.sequence.upper() + + + ## Convert the sequence into lower case + # + def lowCase(self): + self.sequence = self.sequence.lower() + + + ## Extract the cluster of the fragment (output from Grouper) + # + # @return cluster id (string) + # + def getClusterID(self): + data = self.header.split() + return data[0].split("Cl")[1] + + + ## Extract the group of the sequence (output from Grouper) + # + # @return group id (string) + # + def getGroupID(self): + data = self.header.split() + return data[0].split("Gr")[1].split("Cl")[0] + + + ## Get the header of the full sequence (output from Grouper) + # + # @example 'Dmel_Grouper_3091_Malign_3:LARD' from '>MbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203' + # @return header (string) + # + def getHeaderFullSeq(self): + data = self.header.split() + return data[1] + + + ## Get the strand of the fragment (output from Grouper) + # + # @return: strand (+ or -) + # + def getFragStrand(self): 
+ data = self.header.split() + coord = data[3].split("..") + if int(coord[0]) < int(coord[-1]): + return "+" + else: + return "-" + + + ## Get A, T, G, C or N from an IUPAC letter + # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'] + # + # @return A, T, G, C or N + # + def getATGCNFromIUPAC(self, nt): + subset = ["A", "T", "G", "C", "N"] + + if nt in subset: + return nt + elif nt == "U": + return "T" + elif nt == "R": + return random.choice("AG") + elif nt == "Y": + return random.choice("CT") + elif nt == "M": + return random.choice("CA") + elif nt == "K": + return random.choice("TG") + elif nt == "W": + return random.choice("TA") + elif nt == "S": + return random.choice("CG") + elif nt == "B": + return random.choice("CTG") + elif nt == "D": + return random.choice("ATG") + elif nt == "H": + return random.choice("ATC") + elif nt == "V": + return random.choice("ACG") + else: + return "N" + + ## Get nucleotide from an IUPAC letter and a nucleotide + # Works only for IUPAC code with two possibilities ['R','Y','M','K','W','S'] + # Examples: + # Y and C returns T + # Y and T returns C + # B and C throws RepetException + # + # @return A, T, G, C + # + def getATGCNFromIUPACandATGCN(self, IUPACCode, nt): + if IUPACCode == "R": + possibleNt = set(["A", "G"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "Y": + possibleNt = set(["C", "T"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "M": + possibleNt = set(["A", "C"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "K": + possibleNt = set(["T", "G"]) + if nt not in possibleNt: + raise 
RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "W": + possibleNt = set(["A", "T"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "S": + possibleNt = set(["C", "G"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + else: + raise RepetException("Can't retrieve the third nucleotide from IUPAC code '%s' and nucleotide '%s'" % (IUPACCode, nt)) + + def getSeqWithOnlyATGCN(self): + newSeq = "" + for nt in self.sequence: + newSeq += self.getATGCNFromIUPAC(nt) + return newSeq + + + ## Replace any symbol not in (A,T,G,C,N) by another nucleotide it represents + # + def partialIUPAC(self): + self.sequence = self.getSeqWithOnlyATGCN() + + + ## Remove non Unix end-of-line symbols, if any + # + def checkEOF(self): + symbol = "\r" # corresponds to '^M' from Windows + if symbol in self.sequence: + print "WARNING: Windows EOF removed in '%s'" % (self.header) + sys.stdout.flush() + newSeq = self.sequence.replace(symbol, "") + self.sequence = newSeq + + + ## Write Bioseq instance into a fasta file handler + # + # @param faFileHandler file handler of a fasta file + # + def write(self, faFileHandler): + faFileHandler.write(">%s\n" % (self.header)) + self.writeSeqInFasta(faFileHandler) + + + ## Write only the sequence of Bioseq instance into a fasta file handler + # + # @param faFileHandler file handler of a fasta file + # + def writeSeqInFasta(self, faFileHandler): + i = 0 + while i < self.getLength(): + faFileHandler.write("%s\n" % (self.sequence[i:i + 60])) + i += 60 + + + ## Append Bioseq instance to a fasta file + # + # @param faFile name of a fasta file as a string + # @param mode 'write' or 'append' + # + def save(self, 
faFile, mode = "a"): + faFileHandler = open(faFile, mode) + self.write(faFileHandler) + faFileHandler.close() + + + ## Append Bioseq instance to a fasta file + # + # @param faFile name of a fasta file as a string + # + def appendBioseqInFile(self, faFile): + self.save(faFile, "a") + + + ## Write Bioseq instance into a fasta file handler + # + # @param faFileHandler file handler on a file with writing right + # + def writeABioseqInAFastaFile(self, faFileHandler): + self.write(faFileHandler) + + + ## Write Bioseq instance with other header into a fasta file handler + # + # @param faFileHandler file handler on a file with writing right + # @param otherHeader a string representing a new header (without the > and the \n) + # + def writeWithOtherHeader(self, faFileHandler, otherHeader): + self.header = otherHeader + self.write(faFileHandler) + + + ## Append Bioseq header and Bioseq sequence in a fasta file + # + # @param faFileHandler file handler on a file with writing right + # @param otherHeader a string representing a new header (without the > and the \n) + # + def writeABioseqInAFastaFileWithOtherHeader(self, faFileHandler, otherHeader): + self.writeWithOtherHeader(faFileHandler, otherHeader) + + + ## get the list of Maps corresponding to seq without gap + # + # @warning This method was called getMap() in pyRepet.Bioseq + # @return a list of Map object + # + def getLMapWhithoutGap(self): + lMaps = [] + countSite = 1 + countSubseq = 1 + inGap = False + startMap = -1 + endMap = -1 + + # initialize with the first site + if self.sequence[0] == "-": + inGap = True + else: + startMap = countSite + + # for each remaining site + for site in self.sequence[1:]: + countSite += 1 + + # if it is a gap + if site == "-": + + # if this is the beginning of a gap, record the previous subsequence + if inGap == False: + inGap = True + endMap = countSite - 1 + lMaps.append(Map("%s_subSeq%i" % (self.header, countSubseq), self.header, startMap, endMap)) + countSubseq += 1 + + # if it is 
NOT a gap + if site != "-": + + # if it is the end of a gap, begin the next subsequence + if inGap == True: + inGap = False + startMap = countSite + + # if it is the last site + if countSite == self.getLength(): + endMap = countSite + lMaps.append(Map("%s_subSeq%i" % (self.header, countSubseq), self.header, startMap, endMap)) + + return lMaps + + + ## get the percentage of GC + # + # @return a percentage + # + def getGCpercentage(self): + tmpSeq = self.getSeqWithOnlyATGCN() + nbGC = tmpSeq.count("G") + tmpSeq.count("C") + return 100 * nbGC / float(self.getLength()) + + ## get the percentage of GC of a sequence without counting N in sequence length + # + # @return a percentage + # + def getGCpercentageInSequenceWithoutCountNInLength(self): + tmpSeq = self.getSeqWithOnlyATGCN() + nbGC = tmpSeq.count("G") + tmpSeq.count("C") + return 100 * nbGC / float(self.getLength() - self.countNt("N")) + + ## get the 5 prime subsequence of a given length at the given position + # + # @param position integer + # @param flankLength integer subsequence length + # @return a sequence string + # + def get5PrimeFlank(self, position, flankLength): + if(position == 1): + return "" + else: + startOfFlank = 1 + endOfFlank = position - 1 + + if((position - flankLength) > 0): + startOfFlank = position - flankLength + else: + startOfFlank = 1 + + return self.subseq(startOfFlank, endOfFlank).sequence + + + ## get the 3 prime subsequence of a given length at the given position + # In the case of indels, the polymorphism length can be specified + # + # @param position integer + # @param flankLength integer subsequence length + # @param polymLength integer polymorphism length + # @return a sequence string + # + def get3PrimeFlank(self, position, flankLength, polymLength = 1): + if((position + polymLength) > len(self.sequence)): + return "" + else: + startOfFlank = position + polymLength + + if((position + polymLength + flankLength) > len(self.sequence)): + endOfFlank = len(self.sequence) + else: + 
## Build a list of words by appending every nucleotide to each seed word
#
# One letter among A, T, G, C is appended to every word for each unit of
# 'size' above 1, so single-letter seeds give all words of length 'size'.
# A 'size' of 1 or less returns the seeds unchanged.
#
# The result is always a new list: neither the default seeds nor a list given
# by the caller is returned or modified, so the result can be edited freely.
#
# @param size integer word length to reach when starting from single letters
# @param l list of seed words (default: the four nucleotides A, T, G, C)
# @return a new list of strings
#
def _createWordList(self, size, l = None):
    lNucleotides = ['A', 'T', 'G', 'C']
    if l is None:
        lWords = list(lNucleotides)
    else:
        lWords = list(l)

    # iterative form of the former recursion: one extension round per extra letter
    for _ in range(size - 1):
        lWords = [prefix + nt for prefix in lWords for nt in lNucleotides]

    return lWords
## Equal operator
#
# Two collections are equal when they have the same type, hold the same number
# of sequences and every Bioseq of this collection has an equal Bioseq in the
# other one, whatever the order of the sequences.
#
# @param o object to compare with
# @return a boolean
#
def __eq__( self, o ):
    if type(o) is not type(self):
        return False
    if self.getSize() != o.getSize():
        return False

    for iBioseq in self.db:
        # any() stops at the first match; a missing sequence ends the comparison at once
        if not any( iBioseq == jBioseq for jBioseq in o.db ):
            return False
    return True
## Initialization of each attribute of the collection
#
# Empties the list of sequences and both header indexes (original and renamed
# headers), forgets the name and the mean sequence length, and resets the
# statistics object.
#
def reset( self ):
    self.db = []
    self.idx = {}
    # the renamed-header index must be emptied too, otherwise a lookup by
    # renamed header would still resolve to positions that no longer exist
    self.idx_renamed = {}
    self.name = None
    self.mean_seq_lgth = None
    self.stat.reset()
## Give the Bioseq instance of the BioseqDB specified by its header
#
# The header is searched among the original headers first, then among the
# renamed headers.
#
# @warning name of this method not appropriate getBioseqByHeader is proposed
# @param header string
# @return a Bioseq instance
# @exception Exception when the header is in neither index
#
def fetch( self, header ):
    for dHeader2Index in ( self.idx, self.idx_renamed ):
        position = dHeader2Index.get( header, None )
        if position is not None:
            return self.db[ position ]
    raise Exception("Header: "+header+" not found")
## Extract a sub BioseqDB with the specified number of best length Bioseq
#
# The length threshold is the length of the numBioseq-th longest sequence (the
# shortest one when fewer sequences are stored). Sequences reaching the
# threshold are taken in bank order until 'numBioseq' of them are saved.
# A Bioseq without sequence counts as length 0.
#
# An empty bank, or a 'numBioseq' of 0 or less, gives an empty BioseqDB.
#
# @param numBioseq integer the number of Bioseq searched
# @return a BioseqDB carrying the name of this bank
#
def bestLength(self, numBioseq):
    bestSeqs = BioseqDB()
    bestSeqs.setName(self.name)
    if numBioseq <= 0 or not self.db:
        return bestSeqs

    # one length per sequence, in bank order
    lLengths = []
    for iBioseq in self.db:
        if iBioseq.sequence is None:
            lLengths.append(0)
        else:
            lLengths.append(iBioseq.getLength())

    lSortedLengths = sorted(lLengths)
    size = len(lSortedLengths)
    if numBioseq < size:
        minLength = lSortedLengths[size - numBioseq]
    else:
        minLength = lSortedLengths[0]

    nbSaved = 0
    for iBioseq, length in zip(self.db, lLengths):
        if length >= minLength:
            bestSeqs.add(iBioseq)
            nbSaved += 1
            # ties on the threshold could give more than requested
            if nbSaved == numBioseq:
                break
    return bestSeqs
## Extract a sub BioseqDB from a file with Bioseq header containing the specified pattern
#
# Every sequence of the file whose header matches the pattern is added to this
# instance. Nothing is done, and the file is not opened, when the pattern is
# an empty string.
#
# @param pattern regular expression of wished Bioseq header
# @param inFileName name of fasta file in which we want extract the BioseqDB
#
def extractPatternOfFile(self, pattern, inFileName):
    if pattern == "":
        return
    srch = re.compile(pattern)
    # the 'with' block closes the file even if reading or adding a sequence fails
    with open(inFileName) as inFile:
        while True:
            seq = Bioseq()
            seq.read(inFile)
            # the loop relies on 'sequence' staying None once the file is exhausted
            if seq.sequence is None:
                break
            if srch.search(seq.header):
                self.add(seq)
## Remove from the instance all Bioseq which header contains the specified pattern
#
# All matching sequences are removed in a single call. The list of sequences
# is updated in place and both header indexes are rebuilt so that they point
# to the new positions. Nothing is done when the pattern is an empty string.
#
# @param pattern regular expression of not wished Bioseq header
#
def rmByPattern(self,pattern):
    if pattern == "":
        return
    srch = re.compile(pattern)

    # build the list of survivors first: removing items from a list while
    # iterating over it skips the element following each removal
    lKept = [iBioseq for iBioseq in self.db if not srch.search(iBioseq.header)]
    if len(lKept) == len(self.db):
        return
    self.db[:] = lKept

    # positions have shifted: rebuild the indexes with the renaming rule used
    # when a sequence is added ("::", ":" and "," become "-", " " becomes "_")
    self.idx = {}
    self.idx_renamed = {}
    for position, iBioseq in enumerate(self.db):
        self.idx[ iBioseq.header ] = position
        renamedHeader = iBioseq.header.replace("::","-").replace(":","-").replace(",","-").replace(" ","_")
        self.idx_renamed[ renamedHeader ] = position
08:18:51 2016 -0400 @@ -0,0 +1,296 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Translate a nucleotide sequence, in place
#
# The sequence is upper-cased, read codon by codon starting at 'phase', and the
# 'sequence' attribute is replaced by the resulting peptide. Stop codons give
# "*". An incomplete codon at the end of the sequence is ignored.
#
# A codon whose first two bases are enough to decide the amino acid (CT, GT,
# TC, CC, AC, GC, CG, GG) is translated whatever its third character. Any
# other non standard codon holding an ambiguity code (N, R, Y, M, K, W, S, B,
# D, H, V) gives "X". A codon recognised by none of these rules (e.g. holding
# a gap) adds nothing to the peptide.
#
# @param bioSeqInstanceToTranslate a bioseq instance to translate
# @param phase a integer : 1 (default), 2 or 3
#
def translateSequence(bioSeqInstanceToTranslate, phase=1):
    # codons for which the three bases are needed
    dCodon2AminoAcid = {
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
        "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
        "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
        "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
        "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
        "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
        "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
        "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    }
    # codon families fully determined by their first two bases
    dPrefix2AminoAcid = {
        "CT": "L", "GT": "V", "TC": "S", "CC": "P",
        "AC": "T", "GC": "A", "CG": "R", "GG": "G",
    }

    # end of the last complete codon readable in this phase
    length = ( ( len( bioSeqInstanceToTranslate.sequence ) - ( phase - 1 ) ) // 3 ) * 3
    #We need capital letters !
    bioSeqInstanceToTranslate.upCase()
    sequence = bioSeqInstanceToTranslate.sequence

    lAminoAcids = []
    i = phase - 1
    while i < length:
        codon = sequence[i:i+3]
        aminoAcid = dCodon2AminoAcid.get( codon )
        if aminoAcid is None:
            aminoAcid = dPrefix2AminoAcid.get( codon[:2] )
        #We don't know the amino acid because we don't have the nucleotide
        if aminoAcid is None and re.search("N|R|Y|M|K|W|S|B|D|H|V", codon):
            aminoAcid = "X"
        if aminoAcid is not None:
            lAminoAcids.append( aminoAcid )
        i += 3
    bioSeqInstanceToTranslate.sequence = "".join( lAminoAcids )

translateSequence = staticmethod(translateSequence)
## read in a fasta file and create a list of bioseq instances
#
# Sequences are read one after the other until a Bioseq comes back without
# header, which is taken as the end of the file. The file is closed before
# returning, including when reading fails.
#
# @param fileName string
# @return a list of bioseq
#
def extractBioseqListFromFastaFile( fileName ):
    lBioseq = []
    with open( fileName ) as inFile:
        while True:
            bioseq = Bioseq()
            bioseq.read( inFile )
            if bioseq.header is None:
                break
            lBioseq.append( bioseq )
    return lBioseq

extractBioseqListFromFastaFile = staticmethod( extractBioseqListFromFastaFile )
## Return a dictionary which keys are sequence headers and values sequence lengths.
#
# Sequences are read until a Bioseq comes back without sequence. When the same
# header occurs several times, the length of its last occurrence is kept.
# The file is closed before returning, including when reading fails.
#
# @param inFile string name of the input fasta file
# @return a dictionary
#
def getLengthPerSeqFromFile( inFile ):
    dHeader2Length = {}
    with open( inFile, "r" ) as inFileHandler:
        while True:
            iBs = Bioseq()
            iBs.read( inFileHandler )
            if iBs.sequence is None:
                break
            dHeader2Length[ iBs.header ] = iBs.getLength()
    return dHeader2Length

getLengthPerSeqFromFile = staticmethod( getLengthPerSeqFromFile )
## Split a header of the cluster file into cluster name and sequence header
#
# The header must start with an optional non-digit prefix, the cluster number,
# "Mb", a member number and a whitespace. The cluster name is the prefix
# followed by the cluster number; the sequence header is the second
# space-separated field of the header.
#
# @param header string header read in the cluster file
# @return a tuple (cluster name, sequence header)
# @exception ValueError when the header does not have the expected format
#
def _getClusterNameAndSeqHeader(self, header):
    # raw string: "\D", "\d" and "\s" are regex classes, not string escapes
    m = re.match(r"(\D*)(\d+)Mb\d+\s.*", header)
    if m is None:
        raise ValueError("header '%s' does not match '<prefix><cluster number>Mb<member number> <sequence header>'" % header)
    clusterName = m.group(1) + m.group(2)
    seqHeader = header.split(" ")[1]
    return clusterName, seqHeader
b/TEisotools-1.0/TEisotools-1.0/commons/core/seq/FastaUtils.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,1163 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Return a list with the length of each sequence in the input fasta file
#
# One length is returned per header line, in file order, including 0 for a
# header that is followed by no residue. Residues found before the first
# header are counted as one more sequence. An empty file gives an empty list.
# End-of-line characters are never counted, whether or not the last line of
# the file ends with a newline.
#
# @param inFile string name of the input fasta file
#
# @return list of integers
#
@staticmethod
def dbLengths(inFile):
    lLengths = []
    currentLength = 0
    headerSeen = False
    with open(inFile) as fH:
        for line in fH:
            if line.startswith(">"):
                # close the previous record; before the first header there is
                # one only if some residues were read
                if headerSeen or currentLength != 0:
                    lLengths.append(currentLength)
                headerSeen = True
                currentLength = 0
            else:
                # strip only line terminators: the last line may have none
                currentLength += len(line.rstrip("\r\n"))
    if headerSeen or currentLength != 0:
        lLengths.append(currentLength)

    return lLengths
## Cut a data bank into chunks according to the input parameters
# If a sequence is shorter than the threshold, it is only renamed (not cut)
#
# The external program 'cutterDB' is run on the input file; the process exits
# with status 1 when it fails. The '<inFileName>_cut' file is then rewritten
# with headers 'chunkN' (N zero-padded) and upper-cased sequences, and a map
# file records, for each chunk, the source header and the start and end
# coordinates. Statistics on the N stretches listed in
# '<inFileName>.Nstretch.map' are written in '<inFileName>.NstretchStats.txt'.
#
# Headers of the '_cut' file are expected to hold '>ID' as 1st space-separated
# field, the source header as 3rd field and 'start..end' as 5th field.
#
# @param inFileName string name of the input fasta file
# @param chkLgth string chunk length (in bp, default=200000)
# @param chkOver string chunk overlap (in bp, default=10000)
# @param wordN string N stretch word length (default=11, 0 for no detection)
# @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
# @param clean boolean remove 'cut' and 'Nstretch' files
# @param verbose integer (default = 0)
#
@staticmethod
def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0):
    nbSeq = FastaUtils.dbSize(inFileName)
    if verbose > 0:
        print("cut the %i input sequences with cutterDB..." % nbSeq)
        sys.stdout.flush()

    # NOTE(review): the command is a string run through the shell, so a file
    # name holding spaces or shell metacharacters will not be passed correctly
    prg = "cutterDB"
    cmd = prg
    cmd += " -l %s" % chkLgth
    cmd += " -o %s" % chkOver
    cmd += " -w %s" % wordN
    cmd += " %s" % inFileName
    returnStatus = os.system(cmd)
    if returnStatus != 0:
        msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus)
        sys.stderr.write("%s\n" % msg)
        sys.exit(1)

    cutFileName = "%s_cut" % inFileName
    nbChunks = FastaUtils.dbSize(cutFileName)
    if verbose > 0:
        print("done (%i chunks)" % ( nbChunks ))
        sys.stdout.flush()

    if verbose > 0:
        print("rename the headers...")
        sys.stdout.flush()

    if outFilePrefix == "":
        outFastaName = inFileName + "_chunks.fa"
        outMapName = inFileName + "_chunks.map"
    else:
        outFastaName = outFilePrefix + ".fa"
        outMapName = outFilePrefix + ".map"

    # chunk identifiers are zero-padded to the width of the number of chunks
    idWidth = len(str(nbChunks))

    # nested 'with' blocks close the three files even if a header is malformed
    with open(cutFileName) as inFile:
        with open(outFastaName, "w") as outFasta:
            with open(outMapName, "w") as outMap:
                for line in inFile:
                    if line[0] == ">":
                        if verbose > 1:
                            print("rename '%s'" % (line[:-1]))
                            sys.stdout.flush()
                        data = line[:-1].split(" ")
                        seqID = data[0].split(">")[1]
                        newHeader = "chunk%s" % (str(seqID).zfill(idWidth))
                        oldHeader = data[2]
                        lCoords = data[4].split("..")
                        seqStart = lCoords[0]
                        seqEnd = lCoords[1]
                        outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd))
                        outFasta.write(">%s\n" % newHeader)
                    else:
                        outFasta.write(line.upper())

    #stats on .Nstretch.map file
    with open(inFileName) as genomeFile:
        genomeLength = FastaUtils.dbCumLength(genomeFile)
    NstretchMapFile = inFileName + ".Nstretch.map"
    with open('%s.NstretchStats.txt' % inFileName , "w") as outNstrechStats:
        # existence is tested first so that emptiness is only asked of a file that exists
        if not FileUtils.isRessourceExists(NstretchMapFile) or FileUtils.isEmpty(NstretchMapFile):
            outNstrechStats.write("No N in stretch length > %s\n" % wordN)
        else:
            dHeader2lLengths = {}
            with open(NstretchMapFile) as f:
                for line in f:
                    data = line.rstrip().split()
                    if not data:
                        # blank line
                        continue
                    # 2nd field: sequence header; 3rd and 4th: stretch start and end
                    header = data[1]
                    length = int(data[3]) - int(data[2]) + 1
                    if header not in dHeader2lLengths:
                        dHeader2lLengths[header] = []
                    dHeader2lLengths[header].append(length)

            for header in sorted(dHeader2lLengths):
                lLengths = dHeader2lLengths[header]
                outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths)))

            cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()])

            # an input without any residue would otherwise divide by zero
            if genomeLength > 0:
                NstretchPrct = float(cumulAllStretch)/genomeLength*100
            else:
                NstretchPrct = 0.0
            outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct))

    # kept as an equality test: only True (or 1) triggers the clean-up
    if clean == True:
        os.remove(cutFileName)
        # the N stretch map may not have been produced
        if os.path.exists(NstretchMapFile):
            os.remove(NstretchMapFile)
% (line[1:].rstrip(), outFileName) + sys.stdout.flush() + outFile.write(line) + + if newDir: + os.remove(os.path.basename(inFile)) + os.chdir("..") + + + ## Split the input fasta file in several output files + # + # @param inFileName string name of the input fasta file + # @param maxSize integer max cumulative length for each output file + # + @staticmethod + def splitFastaFileInBatches(inFileName, maxSize = 200000): + iBioseqDB = BioseqDB(inFileName) + lHeadersSizeTuples = [] + for iBioseq in iBioseqDB.db: + lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength())) + + lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize) + os.mkdir("batches") + os.chdir("batches") + + iterator = 0 + for lHeader in lHeadersList: + iterator += 1 + with open("batch_%s.fa" % iterator, 'w') as f: + for header in lHeader : + iBioseq = iBioseqDB.fetch(header) + iBioseq.write(f) + os.chdir("..") + + + ## Split the input fasta file in several output files according to their cluster identifier + # + # @param inFileName string name of the input fasta file + # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust) + # @param simplifyHeader boolean simplify the headers + # @param createDir boolean put the sequences in different directories + # @param outPrefix string prefix of the output files (default='seqCluster') + # @param verbose integer (default = 0) + # + @staticmethod + def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0): + if not os.path.exists(inFileName): + print "ERROR: %s doesn't exist" % inFileName + sys.exit(1) + + inFile = open(inFileName) + + line = inFile.readline() + if line: + name = line.split(" ")[0] + if "Cluster" in name: + clusterID = name.split("Cluster")[1].split("Mb")[0] + seqID = name.split("Mb")[1] + else: + clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' 
in Piler, Recon and Blastclust + if "Q" in name.split("Gr")[0]: + seqID = name.split("Gr")[0].split("MbQ")[1] + elif "S" in name: + seqID = name.split("Gr")[0].split("MbS")[1] + sClusterIDs = set( [ clusterID ] ) + if simplifyHeader == True: + header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) + else: + header = line[1:-1] + if createDir == True: + if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): + os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) + os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) + outFileName = "%s%s.fa" % ( outPrefix, clusterID ) + outFile = open( outFileName, "w" ) + outFile.write( ">%s\n" % ( header ) ) + prevClusterID = clusterID + + line = inFile.readline() + while line: + if line[0] == ">": + name = line.split(" ")[0] + if "Cluster" in name: + clusterID = name.split("Cluster")[1].split("Mb")[0] + seqID = name.split("Mb")[1] + else: + clusterID = name.split("Cl")[0].split("Gr")[1] + if "Q" in name.split("Gr")[0]: + seqID = name.split("Gr")[0].split("MbQ")[1] + elif "S" in name: + seqID = name.split("Gr")[0].split("MbS")[1] + + if clusterID != prevClusterID: + outFile.close() + + if simplifyHeader == True: + header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) + else: + header = line[1:-1] + + if createDir == True: + os.chdir( ".." 
) + if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): + os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) + os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) + + outFileName = "%s%s.fa" % ( outPrefix, clusterID ) + if not os.path.exists( outFileName ): + outFile = open( outFileName, "w" ) + else: + if clusterID != prevClusterID: + outFile.close() + outFile = open( outFileName, "a" ) + outFile.write( ">%s\n" % ( header ) ) + prevClusterID = clusterID + sClusterIDs.add( clusterID ) + + else: + outFile.write( line ) + + line = inFile.readline() + + outFile.close() + if verbose > 0: + print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush() + + if createDir == True: + os.chdir("..") + else: + print "WARNING: empty input file - no cluster found"; sys.stdout.flush() + + + ## Filter a fasta file in two fasta files using the length of each sequence as a criterion + # + # @param len_min integer length sequence criterion to filter + # @param inFileName string name of the input fasta file + # @param verbose integer (default = 0) + # + @staticmethod + def dbLengthFilter(len_min, inFileName, verbose = 0): + file_db = open(inFileName,) + file_dbInf = open(inFileName + ".Inf" + str(len_min), "w") + file_dbSup = open(inFileName + ".Sup" + str(len_min), "w") + seq = Bioseq() + numseq = 0 + nbsaveInf = 0 + nbsaveSup = 0 + seq.read(file_db) + while seq.getHeader(): + l = seq.getLength() + numseq = numseq + 1 + if l >= len_min: + seq.write(file_dbSup) + if verbose > 0: + nbsaveSup = nbsaveSup + 1 + else: + seq.write(file_dbInf) + if verbose > 0: + nbsaveInf = nbsaveInf + 1 + seq.read(file_db) + + file_db.close() + file_dbInf.close() + file_dbSup.close() + if verbose > 0: + print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min)) + + + ## Extract the longest sequences from a fasta file + # + # 
@param num integer maximum number of sequences in the output file + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output fasta file + # @param minThresh integer minimum length threshold (default=0) + # @param verbose integer (default = 0) + # + @staticmethod + def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0): + bsDB = BioseqDB(inFileName) + if verbose > 0: + print "nb of input sequences: %i" % bsDB.getSize() + + if outFileName == "": + outFileName = inFileName + ".best" + str(num) + outFile = open( outFileName, "w" ) + + if bsDB.getSize()==0: + return 0 + + num = int(num) + if verbose > 0: + print "keep the %i longest sequences" % num + if minThresh > 0: + print "with length > %i bp" % minThresh + sys.stdout.flush() + + tmpLSeqLgth = bsDB.getListOfSequencesLength() + tmpLSeqLgth.sort(reverse = True) + + # select the longests + lSeqLgth = [] + for i in xrange( 0, min(num,len(tmpLSeqLgth)) ): + if tmpLSeqLgth[i] >= minThresh: + lSeqLgth.append( tmpLSeqLgth[i] ) + if verbose > 0: + print "selected max length: %i" % max(lSeqLgth) + print "selected min length: %i" % min(lSeqLgth) + sys.stdout.flush() + + # save the longest + inFile = open( inFileName ) + seqNum = 0 + nbSave = 0 + for bs in bsDB.db: + seqNum += 1 + if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh: + bs.write( outFile ) + if verbose > 1: + print "%d seq %s : saved !" 
% ( seqNum, bs.header[0:40] ) + sys.stdout.flush() + nbSave += 1 + if nbSave == num: + break + inFile.close() + outFile.close() + if verbose > 0: + print nbSave, "saved sequences in ", outFileName + sys.stdout.flush() + + return 0 + + + ## Extract all the sequence headers from a fasta file and write them in a new file + # + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers') + # + @staticmethod + def dbExtractSeqHeaders(inFileName, outFileName = ""): + if not outFileName: + outFileName = inFileName + ".headers" + + with open(outFileName, "w") as f: + for header in FastaUtils.dbHeaders(inFileName): + f.write("%s\n" % header) + + return 0 + + + ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file + # + # @param pattern regular expression to search in headers + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0): + if not pattern: + return + + if not outFileName: + outFileName = inFileName + '.extracted' + outFile = open(outFileName, 'w') + + patternTosearch = re.compile(pattern) + bioseq = Bioseq() + bioseqNb = 0 + savedBioseqNb = 0 + inFile = open(inFileName) + bioseq.read(inFile) + while bioseq.sequence: + bioseqNb = bioseqNb + 1 + m = patternTosearch.search(bioseq.header) + if m: + bioseq.write(outFile) + if verbose > 1: + print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!' 
+ savedBioseqNb = savedBioseqNb + 1 + bioseq.read(inFile) + inFile.close() + + outFile.close() + + if verbose > 0: + print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) + + + ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file + # + # @param patternFileName string file containing regular expression to search in headers + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0): + if not patternFileName: + print "ERROR: no file of pattern" + sys.exit(1) + + bioseq = Bioseq() + bioseqNb = 0 + savedBioseqNb = 0 + lHeaders = [] + + with open(inFileName) as inFile: + bioseq.read(inFile) + while bioseq.sequence: + lHeaders.append(bioseq.header) + bioseq.read(inFile) + + lHeadersToKeep = [] + with open(patternFileName) as patternFile: + for pattern in patternFile: + pattern = pattern.rstrip() + if verbose > 0: + print "pattern: ", pattern; sys.stdout.flush() + + patternToSearch = re.compile(pattern) + lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)]) + + if not outFileName: + outFileName = inFileName + ".extracted" + + with open(outFileName, "w") as outFile: + with open(inFileName) as inFile: + bioseq.read(inFile) + while bioseq.sequence: + bioseqNb += 1 + if bioseq.header in lHeadersToKeep: + bioseq.write(outFile) + savedBioseqNb += 1 + if verbose > 1: + print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] 
saved !!'; sys.stdout.flush() + bioseq.read(inFile) + + if verbose > 0: + print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) + + + ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file + # + # @param pattern regular expression to search in headers + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0): + if not pattern: + return + + patternToSearch = re.compile(pattern) + + if outFileName == "": + outFileName = inFileName + '.cleaned' + + with open(outFileName, 'w') as outFile: + bioseq = Bioseq() + bioseqNb = 0 + savedBioseqNb = 0 + with open(inFileName) as inFile: + bioseq.read(inFile) + while bioseq.sequence: + bioseqNb += 1 + if not patternToSearch.search(bioseq.header): + bioseq.write(outFile) + savedBioseqNb += 1 + if verbose > 1: + print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!' 
+ bioseq.read(inFile) + + if verbose > 0: + print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) + + + ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file + # + # @param patternFileName string file containing regular expression to search in headers + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0): + if not patternFileName: + print "ERROR: no file of pattern" + sys.exit(1) + + bioseq = Bioseq() + bioseqNb = 0 + savedBioseqNb = 0 + lHeaders = [] + with open(inFileName) as inFile: + bioseq.read(inFile) + while bioseq.sequence: + lHeaders.append(bioseq.header) + bioseq.read(inFile) + + with open(patternFileName) as patternFile: + lHeadersToRemove = [] + for pattern in patternFile: + pattern = pattern.rstrip() + if verbose > 0: + print "pattern: ", pattern; sys.stdout.flush() + + patternToSearch = re.compile(pattern) + lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)]) + + if not outFileName: + outFileName = inFileName + '.cleaned' + + with open(outFileName, 'w') as outFile: + bioseqNum = 0 + with open(inFileName) as inFile: + bioseq.read(inFile) + while bioseq.sequence: + bioseqNum += 1 + if bioseq.header not in lHeadersToRemove: + bioseq.write(outFile) + savedBioseqNb += 1 + if verbose > 1: + print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] 
saved !!'; sys.stdout.flush() + bioseq.read(inFile) + + if verbose > 0: + print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) + + + ## Find sequence's ORFs from a fasta file and write them in a tab file + # + # @param inFileName string name of the input fasta file + # @param orfMaxNb integer Select orfMaxNb best ORFs + # @param orfMinLength integer Keep ORFs with length > orfMinLength + # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0): + if not outFileName: + outFileName = inFileName + ".ORF.map" + outFile = open(outFileName, "w") + + bioseq = Bioseq() + bioseqNb = 0 + + inFile = open(inFileName) + bioseq.read(inFile) + while bioseq.sequence: + bioseq.upCase() + bioseqNb += 1 + if verbose > 0: + print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]' + + orf = bioseq.findORF() + bestOrf = [] + for i in orf.keys(): + orfLen = len(orf[i]) + for j in xrange(1, orfLen): + start = orf[i][j - 1] + 4 + end = orf[i][j] + 3 + if end - start >= orfMinLength: + bestOrf.append((end - start, i + 1, start, end)) + + bioseq.reverseComplement() + + orf = bioseq.findORF() + seqLen = bioseq.getLength() + for i in orf.keys(): + orfLen = len(orf[i]) + for j in xrange(1, orfLen): + start = seqLen - orf[i][j - 1] - 3 + end = seqLen - orf[i][j] - 2 + if start - end >= orfMinLength: + bestOrf.append((start - end, (i + 1) * -1, start, end)) + + bestOrf.sort(reverse = True) + bestOrfNb = len(bestOrf) + if orfMaxNb != 0 and orfMaxNb < bestOrfNb: + bestOrfNb = orfMaxNb + for i in xrange(0, bestOrfNb): + if verbose > 0: + print bestOrf[i] + outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \ + "|" + str(bestOrf[i][0]), bioseq.header, + bestOrf[i][2], bestOrf[i][3])) + bioseq.read(inFile) + + inFile.close() + 
outFile.close() + + return 0 + + + ## Sort sequences by increasing length (write a new file) + # + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output fasta file + # @param verbose integer verbosity level + # + @staticmethod + def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0): + if verbose > 0: + print "sort sequences by increasing length" + sys.stdout.flush() + if not os.path.exists(inFileName): + print "ERROR: file '%s' doesn't exist" % (inFileName) + sys.exit(1) + + # read each seq one by one + # save them in distinct temporary files + # with their length in the name + inFileHandler = open(inFileName, "r") + countSeq = 0 + bs = Bioseq() + bs.read(inFileHandler) + while bs.header: + countSeq += 1 + tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq) + bs.appendBioseqInFile(tmpFile) + if verbose > 1: + print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile) + bs.header = "" + bs.sequence = "" + bs.read(inFileHandler) + inFileHandler.close() + + # sort temporary file names + # concatenate them into the output file + if os.path.exists(outFileName): + os.remove(outFileName) + lFiles = glob.glob("*bp_*nb") + lFiles.sort(key = lambda s:int(s.split("bp_")[0])) + for fileName in lFiles: + cmd = "cat %s >> %s" % (fileName, outFileName) + returnValue = os.system(cmd) + if returnValue != 0: + print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName) + sys.exit(1) + os.remove(fileName) + + return 0 + + + ## Sort sequences by header + # + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output fasta file + # @param verbose integer verbosity level + # + @staticmethod + def sortSequencesByHeader(inFileName, outFileName = ""): + if outFileName == "": + outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0] + iBioseqDB = BioseqDB(inFileName) + with open(outFileName, "w") as f: + for header in 
sorted(iBioseqDB.getHeaderList()): + iBioseq = iBioseqDB.fetch(header) + iBioseq.write(f) + + + ## Return a dictionary which keys are the headers and values the length of the sequences + # + # @param inFile string name of the input fasta file + # @param verbose integer verbosity level + # + @staticmethod + def getLengthPerHeader(inFile, verbose = 0): + dHeader2Length = {} + + with open(inFile) as inFileHandler: + currentSeqHeader = "" + currentSeqLength = 0 + for line in inFileHandler: + if line[0] == ">": + if currentSeqHeader != "": + dHeader2Length[currentSeqHeader] = currentSeqLength + currentSeqLength = 0 + currentSeqHeader = line[1:-1] + if verbose > 0: + print "current header: %s" % currentSeqHeader + sys.stdout.flush() + else: + currentSeqLength += len(line.replace("\n", "")) + dHeader2Length[currentSeqHeader] = currentSeqLength + + return dHeader2Length + + + ## Convert headers from a fasta file having chunk coordinates + # + # @param inFile string name of the input fasta file + # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes + # @param outFile string name of the output file + # + @staticmethod + def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile): + inFileHandler = open(inFile, "r") + outFileHandler = open(outFile, "w") + dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile) + iConvCoord = ConvCoord() + for line in inFileHandler: + if line[0] == ">": + if "{Fragment}" in line: + chkName = line.split(" ")[1] + chrName = dChunk2Map[chkName].seqname + lCoordPairs = line.split(" ")[3].split(",") + lRangesOnChk = [] + for i in lCoordPairs: + iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1])) + lRangesOnChk.append(iRange) + lRangesOnChr = [] + for iRange in lRangesOnChk: + lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map)) + newHeader = line[1:-1].split(" ")[0] + newHeader += " %s" % chrName + newHeader += " {Fragment}" + newHeader += " %i..%i" % 
(lRangesOnChr[0].start, lRangesOnChr[0].end) + for iRange in lRangesOnChr[1:]: + newHeader += ",%i..%i" % (iRange.start, iRange.end) + outFileHandler.write(">%s\n" % newHeader) + else: + chkName = line.split("_")[1].split(" ")[0] + chrName = dChunk2Map[chkName].seqname + coords = line[line.find("[")+1 : line.find("]")] + start = int(coords.split(",")[0]) + end = int(coords.split(",")[1]) + iRangeOnChk = Range(chkName, start, end) + iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map) + newHeader = line[1:-1].split("_")[0] + newHeader += " %s" % chrName + newHeader += " %s" % line[line.find("(") : line.find(")")+1] + newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd()) + outFileHandler.write(">%s\n" % newHeader) + else: + outFileHandler.write(line) + inFileHandler.close() + outFileHandler.close() + + + ## Convert a fasta file to a length file + # + # @param inFile string name of the input fasta file + # @param outFile string name of the output file + # + @staticmethod + def convertFastaToLength(inFile, outFile = ""): + if not outFile: + outFile = "%s.length" % inFile + + if inFile: + with open(inFile) as inFH: + with open(outFile, "w") as outFH: + bioseq = Bioseq() + bioseq.read(inFH) + while bioseq.sequence: + seqLen = bioseq.getLength() + outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen)) + bioseq.read(inFH) + + + ## Convert a fasta file to a seq file + # + # @param inFile string name of the input fasta file + # @param outFile string name of the output file + # + @staticmethod + def convertFastaToSeq(inFile, outFile = ""): + if not outFile: + outFile = "%s.seq" % inFile + + if inFile: + with open(inFile) as inFH: + with open(outFile, "w") as outFH: + bioseq = Bioseq() + bioseq.read(inFH) + while bioseq.sequence: + seqLen = bioseq.getLength() + outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \ + bioseq.sequence, bioseq.header, seqLen)) + bioseq.read(inFH) + + + ## Splice an input fasta file using 
coordinates in a Map file + # + # @note the coordinates should be merged beforehand! + # + @staticmethod + def spliceFromCoords(genomeFile, coordFile, obsFile): + genomeFileHandler = open(genomeFile) + obsFileHandler = open(obsFile, "w") + dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile) + + bs = Bioseq() + bs.read(genomeFileHandler) + while bs.sequence: + + if dChr2Maps.has_key(bs.header): + lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header]) + splicedSeq = [] + currentSite = 0 + for iMap in lCoords: + minSplice = iMap.getMin() - 1 + if minSplice > currentSite: + splicedSeq += bs.sequence[currentSite : minSplice] + if currentSite <= iMap.getMax(): + currentSite = iMap.getMax() + splicedSeq += bs.sequence[currentSite:] + bs.sequence = "".join(splicedSeq) + bs.write(obsFileHandler) + bs.read(genomeFileHandler) + + genomeFileHandler.close() + obsFileHandler.close() + + + ## Shuffle input sequences (single file or files in a directory) + # + @staticmethod + def dbShuffle(inData, outData, verbose = 0): + if CheckerUtils.isExecutableInUserPath("esl-shuffle"): + prg = "esl-shuffle" + else : prg = "shuffle" + genericCmd = prg + " --seed 1 -d INPUT > OUTPUT" + if os.path.isfile(inData): + if verbose > 0: + print "shuffle input file '%s'" % inData + cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData) + print cmd + returnStatus = os.system(cmd) + if returnStatus: + sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus) + sys.exit(1) + + elif os.path.isdir(inData): + if verbose > 0: + print "shuffle files in input directory '%s'" % inData + if os.path.exists(outData): + shutil.rmtree(outData) + os.mkdir(outData) + lInputFiles = glob.glob("%s/*.fa" % inData) + nbFastaFiles = 0 + for inputFile in lInputFiles: + nbFastaFiles += 1 + if verbose > 1: + print "%3i / %3i" % (nbFastaFiles, len(lInputFiles)) + fastaBaseName = os.path.basename(inputFile) + prefix = os.path.splitext(fastaBaseName)[0] + cmd = 
genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix)) + returnStatus = os.system(cmd) + if returnStatus: + sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus) + sys.exit(1) + + + ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers + # + # @param inClusterFileName string input cluster file name + # @param inFastaFileName string input fasta file name + # @param outFileName string output file name + # @param verbosity integer verbosity + # + @staticmethod + def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0): + dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity) + iFastaParser = FastaParser(inFastaFileName) + with open(outFileName, "w") as f: + for iSequence in iFastaParser.getIterator(): + + header = iSequence.getName() + if dHeader2ClusterClusterMember.get(header): + cluster = dHeader2ClusterClusterMember[header][0] + member = dHeader2ClusterClusterMember[header][1] + else: + clusterIdForSingletonCluster += 1 + cluster = clusterIdForSingletonCluster + member = 1 + + newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header) + iSequence.setName(newHeader) + f.write(iSequence.printFasta()) + + @staticmethod + def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): + dHeader2ClusterClusterMember = {} + clusterId = 0 + with open(inClusterFileName) as f: + for line in f: + lineWithoutLastChar = line.rstrip() + lHeaders = lineWithoutLastChar.split("\t") + clusterId += 1 + if verbosity > 0: + print "%i sequences in cluster %i" % (len(lHeaders), clusterId) + memberId = 0 + for header in lHeaders: + memberId += 1 + dHeader2ClusterClusterMember[header] = (clusterId, memberId) + if verbosity > 0: + print "%i clusters" % clusterId + return dHeader2ClusterClusterMember, 
clusterId + + @staticmethod + def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): + """ + Write a map file from fasta output of clustering tool. + Warning: only works if input fasta headers are formated like LTRharvest fasta output. + """ + if not outMapFileName: + outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0]) + + fileDb = open(fastaFileNameFromClustering) + fileMap = open(outMapFileName, "w") + seq = Bioseq() + numseq = 0 + seq.read(fileDb) + while seq.sequence: + numseq = numseq + 1 + ID = seq.header.split(' ')[0].split('_')[0] + chunk = seq.header.split(' ')[0].split('_')[1] + start = seq.header.split(' ')[-1].split(',')[0][1:] + end = seq.header.split(' ')[-1].split(',')[1][:-1] + line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end) + fileMap.write("%s\n" % line) + seq.read(fileDb) + + fileDb.close() + fileMap.close() + print "saved in %s" % outMapFileName + + @staticmethod + def getNstretchesRangesList(fastaFileName, nbN = 2): + lNstretchesRanges = [] + if nbN != 0: + iBSDB = BioseqDB(fastaFileName) + for iBS in iBSDB.db: + nbNFound = 0 + start = 1 + pos = 1 + lastPos = 0 + + while pos <= iBS.getLength(): + if nbNFound == 0: + start = pos + + while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']: + nbNFound += 1 + pos += 1 + lastPos = pos + + if pos - lastPos >= nbN: + if nbNFound >= nbN: + lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1)) + nbNFound = 0 + pos += 1 + + if nbNFound >= nbN: + lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1)) + + lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False) + + return lNstretchesRanges + + + @staticmethod + def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"): + lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN) + + outFormat = outFormat.lower() + if outFormat in ["gff", 
"gff3"]: + outFormat = "gff3" + else: + outFormat = "map" + + if outFileName == "": + outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat) + + with open(outFileName, "w") as fH: + if outFormat == "gff3": + fH.write("##gff-version 3\n") + for iRange in lNstretchesRanges: + seq = iRange.getSeqname() + start = iRange.getStart() + end = iRange.getEnd() + if outFormat == "gff3": + fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end)) + else: + fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end)) + \ No newline at end of file diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/seq/SequenceModifications.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/seq/SequenceModifications.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,114 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +from operator import itemgetter +from commons.core.coord.Range import Range + +class SequenceModifications(object): + + def __init__(self, originalHeader = "", mutatedHeader = ""): + self._originalHeader = originalHeader + self._mutatedHeader = mutatedHeader + self._lMutations = [] + self._lDeletionsRanges = [] + self._lInsertionsRanges = [] + + def __str__(self): + result = "%s\t%s\n" % (self.getOriginalHeader(), self.getMutatedHeader()) + result += "Insertions\n" + for insertion in self._lInsertionsRanges: + result += "%s\n" % insertion.toString() + result += "Deletions\n" + for insertion in self._lDeletionsRanges: + result += "%s\n" % insertion.toString() + result += "Mutations" + for mutation in self._lMutations: + result += "\n%i\t%s\t%s" % (mutation[0], mutation[1], mutation[2]) + return result + + def __eq__(self, o): + if type(o) is type(self): + self.sort() + o.sort() + return self._originalHeader == o._originalHeader and self._mutatedHeader == o._mutatedHeader and self._lMutations == o._lMutations \ + and self._lDeletionsRanges == o._lDeletionsRanges and self._lInsertionsRanges == o._lInsertionsRanges + return False + + def 
__ne__(self, o): + return not self.__eq__(o) + + def getOriginalHeader(self): + return self._originalHeader + + def getMutatedHeader(self): + return self._mutatedHeader + + def getMutations(self): + self.sort() + return self._lMutations + + def getInsertions(self): + self.sort() + return self._lInsertionsRanges + + def getDeletions(self): + self.sort() + return self._lDeletionsRanges + + def setOriginalHeader(self, originalHeader): + self._originalHeader = originalHeader + + def setMutatedHeader(self, mutatedHeader): + self._mutatedHeader = mutatedHeader + + def setMutations(self, lMutations): + self._lMutations = lMutations + + def addMutation(self, tupleMute): + #tuple: (position, oldNT, newNT) + self._lMutations.append(tupleMute) + + def addInsertion(self, start, end, insertedSeqName = "."): + self._lInsertionsRanges.append(Range(insertedSeqName, start, end)) + + def addDeletion(self, start, end): + self._lDeletionsRanges.append(Range(self.getOriginalHeader(), start, end)) + + def clear(self): + self._lMutations = [] + self._lDeletionsRanges = [] + self._lInsertionsRanges = [] + + def sort(self): + self._lMutations.sort(key = itemgetter(0), reverse = False) + self._lDeletionsRanges.sort(key = lambda delRange: delRange.getStart(), reverse = False) + self._lInsertionsRanges.sort(key = lambda insRange: insRange.getStart(), reverse = False) \ No newline at end of file diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/seq/SequenceModificationsCollection.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/seq/SequenceModificationsCollection.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,312 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ +import os +import time +import shutil +from commons.core.seq.BioseqDB import BioseqDB +from commons.core.seq.SequenceModifications import SequenceModifications +from commons.core.checker.RepetException import RepetException + +class SequenceModificationsCollection(object): + + def __init__(self): + self._lSeqModif = [] + + def __str__(self): + result = "" + for iSeqModif in self._lSeqModif: + result += "%s\n" % iSeqModif.__str__() + return result + + def __eq__(self, o): + if type(o) is type(self): + self.sort() + o.sort() + return self._lSeqModif == o._lSeqModif + return False + + def __ne__(self, o): + return not self.__eq__(o) + + def clear(self): + self._lSeqModif = [] + + def add(self, iSeqModif, override = False): + for seqModif in self._lSeqModif: + if seqModif.getOriginalHeader() == iSeqModif.getOriginalHeader(): + if override: + self._lSeqModif.pop(self._lSeqModif.index(seqModif)) + else: + raise RepetException("ERROR: '%s' already in SequenceModificationsCollection" % iSeqModif.getOriginalHeader()) + + self._lSeqModif.append(iSeqModif) + + def get(self, header, mutated = False): + for iSeqModif in self._lSeqModif: + if mutated: + linkToGoodMethod = iSeqModif.getMutatedHeader + else: + linkToGoodMethod = iSeqModif.getOriginalHeader + + if linkToGoodMethod() == header: + return iSeqModif + return None + + def getHeadersList(self, mutated = False): + lHeaders = [] + if mutated: + for iSeqModif in self._lSeqModif: + lHeaders.append(iSeqModif.getMutatedHeader()) + else: + for iSeqModif in self._lSeqModif: + lHeaders.append(iSeqModif.getOriginalHeader()) + lHeaders.sort(key = lambda header: header.lower()) + return lHeaders + + def sort(self): + self._lSeqModif.sort(key = lambda seqMod: seqMod.getOriginalHeader().lower(), reverse = False) + + def writeMutations(self, fileName, outFormat = ""): + self.sort() + with open(fileName, "w") as fH: + if outFormat.lower() in ["gff", "gff3"]: + fH.write("##gff-version 3\n") + for iSeqModif in self._lSeqModif: + for 
mutation in iSeqModif.getMutations(): + pos = mutation[0] + old = mutation[1] + new = mutation[2] + fH.write("%s\tMutateSequence\tSNP\t%i\t%i\t.\t.\t.\tName=SNP_%i;REF=%s;ALT=%s\n" % (iSeqModif.getOriginalHeader(), pos, pos, pos, old, new)) + else: + fH.write("#Mutations:\n") + fH.write("seqName\tposition\toldNt\tnewNt\n") + for iSeqModif in self._lSeqModif: + for mutation in iSeqModif.getMutations(): + fH.write("%s\t%i\t%s\t%s\n" % (iSeqModif.getOriginalHeader(), mutation[0], mutation[1], mutation[2])) + + def writeInsertions(self, fileName, outFormat = ""): + self.sort() + with open(fileName, "w") as fH: + if outFormat.lower() in ["gff", "gff3"]: + fH.write("##gff-version 3\n") + for iSeqModif in self._lSeqModif: + for iRange in iSeqModif.getInsertions(): + if iRange.getSeqname() != ".": + fH.write("%s\tMutateSequence\tinsertion\t%s\t%s\t.\t.\t.\tName=insertion_%s-%s;insert=%s\n" % (iSeqModif.getOriginalHeader(), iRange.getStart(), iRange.getEnd(), iRange.getStart(), iRange.getEnd(), iRange.getSeqname())) + else: + fH.write("%s\tMutateSequence\tinsertion\t%s\t%s\t.\t.\t.\tName=insertion_%s-%s\n" % (iSeqModif.getOriginalHeader(), iRange.getStart(), iRange.getEnd(), iRange.getStart(), iRange.getEnd())) + else: + fH.write("#Insertions:\n") + fH.write("seqName\tstart\tend\tinsertedSeqName\n") + for iSeqModif in self._lSeqModif: + for iRange in iSeqModif.getInsertions(): + fH.write("%s\t%i\t%i\t%s\n" % (iSeqModif.getOriginalHeader(), iRange.getStart(), iRange.getEnd(), iRange.getSeqname())) + + def writeDeletions(self, fileName, outFormat = ""): + self.sort() + with open(fileName, "w") as fH: + if outFormat.lower() in ["gff", "gff3"]: + fH.write("##gff-version 3\n") + for iSeqModif in self._lSeqModif: + for iRange in iSeqModif.getDeletions(): + fH.write("%s\tMutateSequence\tdeletion\t%s\t%s\t.\t.\t.\tName=deletion_%s-%s\n" % (iSeqModif.getOriginalHeader(), iRange.getStart(), iRange.getEnd(), iRange.getStart(), iRange.getEnd())) + else: + fH.write("#Deletions:\n") + 
fH.write("seqName\tstart\tend\n") + for iSeqModif in self._lSeqModif: + for iRange in iSeqModif.getDeletions(): + fH.write("%s\t%i\t%i\n" % (iSeqModif.getOriginalHeader(), iRange.getStart(), iRange.getEnd())) + + def write(self, mutationsFileName = "", insertionsFileName = "", deletionsFileName = "", outFormat = ""): + self.sort() + self.writeMutations(mutationsFileName, outFormat) + self.writeInsertions(insertionsFileName, outFormat) + self.writeDeletions(deletionsFileName, outFormat) + + def writeVCF(self, VCFFileName, fastaFileName, software = "MutateSequences"): + self.sort() + tmpVCFFileName = "%s.tmp" % VCFFileName + VCFFH = open(tmpVCFFileName, "w") + VCFFH.write("##fileformat=VCFv4.1\n") + VCFFH.write("##fileDate=%s\n" % time.strftime("%Y%m%d")) + VCFFH.write("##reference=%s\n" % os.path.abspath(fastaFileName)) + VCFFH.write("##INFO=\n") + VCFFH.write("##INFO=\n") + VCFFH.write("##INFO=\n") + VCFFH.write("##INFO=\n") + VCFFH.write("##INFO=\n") + VCFFH.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\n") + + iBSDB = BioseqDB(fastaFileName) + + for iSeqModif in self._lSeqModif: + for mutation in iSeqModif.getMutations(): + pos = mutation[0] + old = mutation[1] + new = mutation[2] + VCFFH.write("%s\t%s\t.\t%s\t%s\t.\t.\tAN=2;REF=%s;ALT=%s;SOFTWARE=%s\n" % (iSeqModif.getOriginalHeader(), pos, old, new, old, new, software)) + + for insRange in iSeqModif.getInsertions(): + if insRange.getStart() != 1: + refSeq = iBSDB.fetch(iSeqModif.getOriginalHeader()).getNtFromPosition(insRange.getStart() - 1) + altSeq = "." + + INFO = "SVTYPE=INS;AN=2;SVLEN=%d;SOFTWARE=%s" % (insRange.getEnd() - insRange.getStart() + 1, software) + if insRange.getSeqname() != ".": + INFO += ";INSERTED=%s" % insRange.getSeqname() + VCFLine = "%s\t%d\t.\t%s\t%s\t%s\t%s\t%s\n" % (iSeqModif.getOriginalHeader(), insRange.getStart() - 1, refSeq, altSeq, ".", ".", INFO) + + else: + refSeq = iBSDB.fetch(iSeqModif.getOriginalHeader()).getNtFromPosition(insRange.getStart()) + refSeq = "." 
+ altSeq = "." + + INFO = "SVTYPE=INS;AN=2;SVLEN=%d;SOFTWARE=%s" % (insRange.getEnd() - insRange.getStart() + 1, software) + if insRange.getSeqname() != ".": + INFO += ";INSERTED=%s" % insRange.getSeqname() + VCFLine = "%s\t%d\t.\t%s\t%s\t%s\t%s\t%s\n" % (iSeqModif.getOriginalHeader(), insRange.getStart(), refSeq, altSeq, ".", ".", INFO) + + VCFFH.write(VCFLine) + + for delRange in iSeqModif.getDeletions(): + if delRange.getStart() != 1: + refSeq = iBSDB.fetch(iSeqModif.getOriginalHeader()).subseq(delRange.getStart() - 1, delRange.getEnd()).getSequence() + altSeq = refSeq[0] + + INFO = "SVTYPE=DEL;AN=2;SVLEN=-%d;SOFTWARE=%s" % (len(refSeq)-1, software) + VCFLine = "%s\t%d\t.\t%s\t%s\t%s\t%s\t%s\n" % (iSeqModif.getOriginalHeader(), delRange.getStart() - 1, refSeq, altSeq, ".", ".", INFO) + + else: + refSeq = iBSDB.fetch(iSeqModif.getOriginalHeader()).subseq(delRange.getStart(), delRange.getEnd() + 1).getSequence() + altSeq = refSeq[-1] + altSeq = "." + + INFO = "SVTYPE=DEL;AN=2;SVLEN=-%d;SOFTWARE=%s" % (len(refSeq)-1, software) + VCFLine = "%s\t%d\t.\t%s\t%s\t%s\t%s\t%s\n" % (iSeqModif.getOriginalHeader(), delRange.getStart(), refSeq, altSeq, ".", ".", INFO) + + VCFFH.write(VCFLine) + + #This command line can sort this VCF file properly. But can't manage to launch it properly through os.system or subprocess... 
+# cmd = "(head -n 9 %s && tail -n +10 %s | head -n -1 | sort -f -k1,1 -k2,2n) > %s" % (tmpVCFFileName, tmpVCFFileName, VCFFileName) + shutil.move(tmpVCFFileName, VCFFileName) + + def getCollectionBasedOnMutatedSequence(self): + transformedSeqModifCollec = SequenceModificationsCollection() + + for header in self.getHeadersList(): + currentSeqModif = self.get(header) + + lModifsTuples = [("insertion", iRange) for iRange in currentSeqModif.getInsertions()] + for iRange in currentSeqModif.getDeletions(): + lModifsTuples.append(("deletion", iRange)) + lModifsTuples.sort(key = lambda modifTuple: modifTuple[1].getStart(), reverse = False) + + sumIns = 0 + sumDel = 0 + + iseqModif = SequenceModifications(currentSeqModif.getMutatedHeader(), currentSeqModif.getOriginalHeader()) + for modifTuple in lModifsTuples: + varType = modifTuple[0] + varRange = modifTuple[1] + + if varType == "insertion": + iseqModif.addDeletion(varRange.getStart() + sumIns - sumDel, varRange.getEnd() + sumIns - sumDel) + sumIns += varRange.getLength() + + if varType == "deletion": + iseqModif.addInsertion(varRange.getStart() + sumIns - sumDel, varRange.getEnd() + sumIns - sumDel) + sumDel += varRange.getLength() + + for tSnp in currentSeqModif.getMutations(): + iseqModif.addMutation((tSnp[0], tSnp[2], tSnp[1])) + + iseqModif.sort() + transformedSeqModifCollec.add(iseqModif) + + transformedSeqModifCollec.sort() + + return transformedSeqModifCollec + + def loadSeqModifCollectionFromFiles(self, inInsertionsFileName, inDeletionsFileName, inSNPsFileName, SNPsrate = "0.020000"): + self.clear() + + with open(inInsertionsFileName, "r") as f: + line = f.readline() + while line: + if "seqName" not in line and "#" not in line: + splittedLine = line.split() + seqname = splittedLine[0] + start = int(splittedLine[1]) + end = int(splittedLine[2]) + insertedSeqName = splittedLine[3] + + if self.get(seqname) is None: + self.add(SequenceModifications(seqname)) + self.get(seqname).setMutatedHeader("%s_mutated_%s" % 
(seqname, SNPsrate)) + self.get(seqname).addInsertion(start, end, insertedSeqName) + line = f.readline() + + with open(inDeletionsFileName, "r") as f: + line = f.readline() + while line: + if "seqName" not in line and "#" not in line: + splittedLine = line.split() + seqname = splittedLine[0] + start = int(splittedLine[1]) + end = int(splittedLine[2]) + + if self.get(seqname) is None: + self.add(SequenceModifications(seqname)) + self.get(seqname).setMutatedHeader("%s_mutated_%s" % (seqname, SNPsrate)) + self.get(seqname).addDeletion(start, end) + line = f.readline() + + with open(inSNPsFileName, "r") as f: + line = f.readline() + while line: + if "seqName" not in line and "#" not in line: + splittedLine = line.split() + seqname = splittedLine[0] + position = int(splittedLine[1]) + oldNt = splittedLine[2] + newNt = splittedLine[3] + + if self.get(seqname) is None: + self.add(SequenceModifications(seqname)) + self.get(seqname).setMutatedHeader("%s_mutated_%s" % (seqname, SNPsrate)) + self.get(seqname).addMutation((position, oldNt, newNt)) + line = f.readline() + + for header in self.getHeadersList(): + self.get(header).sort() + self.sort() \ No newline at end of file diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/seq/__init__.py diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/utils/Classif.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/utils/Classif.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,385 @@ +import re +import os +from collections import OrderedDict + +DWICKERCODE = { + "ClassI":"RXX", + "ClassII":"DXX", + "LTR":"RLX", + "DIRS":"RYX", + "PLE":"RPX", + "LINE":"RIX", + "SINE":"RSX", + "TIR":"DTX", + "Crypton":"DYX", + "Helitron":"DHX", + "Maverick":"DMX", + + "TIR-MITE":"DTX", + "LTR-LARD":"RLX", + "LTR-TRIM":"RLX" + } + +class Classif(object): + """ The class Classif is a object what determine a line in classif file. 
+ """ + + def __init__(self, consensusName = "", code = "NA", outConfuseness = "", outCompleteness = "", projectName = "", isShorten = False, consensusLength = "NA", consensusStrand = "NA", consensusClass = "NA", consensusOrder = "NA", consensusSuperFam = "NA", consensusCI = "NA"): + self._consensusName = consensusName + self._confusness = outConfuseness + self._completeness = outCompleteness + self._projectName = projectName + self._isShorten = isShorten + self._consensusLength = consensusLength + self._consensusStrand = consensusStrand + self._consensusClass = consensusClass + self._consensusOrder = consensusOrder + self._consensusSuperFam = consensusSuperFam + self._consensusCI = consensusCI + self._consensusCoding = "" + self._consensusStruct = "" + self._consensusOther = "" + self._isNoChim = "" + self._hasCodingPart = False + self._hasStructPart = False + self._hasOtherPart = False + self._code = code + self._evidence = {} + + def __eq__(self, o): + if type(o) is type(self): + return self._consensusName == o._consensusName and self._code == o._code \ + and self._confusness == o._confusness and self._completeness == o._completeness + return False + + def __ne__(self, o): + return not self.__eq__(o) + + def getConsensusName(self): + return self._consensusName + + def getCode(self): + return self._code + + def getconfusness(self): + return self._confusness + + def getcompleteness(self): + return self._completeness + + def getprojectName(self): + return self._projectName + + def getConsensusLength(self): + return self._consensusLength + + def getConsensusStrand(self): + return self._consensusStrand + + def getConsensusClass(self): + return self._consensusClass + + def getConsensusOrder(self): + return self._consensusOrder + + def getConsensusSuperFamily(self): + return self._consensusSuperFam + + def getConsensusCI(self): + return str(self._consensusCI) + + def getInfoEvidence(self): + return self._evidence + + def getConsensusCoding(self): + if self._confusness 
== 'ok': + coding = self.writeCodingFeaturesLine(self._evidence) + else: + lOrder = self.getConsensusOrder().split("|") + coding = self.writeCodingFeaturesLine(self._evidence[lOrder[0]]) + for order in lOrder[1:]: + if self._evidence[order].keys() != ['other']: + coding = coding + "|" + self.writeCodingFeaturesLine(self._evidence[order]) + return "coding=" + coding + + def getConsensusStructure(self): + if self._confusness == 'ok': + Structure = self.writeStructFeaturesLine(self._evidence) + else: + lOrder = self.getConsensusOrder().split("|") + Structure = self.writeStructFeaturesLine(self._evidence[lOrder[0]]) + for order in lOrder[1:]: + if self._evidence[order].keys() != ['other']: + Structure = Structure + "|" + self.writeStructFeaturesLine(self._evidence[order]) + return "struct=" + Structure + + def getConsensusOther(self): + if self._confusness == 'ok': + Other = self.writeOtherFeaturesLine(self._evidence) + else: + lOrder = self.getConsensusOrder().split("|") + Other = self.writeOtherFeaturesLine(self._evidence[lOrder[0]]) + for order in lOrder[1:]: + Other = Other + "|" + self.writeOtherFeaturesLine(self._evidence[order]) + return "other=" + Other + + def setConsensusName(self, consensusName): + self._consensusName = consensusName + + def setInfoEvidence(self, evidence): + self._evidence = evidence + + def setCode(self): + self._code = self._decisionRuleForWickerCode(self.getConsensusClass(), self.getConsensusOrder()) + + def setConfusness(self, Confusness): + self._confusness = Confusness + + def setCompleteness(self, completeness): + self._completeness = completeness + + def setProjectName(self, projectName): + self._projectName = projectName + + def setConsensusLength(self, cLength): + self._consensusLength = cLength + + def setConsensusStrand(self, cStrand): + self._consensusStrand = cStrand + + def setConsensusClass(self, cClass): + self._consensusClass = cClass + + def setConsensusOrder(self, cOrder): + self._consensusOrder = cOrder + + def 
setConsensusSuperFamily(self, cSuperFamily): + self._consensusSuperFamily = cSuperFamily + + def setConsensusCI(self, CI): + self._consensusCI = CI + + def setConsensusCoding(self, coding): + self._consensusCoding = coding + + def setConsensusStructure(self, structure): + self._consensusStruct = structure + + def setConsensusOther(self, other): + self._consensusOther = other + + def setCodStrOthFromMessage(self, dico): + self._consensusCoding = "coding="+self.writeCodingFeaturesLine(dico) + self._consensusStruct = "struct="+self.writeStructFeaturesLine(dico) + self._consensusOther = "other="+self.writeOtherFeaturesLine(dico) + + def setCodStrOthFromMessage2(self, dico, cOrder): + if 'rDNA' in cOrder: + cOrder = cOrder.replace('rDNA', 'RDNA') + lOrder = cOrder.split("|") + lDicoKeys = dico.keys() + if lOrder[0] not in lDicoKeys: + self._consensusCoding = "coding="+self.writeCodingFeaturesLine(dico) + self._consensusStruct = "struct="+self.writeStructFeaturesLine(dico) + self._consensusOther = "other="+self.writeOtherFeaturesLine(dico) + else: + self._consensusCoding = "coding="+self.writeCodingFeaturesLine(dico[lDicoKeys[0]]) + self._consensusStruct = "struct="+self.writeStructFeaturesLine(dico[lDicoKeys[0]]) + self._consensusOther = "other="+self.writeOtherFeaturesLine(dico[lDicoKeys[0]]) + if len(lDicoKeys) != 1: + for order in lDicoKeys[1:]: + if dico[order].keys() == ['other']: + self._consensusOther = self._consensusOther+"|"+self.writeOtherFeaturesLine(dico[order]) + else: + self._consensusCoding = self._consensusCoding+"|"+self.writeCodingFeaturesLine(dico[order]) + self._consensusStruct = self._consensusStruct+"|"+self.writeStructFeaturesLine(dico[order]) + self._consensusOther = self._consensusOther+"|"+self.writeOtherFeaturesLine(dico[order]) + + def createNewConsensusName(self): + pastecClassif = "%s" % self._code + if self._completeness != "": + pastecClassif += "-%s" % self._completeness + if self._confusness != "": + pastecClassif += "-%s" % 
self._confusness + if self._isShorten: + pattern = "%s_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9_]+" % self._projectName + if re.match(pattern, self._consensusName) and not "%s_RS_" % self._projectName in self._consensusName: + header = self.shortenConsensusName() + header = "%s_%s" % (pastecClassif, header) + else: + header = "%s_%s" % (pastecClassif, self._consensusName) + else: + header = "%s_%s" % (pastecClassif, self._consensusName) + + return header + + def shortenConsensusName(self): + desc = self._consensusName.split(self._projectName)[1] + palignMeth = desc.split("_")[1] + clustMeth = desc.split("_")[2] + clustID = desc.split("_")[3] + lmalignMeth = desc.split("_")[4:] + if len(lmalignMeth) > 2: + malignMeth = "%s%s_%s" % (lmalignMeth[0], lmalignMeth[1], lmalignMeth[2]) + else: + malignMeth = "".join(lmalignMeth) + consensusShorten = "%s-%s-%s%s-%s" % (self._projectName, palignMeth[0], clustMeth[0], clustID, malignMeth) + + return consensusShorten + + def renameHeaderInConsensusFastaFile(self, fileName = ""): + newFileName = fileName.split(".")[0]+"New.fa" + + oldFile = open(fileName, "r") + newFile = open(newFileName, "w") + + inputLine = oldFile.readline() + while inputLine != "" : + if ">" in inputLine: + self.setConsensusName(inputLine) + outputLine = ">%s" % self.shortenConsensusName() + newFile.write(outputLine) + else: + newFile.write(inputLine) + + inputLine = oldFile.readline() + + oldFile.close() + newFile.close() + + os.system("mv %s.fa %sOld.fa" % (fileName.split(".")[0], fileName.split(".")[0])) + os.system("mv %sNew.fa %s.fa" % (fileName.split(".")[0], fileName.split(".")[0])) + os.system("rm -f %sOld.fa" % fileName.split(".")[0]) + + def writeOtherFeaturesLine(self, dEvidence): + other = "(NA)" + if dEvidence.has_key('other'): + lResults = [] + dOtherResults = dEvidence['other'] + lResultsWithCoding = self.formatCodingFeatures(dOtherResults, lResults) + lResultsFilled = self.formatStructFeatures(dOtherResults, lResultsWithCoding) + if 
len(lResultsFilled) != 0: + subOther = "; ".join(lResultsFilled) + other = '(%s)' % subOther + self._hasOtherPart = True + return other + + def writeCodingFeaturesLine(self, dEvidence): + lResults = [] + lResultsFilled = self.formatCodingFeatures(dEvidence, lResults) + if len(lResultsFilled) != 0: + subCoding = "; ".join(lResultsFilled) + coding = '(%s)' % subCoding + else: + coding = "(NA)" + return coding + + def writeStructFeaturesLine(self, dEvidence): + lResults = [] + lResultsFilled = self.formatStructFeatures(dEvidence, lResults) + if len(lResultsFilled) != 0: + subStruct = "; ".join(lResultsFilled) + struct = '(%s)' % subStruct + else: + struct = "(NA)" + return struct + + def formatCodingFeatures(self, dEvidence, lResults): + if dEvidence.has_key('Repbase_tbx') and dEvidence['Repbase_tbx'] != []: + lResults.append("TE_BLRtx: %s" % ", ".join(map(str, dEvidence['Repbase_tbx']))) + + if dEvidence.has_key('Repbase_bx') and dEvidence['Repbase_bx'] != []: + lResults.append("TE_BLRx: %s" % ", ".join(map(str, dEvidence['Repbase_bx']))) + + if (dEvidence.has_key('te_hmmer')) and (dEvidence['te_hmmer'] != None): + lResults.append('profiles: %s' % self.formatProfilesResults(dEvidence['te_hmmer'])) + + if dEvidence.has_key('Other_profiles'): + lResults.append('Other_profiles: %s' % self.formatProfilesResults(dEvidence['Other_profiles'])) + + if dEvidence.has_key("rDNA") and (dEvidence["rDNA"] != None): + lResults.append("rDNA_BLRn: %s" % dEvidence["rDNA"]) + + if dEvidence.has_key("HG") and (dEvidence["HG"] != None): + lResults.append("HG_BLRn: %s" % dEvidence["HG"]) + + if len(lResults) != 0: + self._hasCodingPart = True + return lResults + + def formatProfilesResults(self, dProfilesResults): + if len(dProfilesResults.keys()) == 0: + return "" + lResults = [] + for key in dProfilesResults.keys(): + iPDM = dProfilesResults[key] + cov = "%.2f%%" % iPDM.getCoverageOnSubject() + profilesResult = '%s: %s' % (key, cov) + lResults.append(profilesResult) + return ", 
".join(lResults) + + def formatStructFeatures(self, dEvidence, lResults): + if dEvidence.has_key('length') and (dEvidence['length']!= None): + lResults.append('TElength: %s' % dEvidence['length']) + + if dEvidence.has_key('TR') and (dEvidence['TR'] != None): + lResults.append('TermRepeats: %s' % ", ".join(map(str, dEvidence['TR']))) + + if dEvidence.has_key('ORF') and (dEvidence['ORF'] != None): + lResults.append('ORF: %s' % ", ".join(dEvidence['ORF'])) + + if dEvidence.has_key('SSR') and (dEvidence['SSR'] != None): + lResults.append('SSR: %s' % ", ".join(dEvidence['SSR'])) + + if dEvidence.has_key('SSRCoverage') and (dEvidence['SSRCoverage'] != None) : + lResults.append('SSRCoverage=%s' % dEvidence['SSRCoverage']) + + if dEvidence.has_key('polyAtail'): + lResults.append('polyAtail') + + if dEvidence.has_key('helitronExtremities') and (dEvidence['helitronExtremities'] != None): + lResults.append('helitronExtremities: %s' % ", ".join(map(str, dEvidence['helitronExtremities']))) + if len(lResults) != 0: + self._hasStructPart = True + return lResults + + def _decisionRuleForWickerCode(self, teClass, order): + code = 'NA' + if order in DWICKERCODE.keys(): + code = DWICKERCODE[order] + elif teClass in DWICKERCODE.keys(): + code = DWICKERCODE[teClass] + elif order == "Unclassified" and teClass == "Unclassified": + code = "NA" + elif re.search("\|", order) and teClass == "Unclassified": + code = "XXX" + elif re.search("\|", order) and re.search("\|",teClass): + lClass = teClass.split("|") + for iC in lClass[1:]: + if lClass[0] != iC: + code = "XXX" + return code + code = DWICKERCODE[lClass[0]] + return code + + def renameLARDTRIMAndMITE(self): + order = self.getConsensusOrder() + order = order.replace("MITE", "TIR-MITE") + order = order.replace("LARD", "LTR-LARD") + order = order.replace("TRIM", "LTR-TRIM") + self.setConsensusOrder(order) + dEvidence = self.getInfoEvidence() + if 'LARD' in dEvidence.keys(): + dEvidence["LTR-LARD"] = dEvidence["LARD"] + del 
dEvidence["LARD"] + if 'TRIM' in dEvidence.keys(): + dEvidence["LTR-TRIM"] = dEvidence["TRIM"] + del dEvidence["TRIM"] + if 'MITE' in dEvidence.keys(): + dEvidence["TIR-MITE"] = dEvidence["MITE"] + del dEvidence["MITE"] + self.setInfoEvidence(dEvidence) + + + + \ No newline at end of file diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/utils/ClassifUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/utils/ClassifUtils.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,311 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
+# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import os +import json +from collections import OrderedDict +from commons.tools.RenameHeaderClassif import RenameHeaderClassif + +class ClassifUtils(object): + + @staticmethod + def _formatProfilesResultsAsDict(lProfilesResults): + if len(lProfilesResults) == 0: + return OrderedDict() + + dResults = OrderedDict() + + for refNameAndCoverage in lProfilesResults: + refName, coverage = refNameAndCoverage.split(": ") + + coverage = coverage.split("%(") + coverageOnSubject = float(coverage.pop(1).replace("%)", "")) + coverage = float(coverage.pop(0)) + + profilesResult = OrderedDict() + profilesResult["cov"] = coverage + profilesResult["covOnSubject"] = coverageOnSubject + dResults[refName] = profilesResult + return dResults + + @staticmethod + def _formatCodingFeaturesAsDict(lineOfEvidence, dCoding): + codingEvidences = lineOfEvidence.split("; ") + + for codingTypeData in codingEvidences: + codingTypeData = codingTypeData.split(": ") + codingType = codingTypeData.pop(0) + + codingTypeData = ": ".join(codingTypeData) + codingTypeData = codingTypeData.split(", ") + + if codingType == "TE_BLRtx": + if not dCoding.has_key("TE_BLRtx"): + dCoding["TE_BLRtx"] = OrderedDict() + for refNameAndCoverage in codingTypeData: + blrtxResult = OrderedDict() + refName, coverage = refNameAndCoverage.rsplit(": ", 1) + coverage = float(coverage.replace("%", "")) + blrtxResult["cov"] = coverage + dCoding["TE_BLRtx"][refName] = blrtxResult + + if codingType == "TE_BLRx": + if not dCoding.has_key("TE_BLRx"): + dCoding["TE_BLRx"] = OrderedDict() + for refNameAndCoverage in codingTypeData: + blrxResult = OrderedDict() + refName, coverage = refNameAndCoverage.rsplit(": ", 1) + coverage = float(coverage.replace("%", "")) + blrxResult["cov"] = coverage + dCoding["TE_BLRx"][refName] = blrxResult + + if codingType == "profiles": + dCoding["profiles"] = 
ClassifUtils._formatProfilesResultsAsDict(codingTypeData) + + if codingType == "Other_profiles": + dCoding["Other_profiles"] = ClassifUtils._formatProfilesResultsAsDict(codingTypeData) + + if codingType == "rDNA_BLRn": + dCoding["rDNA_BLRn"] = OrderedDict() + codingTypeData = ", ".join(codingTypeData) + try: + refName, coverage = codingTypeData.rsplit(": ", 1) + coverage = float(coverage.replace("%", "")) + except ValueError: + refName = codingTypeData + coverage = -1.0 + + dCoding["rDNA_BLRn"]["name"] = refName + dCoding["rDNA_BLRn"]["cov"] = coverage + + if codingType == "HG_BLRn": + dCoding["HG_BLRn"] = OrderedDict() + refName, coverage = codingTypeData[0].rsplit(": ", 1) + coverage = float(coverage.replace("%", "")) + + dCoding["HG_BLRn"]["name"] = refName + dCoding["HG_BLRn"]["cov"] = coverage + + @staticmethod + def _formatStructFeaturesAsDict(lineOfEvidence, dStruct): + structEvidences = lineOfEvidence.split("; ") + for structTypeData in structEvidences: + + structTypeData = structTypeData.split(": ") + structType = structTypeData.pop(0) + + structTypeData = ": ".join(structTypeData) + structTypeData = structTypeData.split(", ") + + if structType == "TElength": + dStruct["TElength"] = structTypeData.pop() + + if structType == "TermRepeats": + dStruct["TermRepeats"] = OrderedDict() + for refNameAndLength in structTypeData: + refName, length = refNameAndLength.rsplit(": ", 1) + dStruct["TermRepeats"][refName] = int(length) + + if structType == "ORF": + if not dStruct.has_key("ORF"): + dStruct["ORF"] = structTypeData + + if structType in ["SSR", "SSRtrf"]: + if not dStruct.has_key(structType): + dStruct[structType] = structTypeData + + if "SSRCoverage" in structType : + dummy, cov = structType.split("=") + dStruct["SSRCoverage"] = float(cov) + + if structType == "polyAtail": + dStruct["polyAtail"] = True + + if structType == "helitronExtremities": + structTypeData = ", ".join(structTypeData) + structTypeData = structTypeData.split("), ") + 
dStruct["helitronExtremities"] = OrderedDict() + for helitronData in structTypeData: + helName, helData = helitronData.split(": (") + helData = helData.replace(")", "") + eValue, start, end = helData.split(", ") + + helitronExtResult = OrderedDict() + helitronExtResult["start"] = int(start) + helitronExtResult["end"] = int(end) + helitronExtResult["eValue"] = float(eValue) + dStruct["helitronExtremities"][helName] = helitronExtResult + + @staticmethod + def _formatOtherFeaturesAsDict(lineOfEvidence, dOther): + if lineOfEvidence != "": + ClassifUtils._formatCodingFeaturesAsDict(lineOfEvidence, dOther) + ClassifUtils._formatStructFeaturesAsDict(lineOfEvidence, dOther) + + @staticmethod + def getClassifLineAsDict(line): + dClassif = OrderedDict() + iRenameHeaderClassif = RenameHeaderClassif() + lClassifItem = line.split("\t") + if len(lClassifItem) != 8: + msg = "Can't parse line: \"%s\"\n" % line.strip() + print("WARNING - ClassifUtils - %s" % msg) + return dClassif + + teClass = lClassifItem[4] + teOrder = lClassifItem[5] + # TODO: recompute wicker code like this or force the user to provide a classif file as input with the wicker code already added + wCode = iRenameHeaderClassif._decisionRuleForWickerCode(teClass, teOrder) + + dClassif["name"] = lClassifItem[0] + dClassif["wCode"] = wCode + dClassif["length"] = int(lClassifItem[1]) + dClassif["strand"] = lClassifItem[2] + dClassif["chimeric"] = False if lClassifItem[3] == "ok" else True + + dClassif["class"] = teClass + dClassif["order"] = teOrder + + if(lClassifItem[6] == "complete"): + dClassif["complete"] = True + elif(lClassifItem[6] == "incomplete"): + dClassif["complete"] = False + else: + dClassif["complete"] = None + + allFields = lClassifItem[7].split("; ") + + CI = allFields.pop(0) + CI = CI.split("=")[-1] + if CI != "NA": + try: + CI = int(CI) + except ValueError as e: + print "Couldn't convert %s to int : %s" % (CI, e) + dClassif["CI"] = CI + + dClassif["coding"] = OrderedDict() + dClassif["struct"] = 
OrderedDict() + dClassif["other"] = OrderedDict() + + allFields = "; ".join(allFields) + codingField = "" + structField = "" + otherField = "" + + codingStart = allFields.find("coding=(") + if codingStart != -1: + pCount = 1 + trueStart = codingStart + len("coding=(") + end = trueStart + for char in allFields[trueStart:]: + if char == "(": + pCount += 1 + if char == ")": + pCount -= 1 + if pCount == 0: + break; + end += 1 + if pCount == 0: + codingField = allFields[trueStart:end] + + structStart = allFields.find("struct=(") + if structStart != -1: + pCount = 1 + trueStart = structStart + len("struct=(") + end = trueStart + for char in allFields[trueStart:]: + if char == "(": + pCount += 1 + if char == ")": + pCount -= 1 + if pCount == 0: + break; + end += 1 + structField = allFields[trueStart:end] + + otherStart = allFields.find("other=(") + if otherStart != -1: + pCount = 1 + trueStart = otherStart + len("other=(") + end = trueStart + for char in allFields[trueStart:]: + if char == "(": + pCount += 1 + if char == ")": + pCount -= 1 + if pCount == 0: + break; + end += 1 + otherField = allFields[trueStart:end] + + if codingField != "": + ClassifUtils._formatCodingFeaturesAsDict(codingField, dClassif["coding"]) + if structField != "": + ClassifUtils._formatStructFeaturesAsDict(structField, dClassif["struct"]) + if otherField != "": + ClassifUtils._formatOtherFeaturesAsDict(otherField, dClassif["other"]) + + return dClassif + + ## Retrieve the classification informations of a classif file + # + # @param fileName Name of the classif file + # @return A dict containing the classification infos + # + @staticmethod + def getClassifInfosAsDict(fileName): + dConsensusInfo = OrderedDict() + + ext = os.path.splitext(fileName)[1] + if ext != ".classif": + msg = "Input file must be a classif file from TEdenovo\n" + print("ERROR - ClassifUtils - %s" % msg) + exit(1) + + with open(fileName, "r") as classifFile: + for line in classifFile: + seqName = line.split("\t")[0] + 
dConsensusInfo[seqName] = ClassifUtils.getClassifLineAsDict(line) + + return dConsensusInfo + + ## Convert a classif file to JSON format + # + # @param fileName Name of the classif file + # @param outFileName Name of the output JSON file (optional) + # + @staticmethod + def convertClassifToJson(fileName, outFileName = ""): + dConsensusInfo = ClassifUtils.getClassifInfosAsDict(fileName) + if outFileName == "": + outFileName = "%s_classif.json" % (os.path.basename(fileName).rsplit(".", 1)[0]) + with open(outFileName, 'w') as outFile: + json.dump(dConsensusInfo, outFile) diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/utils/FileUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/utils/FileUtils.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,479 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import os +import re +import sys +import math +import glob +import shutil +import subprocess +from operator import itemgetter +try: + import hashlib +except: + pass + + +class FileUtils( object ): + + ## Return the number of lines in the given file + # + @staticmethod + def getNbLinesInSingleFile( fileName ): + cmd = "wc -l %s" % fileName + r = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] + nbLines = int(r.split()[0]) + + toAdd = 0 + if nbLines: + cmd = "tail -1 %s" % fileName + r = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] + + if r == '\n': + toAdd -= 1 + elif '\n' not in r: + toAdd += 1 + + return nbLines + toAdd + + ## Return the number of lines in the files in the given list + # + @staticmethod + def getNbLinesInFileList( lFileNames ): + count = 0 + for fileName in lFileNames: + count += FileUtils.getNbLinesInSingleFile( fileName ) + return count + + ## Return True if the given file exists, False otherwise + # + @staticmethod + def isRessourceExists( fileName ): + return os.path.exists( fileName ) + + ## Return True if the given file is empty, False otherwise + # + @staticmethod + def isEmpty( fileName ): + return 0 == FileUtils.getNbLinesInSingleFile( fileName ) + + ## Return True if both files are identical, False otherwise + # + @staticmethod + def are2FilesIdentical( file1, file2 ): + tmpFile = "diff_%s_%s" % ( os.path.basename(file1), os.path.basename(file2) ) + cmd = "diff %s %s >> %s" % ( file1, file2, tmpFile ) + returnStatus 
= os.system( cmd ) + if returnStatus != 0: + print "WARNING: 'diff' returned '%i'" % returnStatus + os.remove( tmpFile ) + return False + if FileUtils.isEmpty( tmpFile ): + os.remove( tmpFile ) + return True + else: + os.remove( tmpFile ) + return False + + ## Return a string with all the content of the files in the given list + # + @staticmethod + def getFileContent( lFiles ): + content = "" + lFiles.sort() + for fileName in lFiles: + currentFile = open( fileName, "r" ) + content += currentFile.read() + currentFile.close() + return content + + ## Save content of the given file after having sorted it + # + @staticmethod + def sortFileContent( inFile, outFile="" ): + inFileHandler = open(inFile, "r" ) + lines = inFileHandler.readlines() + inFileHandler.close() + lines.sort() + if outFile == "": + outFile = inFile + outFileHandler = open( outFile, "w" ) + outFileHandler.writelines( lines ) + outFileHandler.close() + + ## Add end-of-line symbol to the given file content if necessary + # + @staticmethod + def addNewLineAtTheEndOfFileContent( fileContent ): + if not fileContent.endswith('\n') and len(fileContent) != 0: + fileContent += '\n' + return fileContent + + ## Concatenate files in the given list + # + @staticmethod + def catFilesFromList( lFiles, outFile, sort=True, skipHeaders = False, separator = "" ): + if sort: + lFiles.sort() + outFileHandler = open( outFile, "a" ) + isFirstFile = True + for singleFile in lFiles: + if not isFirstFile: + outFileHandler.write(separator) + isFirstFile = False + singleFileHandler = open( singleFile, "r" ) + if skipHeaders: + singleFileHandler.readline() + line = singleFileHandler.readline() + while line: + outFileHandler.write(line) + line = singleFileHandler.readline() + singleFileHandler.close() + outFileHandler.close() + + ## Concatenate files according to the given pattern + # + @staticmethod + def catFilesByPattern( pattern, outFile, skipHeaders = False, separator = "" ): + lFiles = glob.glob( pattern ) + 
FileUtils.catFilesFromList( lFiles, outFile, skipHeaders = skipHeaders, separator = separator ) + + ## Cat all files of a given directory + # + # @param dir string directory name + # @param outFileName string output file name + # + @staticmethod + def catFilesOfDir(directory, outFileName): + FileUtils.catFilesByPattern("%s/*" % directory, outFileName) + + ## Remove files listed according to the given pattern + # + # @example prefix="/home/tmp/dummy*.txt" + # + @staticmethod + def removeFilesByPattern( prefix ): + lFiles = glob.glob( prefix ) + for f in lFiles: + os.remove( f ) + + ## Remove files listed according to the suffixes in the given list + # + @staticmethod + def removeFilesBySuffixList( targetPath, lSuffixes ): + if targetPath[-1] == "/": + targetPath = targetPath[:-1] + for suffix in lSuffixes: + pattern = "%s/*%s" % ( targetPath, suffix ) + FileUtils.removeFilesByPattern( pattern ) + + ## Remove repeated blanks in the given file + # + @staticmethod + def removeRepeatedBlanks( inFile, outFile="" ): + if outFile == "": + outFile = inFile + tmpFile = "tr_%s_%s" % ( inFile, outFile ) + cmd = "tr -s ' ' < %s > %s" % ( inFile, tmpFile ) + os.system( cmd ) + os.rename( tmpFile, outFile ) + + ## Remove files in the given list + # + @staticmethod + def removeFilesFromList(lFiles): + for f in lFiles: + os.remove(f) + + ## Remove files in the given list if exist + # + @staticmethod + def removeFilesFromListIfExist(lFiles): + for fileName in lFiles: + if FileUtils.isRessourceExists(fileName): + os.remove(fileName) + + ## Append the content of a file to another file + # + # @param inFile string name of the input file + # @param outFile string name of the output file + # + @staticmethod + def appendFileContent( inFile, outFile ): + outFileHandler = open( outFile, "a" ) + inFileHandler = open( inFile, "r" ) + shutil.copyfileobj( inFileHandler, outFileHandler ) + inFileHandler.close() + outFileHandler.close() + + + ## Replace Windows end-of-line by Unix end-of-line + # 
+ @staticmethod + def fromWindowsToUnixEof( inFile ): + tmpFile = "%s.tmp" % ( inFile ) + shutil.copyfile( inFile, tmpFile ) + os.remove( inFile ) + tmpFileHandler = open( tmpFile, "r" ) + inFileHandler = open( inFile, "w" ) + while True: + line = tmpFileHandler.readline() + if line == "": + break + inFileHandler.write( line.replace("\r\n","\n") ) + tmpFileHandler.close() + inFileHandler.close() + os.remove( tmpFile ) + + + ## Remove duplicated lines in a file + # + # @note it preserves the initial order and handles blank lines + # + @staticmethod + def removeDuplicatedLines( inFile ): + tmpFile = "%s.tmp" % ( inFile ) + shutil.copyfile( inFile, tmpFile ) + os.remove( inFile ) + + tmpFileHandler = open( tmpFile, "r" ) + lLines = list( tmpFileHandler.read().split("\n") ) + if lLines[-1] == "": + del lLines[-1] + sLines = set( lLines ) + tmpFileHandler.close() + os.remove( tmpFile ) + + inFileHandler = open( inFile, "w" ) + for line in lLines: + if line in sLines: + inFileHandler.write( "%s\n" % ( line ) ) + sLines.remove( line ) + inFileHandler.close() + + + ## Write a list of lines in a given file + # + @staticmethod + def writeLineListInFile( inFile, lLines ): + inFileHandler = open( inFile, "w" ) + for line in lLines: + inFileHandler.write( line ) + inFileHandler.close() + + + ## Give the list of absolute path of each directory in the given directory + # + # @param rootPath string absolute path of the given directory + # + # @return lDirPath list of absolute directory path + # + @staticmethod + def getAbsoluteDirectoryPathList(rootPath): + lDirPath = [] + lPaths = glob.glob(rootPath + "/*") + for ressource in lPaths: + if os.path.isdir(ressource) : + lDirPath.append(ressource) + return lDirPath + + + ## Get a sublist of which each element matches/doesn't match a pattern + # + # @param lPath string list of paths + # + # @param pattern string pattern + # + # @param match bool + # + # @return lPathMatching list of path matching pattern + # + @staticmethod + def 
getSubListAccordingToPattern(lPath, pattern, match = True): + lPathMatching = [] + for path in lPath: + if match: + if re.match(".*%s.*" % pattern, path): + lPathMatching.append(path) + else: + if not re.match(".*%s.*" % pattern, path): + lPathMatching.append(path) + return lPathMatching + + + ## Give the list of file names found in the given directory + # + # @param dirPath string absolute path of the given directory + # + # @return lFilesInDir list of file names + # + @staticmethod + def getFileNamesList( dirPath, patternFileFilter = ".*" ): + lFilesInDir = [] + lPaths = glob.glob( dirPath + "/*" ) + for ressource in lPaths: + if os.path.isfile( ressource ): + fileName = os.path.basename( ressource ) + if re.match(patternFileFilter, fileName): + lFilesInDir.append( fileName ) + return lFilesInDir + + ## Return the MD5 sum of a file + # + @staticmethod + def getMd5SecureHash( inFile ): + if "hashlib" in sys.modules: + md5 = hashlib.md5() + inFileHandler = open( inFile, "r" ) + while True: + line = inFileHandler.readline() + if line == "": + break + md5.update( line ) + inFileHandler.close() + return md5.hexdigest() + else: + return "" + + ## Return True if size file > 0 octet + # + # @param fileName string file name + # + @staticmethod + def isSizeNotNull(fileName): + size = os.path.getsize(fileName) + if size > 0: + return True + return False + + ## Split one file into N Files by lines + # + # @param fileName string file name + # @param N int number of files to create + # + @staticmethod + def splitFileIntoNFiles(fileName, N): + nbLine = FileUtils.getNbLinesInSingleFile(fileName) + nbLinesInEachFile = nbLine + if N > nbLine: + N = nbLine + if N != 0: + nbLinesInEachFile = math.ceil(float(nbLine) / N) + else: + N = 1 + filePrefix, fileExt = os.path.splitext(os.path.basename(fileName)) + fileHandler = open(fileName, "r") + for i in range(1,N+1): + with open("%s-%s%s" %(filePrefix, i, fileExt), "w") as f: + j = 0 + while j < nbLinesInEachFile: + j += 1 + 
f.write(fileHandler.readline()) + fileHandler.close() + + ## Split one file into files of N lines + # + # @param fileName string input file name + # @param N int lines number per files + # + @staticmethod + def splitFileAccordingToLineNumber(fileName, N): + filePrefix, fileExt = os.path.splitext(os.path.basename(fileName)) + with open(fileName) as inF: + fileNb = 1 + line = inF.readline() + if not line or N == 0: + outFileName = "%s-%s%s" %(filePrefix, fileNb, fileExt) + f = open(outFileName, "wb") + shutil.copyfileobj(open(fileName, "rb"), f) + f.close() + else: + while line: + outFileName = "%s-%s%s" %(filePrefix, fileNb, fileExt) + with open(outFileName, "w") as outF: + lineNb = 1 + while lineNb <= N and line: + outF.write(line) + line = inF.readline() + lineNb += 1 + fileNb += 1 + + ## Concatenates names from a list, using a given separator and a given extension. + # + # @param lNames list of file names + # @param sep separator used to join names + # @param ext extension of the return file name. If None, the most represented extension in lNames is used. + # If there is several, the first extension of theses several in alphabetical order is used + # + # @return concatName name concatenated + # + @staticmethod + def concatenateFileNamesFromList(lNames, sep = "_", ext = None): + concatName = "" + if lNames: + lNames.sort() + tBaseNames, tExt = zip(*[os.path.splitext(os.path.basename(name)) for name in lNames]) + + if ext is None: + dtExtToNb = {} + for extension in set(tExt): + dtExtToNb[extension] = tExt.count(extension) + + items = sorted(dtExtToNb.items(), key = itemgetter(0)) + items.sort(key = itemgetter(1), reverse = True) + ext = items[0][0] + + if ext and ext[0] != '.': + ext = ".%s" % ext + + concatName = "%s%s" % (sep.join(tBaseNames), ext) + return concatName + + ## Concatenates names from a string, using a given separator and a given extension. 
Names are split from the string using splitSep + # + # @param filesNames list of file names + # @param splitSep separator used to split names from the input string + # @param joinSep separator used to join names + # @param ext extension of the return file name. If None, the most represented extension in lNames is used. + # If there is several, the first extension of theses several in alphabetical order is used + # + # @return concatName,lFilesNames name concatenated and split files list sorted alphabetically. Return original name if splitSep is empty. + # + @staticmethod + def concatenateFileNamesFromString(filesNames, splitSep = ",", joinSep = "_", ext = None): + if splitSep: + lFilesNames = filesNames.split(splitSep) + return FileUtils.concatenateFileNamesFromList(lFilesNames, joinSep, ext), lFilesNames + else: + print "WARNING: no split separator provided, returning input string" + return filesNames, [filesNames] + \ No newline at end of file diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/utils/RepetConfigParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/utils/RepetConfigParser.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,38 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from ConfigParser import ConfigParser + + +class RepetConfigParser(ConfigParser): + + def optionxform(self, optionstr): + return optionstr \ No newline at end of file diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/utils/RepetOptionParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/commons/core/utils/RepetOptionParser.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,79 @@ +#!/usr/bin/env python + +""" +Class overriding optparse.OptionParser default epilog formatter. +The resulting epilog display format is the same as if the corresponding string was printed. +""" + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ +from optparse import OptionParser +from optparse import BadOptionError +from optparse import OptionValueError +SUPPRESS_USAGE = "SUPPRESS"+"USAGE" + +class RepetOptionParser(OptionParser): + + def parse_args(self, args=None, values=None): + rargs = self._get_args(args) + if not rargs: + rargs = ["-h"] + if values is None: + values = self.get_default_values() + self.rargs = rargs + self.largs = largs = [] + self.values = values + try: + self._process_args(largs, rargs, values) + except (BadOptionError, OptionValueError), err: + self.error(str(err)) + args = largs + rargs + return self.check_values(values, args) + + def set_usage(self, usage): + if not usage or usage is SUPPRESS_USAGE: + self.usage = None + elif usage.lower().startswith("usage: "): + self.usage = usage[7:] + else: + self.usage = usage + + def format_epilog(self, formatter): + if self.epilog != None: + return self.epilog + else : + return "" + + def format_description(self, formatter): + if self.description != None: + return "Description: %s" % self.description + else : + return "" diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/commons/core/utils/__init__.py diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/TEisotools-1.0/setup_TEiso.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEisotools-1.0/setup_TEiso.py Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,139 @@ +import os, shutil, glob, re, sys, time +from distutils.util import convert_path +from distutils.core import setup +from distutils.core import Command + +class Devtools(object): + + @staticmethod + def find_sub_packages(lrootPackage, lExclude=()): + lPackages = Devtools.find_packages(exclude=lExclude) + lSubPackage = [] + for package in lPackages: + for rootPackage in lrootPackage: + if package.startswith(rootPackage): + lSubPackage.append(package) + return lSubPackage + + @staticmethod + def find_packages(where='.', exclude=()): + out = [] + stack=[(convert_path(where), '')] + while stack: + 
where,prefix = stack.pop(0) + for name in os.listdir(where): + fn = os.path.join(where,name) + if ('.' not in name and os.path.isdir(fn) and os.path.isfile(os.path.join(fn,'__init__.py'))): + out.append(prefix+name); stack.append((fn,prefix+name+'.')) + for pat in list(exclude)+['ez_setup', 'distribute_setup']: + from fnmatch import fnmatchcase + out = [item for item in out if not fnmatchcase(item,pat)] + return out + + @staticmethod + def findall(directory = os.curdir): + all_files = [] + for base, dirs, files in os.walk(directory): + if base==os.curdir or base.startswith(os.curdir+os.sep): + base = base[2:] + if base: + files = [os.path.join(base, f) for f in files] + all_files.extend(filter(os.path.isfile, files)) + return all_files + + +class Install(Command): + description = "Install TEiso_tools" + user_options = [] + + def initialize_options(self): + """Use this to set option defaults before parsing.""" + pass + + def finalize_options(self): + """Code to validate/modify command-line/config input goes here.""" + pass + + def _isToLink(self, fileName): + if re.search("__\w*.py", fileName): + return False + elif re.search("Test\w*.py", fileName): + return False + elif re.search("\w*.py$", fileName): + return True + return False + + def run(self): + cwd = os.getcwd() + print "Build TEiso in %s" % (cwd) + if not os.path.isdir("bin"): + os.mkdir("bin") + os.chdir("bin") + lPackages = ["TEiso"] + for package in lPackages: + if os.path.isdir("../%s" % package): + print "processing %s/..." 
% package + [os.symlink(i, os.path.basename(i)) for i in Devtools.findall("../%s" % package) if self._isToLink(os.path.basename(i))] + os.system("chmod 0755 *") + print "TEiso is ready to use" + +class PublicRelease(Command): + description = "public release for TEiso_tools" + user_options = [] + + def initialize_options(self): + """Use this to set option defaults before parsing.""" + pass + + def finalize_options(self): + """Code to validate/modify command-line/config input goes here.""" + pass + + def run(self): + + print "START Releasing (%s)" % time.strftime("%Y-%m-%d %H:%M:%S") + sys.stdout.flush() + + cwd = os.getcwd() + + print "Removing all '.pyc' files and dead symlinks in bin, then adding execution rights to all 'py' files (%s)" % time.strftime("%Y-%m-%d %H:%M:%S") + sys.stdout.flush() + + os.system("find . -name '*.pyc' -exec rm {} \;") + os.system("find . -name '*.py' -exec chmod +x {} \;") + os.system("find -L ./bin -type l -exec rm {} +") + +# lSetupFiles = FileUtils.getFileNamesList(cwd, "setup_.*.py") +# lSetupFiles.remove("setup_REPET.py") +# FileUtils.removeFilesFromList(lSetupFiles) + + os.chdir("..") + os.system("tar -czf %s.tar.gz %s" % (os.path.basename(cwd), os.path.basename(cwd))) + + print "END Releasing (%s)" % time.strftime("%Y-%m-%d %H:%M:%S") + sys.stdout.flush() + +setup( + name = "TEisotools", + version = "1.0", + description='Set of tools to analyse RNA_seq for the France Genomics projects.', + author='URGI team', + author_email='urgi-support@versailles.inra.fr', + url='https://urgi.versailles.inra.fr/Projects/TEiso', + packages=[], + #Additionnal Files + data_files=[('TEiso',["TEiso/LaunchTEiso.py", "TEiso/Tophat.py", "TEiso/Cufflinks.py", "TEiso/Cuffcompare.py", "TEiso/Bowtie_build.py", "TEiso/Bedtools_closest.py","TEiso/ClosestToStartSite.py", "TEiso/GFFToBed.py", "TEiso/CufflinksGTFToBed.py"]), + ("commons/core", glob.glob("commons/core/*.py")), + ("commons", glob.glob("commons/*.py")), + ("commons/core/utils", 
glob.glob("commons/core/utils/*.py")), + ("commons/core/checker", glob.glob("commons/core/checker/*.py")), + ("commons/core/seq", glob.glob("commons/core/seq/*.py")), + ("commons/core/coord", glob.glob("commons/core/coord/*.py")), + ('commons/core/parsing',["commons/core/parsing/GtfParser.py", "commons/core/parsing/TranscriptListParser.py", "commons/core/parsing/GffParser.py", "commons/core/parsing/__init__.py"]), + ('',['TEiso/doc/README_TEiso.txt']),('',['LICENSE'])], + cmdclass={ + 'install' : Install + } + ) + + diff -r 782306d67e39 -r 22b0494ec883 TEisotools-1.0/tmpNQQAnB --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/tmpNQQAnB Wed Jul 20 08:18:51 2016 -0400 @@ -0,0 +1,14 @@ + + + + Index of /download/TEiso + + +

Index of /download/TEiso

+ + + + +
[ICO]NameLast modifiedSizeDescription

[DIR]Parent Directory  -
[   ]TEisotools-1.1.tar.gz19-Jul-2016 16:09 97K

+
Apache/2.2.3 (CentOS) Server at melkor.versailles.inra.fr Port 80
+